Line data Source code
1 : /* sort - sort lines of text (with all kinds of options).
2 : Copyright (C) 1988, 1991-2008 Free Software Foundation, Inc.
3 :
4 : This program is free software: you can redistribute it and/or modify
5 : it under the terms of the GNU General Public License as published by
6 : the Free Software Foundation, either version 3 of the License, or
7 : (at your option) any later version.
8 :
9 : This program is distributed in the hope that it will be useful,
10 : but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : GNU General Public License for more details.
13 :
14 : You should have received a copy of the GNU General Public License
15 : along with this program. If not, see <http://www.gnu.org/licenses/>.
16 :
17 : Written December 1988 by Mike Haertel.
18 : The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
19 : or (US mail) as Mike Haertel c/o Free Software Foundation.
20 :
21 : Ørn E. Hansen added NLS support in 1997. */
22 :
23 : #include <config.h>
24 :
25 : #include <getopt.h>
26 : #include <sys/types.h>
27 : #include <sys/wait.h>
28 : #include <signal.h>
29 : #include "system.h"
30 : #include "argmatch.h"
31 : #include "error.h"
32 : #include "hard-locale.h"
33 : #include "hash.h"
34 : #include "inttostr.h"
35 : #include "md5.h"
36 : #include "physmem.h"
37 : #include "posixver.h"
38 : #include "quote.h"
39 : #include "randread.h"
40 : #include "stdio--.h"
41 : #include "stdlib--.h"
42 : #include "strnumcmp.h"
43 : #include "xmemcoll.h"
44 : #include "xmemxfrm.h"
45 : #include "xstrtol.h"
46 :
47 : #if HAVE_SYS_RESOURCE_H
48 : # include <sys/resource.h>
49 : #endif
50 : #ifndef RLIMIT_DATA
51 : struct rlimit { size_t rlim_cur; };
52 : # define getrlimit(Resource, Rlp) (-1)
53 : #endif
54 :
55 : /* The official name of this program (e.g., no `g' prefix). */
56 : #define PROGRAM_NAME "sort"
57 :
58 : #define AUTHORS "Mike Haertel", "Paul Eggert"
59 :
60 : #if HAVE_LANGINFO_CODESET
61 : # include <langinfo.h>
62 : #endif
63 :
64 : /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
65 : present. */
66 : #ifndef SA_NOCLDSTOP
67 : # define SA_NOCLDSTOP 0
68 : /* No sigprocmask. Always 'return' zero. */
69 : # define sigprocmask(How, Set, Oset) (0)
70 : # define sigset_t int
71 : # if ! HAVE_SIGINTERRUPT
72 : # define siginterrupt(sig, flag) /* empty */
73 : # endif
74 : #endif
75 :
76 : #ifndef STDC_HEADERS
77 : double strtod ();
78 : #endif
79 :
80 : #define UCHAR_LIM (UCHAR_MAX + 1)
81 :
82 : #ifndef DEFAULT_TMPDIR
83 : # define DEFAULT_TMPDIR "/tmp"
84 : #endif
85 :
86 : /* Exit statuses. */
87 : enum
88 : {
89 : /* POSIX says to exit with status 1 if invoked with -c and the
90 : input is not properly sorted. */
91 : SORT_OUT_OF_ORDER = 1,
92 :
93 : /* POSIX says any other irregular exit must exit with a status
94 : code greater than 1. */
95 : SORT_FAILURE = 2
96 : };
97 :
98 : enum
99 : {
100 : /* The number of times we should try to fork a compression process
101 : (we retry if the fork call fails). We don't _need_ to compress
102 : temp files, this is just to reduce disk access, so this number
103 : can be small. */
104 : MAX_FORK_TRIES_COMPRESS = 2,
105 :
106 : /* The number of times we should try to fork a decompression process.
107 : If we can't fork a decompression process, we can't sort, so this
108 : number should be big. */
109 : MAX_FORK_TRIES_DECOMPRESS = 8
110 : };
111 :
112 : /* The representation of the decimal point in the current locale. */
113 : static int decimal_point;
114 :
115 : /* Thousands separator; if -1, then there isn't one. */
116 : static int thousands_sep;
117 :
118 : /* Nonzero if the corresponding locales are hard. */
119 : static bool hard_LC_COLLATE;
120 : #if HAVE_NL_LANGINFO
121 : static bool hard_LC_TIME;
122 : #endif
123 :
124 : #define NONZERO(x) ((x) != 0)
125 :
126 : /* The kind of blanks for '-b' to skip in various options. */
127 : enum blanktype { bl_start, bl_end, bl_both };
128 :
129 : /* The character marking end of line. Default to \n. */
130 : static char eolchar = '\n';
131 :
132 : /* Lines are held in core as counted strings. */
133 : struct line
134 : {
135 : char *text; /* Text of the line. */
136 : size_t length; /* Length including final newline. */
137 : char *keybeg; /* Start of first key. */
138 : char *keylim; /* Limit of first key. */
139 : };
140 :
141 : /* Input buffers. */
142 : struct buffer
143 : {
144 : char *buf; /* Dynamically allocated buffer,
145 : partitioned into 3 regions:
146 : - input data;
147 : - unused area;
148 : - an array of lines, in reverse order. */
149 : size_t used; /* Number of bytes used for input data. */
150 : size_t nlines; /* Number of lines in the line array. */
151 : size_t alloc; /* Number of bytes allocated. */
152 : size_t left; /* Number of bytes left from previous reads. */
153 : size_t line_bytes; /* Number of bytes to reserve for each line. */
154 : bool eof; /* An EOF has been read. */
155 : };
156 :
157 : struct keyfield
158 : {
159 : size_t sword; /* Zero-origin 'word' to start at. */
160 : size_t schar; /* Additional characters to skip. */
161 : size_t eword; /* Zero-origin first word after field. */
162 : size_t echar; /* Additional characters in field. */
163 : bool const *ignore; /* Boolean array of characters to ignore. */
164 : char const *translate; /* Translation applied to characters. */
165 : bool skipsblanks; /* Skip leading blanks when finding start. */
166 : bool skipeblanks; /* Skip leading blanks when finding end. */
167 : bool numeric; /* Flag for numeric comparison. Handle
168 : strings of digits with optional decimal
169 : point, but no exponential notation. */
170 : bool random; /* Sort by random hash of key. */
171 : bool general_numeric; /* Flag for general, numeric comparison.
172 : Handle numbers in exponential notation. */
173 : bool month; /* Flag for comparison by month name. */
174 : bool reverse; /* Reverse the sense of comparison. */
175 : struct keyfield *next; /* Next keyfield to try. */
176 : };
177 :
178 : struct month
179 : {
180 : char const *name;
181 : int val;
182 : };
183 :
184 : /* The name this program was run with. */
185 : char *program_name;
186 :
187 : /* FIXME: None of these tables work with multibyte character sets.
188 : Also, there are many other bugs when handling multibyte characters.
189 : One way to fix this is to rewrite `sort' to use wide characters
190 : internally, but doing this with good performance is a bit
191 : tricky. */
192 :
193 : /* Table of blanks. */
194 : static bool blanks[UCHAR_LIM];
195 :
196 : /* Table of non-printing characters. */
197 : static bool nonprinting[UCHAR_LIM];
198 :
199 : /* Table of non-dictionary characters (not letters, digits, or blanks). */
200 : static bool nondictionary[UCHAR_LIM];
201 :
202 : /* Translation table folding lower case to upper. */
203 : static char fold_toupper[UCHAR_LIM];
204 :
205 : #define MONTHS_PER_YEAR 12
206 :
207 : /* Table mapping month names to integers.
208 : Alphabetic order allows binary search. */
209 : static struct month monthtab[] =
210 : {
211 : {"APR", 4},
212 : {"AUG", 8},
213 : {"DEC", 12},
214 : {"FEB", 2},
215 : {"JAN", 1},
216 : {"JUL", 7},
217 : {"JUN", 6},
218 : {"MAR", 3},
219 : {"MAY", 5},
220 : {"NOV", 11},
221 : {"OCT", 10},
222 : {"SEP", 9}
223 : };
224 :
225 : /* During the merge phase, the number of files to merge at once. */
226 : #define NMERGE 16
227 :
228 : /* Minimum size for a merge or check buffer. */
229 : #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
230 :
231 : /* Minimum sort size; the code might not work with smaller sizes. */
232 : #define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE)
233 :
234 : /* The number of bytes needed for a merge or check buffer, which can
235 : function relatively efficiently even if it holds only one line. If
236 : a longer line is seen, this value is increased. */
237 : static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
238 :
239 : /* The approximate maximum number of bytes of main memory to use, as
240 : specified by the user. Zero if the user has not specified a size. */
241 : static size_t sort_size;
242 :
243 : /* The guessed size for non-regular files. */
244 : #define INPUT_FILE_SIZE_GUESS (1024 * 1024)
245 :
246 : /* Array of directory names in which any temporary files are to be created. */
247 : static char const **temp_dirs;
248 :
249 : /* Number of temporary directory names used. */
250 : static size_t temp_dir_count;
251 :
252 : /* Number of allocated slots in temp_dirs. */
253 : static size_t temp_dir_alloc;
254 :
255 : /* Flag to reverse the order of all comparisons. */
256 : static bool reverse;
257 :
258 : /* Flag for stable sort. This turns off the last ditch bytewise
259 : comparison of lines, and instead leaves lines in the same order
260 : they were read if all keys compare equal. */
261 : static bool stable;
262 :
263 : /* If TAB has this value, blanks separate fields. */
264 : enum { TAB_DEFAULT = CHAR_MAX + 1 };
265 :
266 : /* Tab character separating fields. If TAB_DEFAULT, then fields are
267 : separated by the empty string between a non-blank character and a blank
268 : character. */
269 : static int tab = TAB_DEFAULT;
270 :
271 : /* Flag to remove consecutive duplicate lines from the output.
272 : Only the last of a sequence of equal lines will be output. */
273 : static bool unique;
274 :
275 : /* Nonzero if any of the input files are the standard input. */
276 : static bool have_read_stdin;
277 :
278 : /* List of key field comparisons to be tried. */
279 : static struct keyfield *keylist;
280 :
281 : /* Program used to (de)compress temp files. Must accept -d. */
282 : static char const *compress_program;
283 :
284 : static void sortlines_temp (struct line *, size_t, struct line *);
285 :
286 : /* Report MESSAGE for FILE, then clean up and exit.
287 : If FILE is null, it represents standard output. */
288 :
289 : static void die (char const *, char const *) ATTRIBUTE_NORETURN;
290 : static void
291 21 : die (char const *message, char const *file)
292 : {
293 21 : error (0, errno, "%s: %s", message, file ? file : _("standard output"));
294 21 : exit (SORT_FAILURE);
295 : }
296 :
297 : void
298 7 : usage (int status)
299 : {
300 7 : if (status != EXIT_SUCCESS)
301 7 : fprintf (stderr, _("Try `%s --help' for more information.\n"),
302 : program_name);
303 : else
304 : {
305 0 : printf (_("\
306 : Usage: %s [OPTION]... [FILE]...\n\
307 : "),
308 : program_name);
309 0 : fputs (_("\
310 : Write sorted concatenation of all FILE(s) to standard output.\n\
311 : \n\
312 : "), stdout);
313 0 : fputs (_("\
314 : Mandatory arguments to long options are mandatory for short options too.\n\
315 : "), stdout);
316 0 : fputs (_("\
317 : Ordering options:\n\
318 : \n\
319 : "), stdout);
320 0 : fputs (_("\
321 : -b, --ignore-leading-blanks ignore leading blanks\n\
322 : -d, --dictionary-order consider only blanks and alphanumeric characters\n\
323 : -f, --ignore-case fold lower case to upper case characters\n\
324 : "), stdout);
325 0 : fputs (_("\
326 : -g, --general-numeric-sort compare according to general numerical value\n\
327 : -i, --ignore-nonprinting consider only printable characters\n\
328 : -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
329 : -n, --numeric-sort compare according to string numerical value\n\
330 : -R, --random-sort sort by random hash of keys\n\
331 : --random-source=FILE get random bytes from FILE (default /dev/urandom)\n\
332 : --sort=WORD sort according to WORD:\n\
333 : general-numeric -g, month -M, numeric -n,\n\
334 : random -R\n\
335 : -r, --reverse reverse the result of comparisons\n\
336 : \n\
337 : "), stdout);
338 0 : fputs (_("\
339 : Other options:\n\
340 : \n\
341 : -c, --check, --check=diagnose-first check for sorted input; do not sort\n\
342 : -C, --check=quiet, --check=silent like -c, but do not report first bad line\n\
343 : --compress-program=PROG compress temporaries with PROG;\n\
344 : decompress them with PROG -d\n\
345 : -k, --key=POS1[,POS2] start a key at POS1, end it at POS2 (origin 1)\n\
346 : -m, --merge merge already sorted files; do not sort\n\
347 : "), stdout);
348 0 : fputs (_("\
349 : -o, --output=FILE write result to FILE instead of standard output\n\
350 : -s, --stable stabilize sort by disabling last-resort comparison\n\
351 : -S, --buffer-size=SIZE use SIZE for main memory buffer\n\
352 : "), stdout);
353 0 : printf (_("\
354 : -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\
355 : -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\
356 : multiple options specify multiple directories\n\
357 : -u, --unique with -c, check for strict ordering;\n\
358 : without -c, output only the first of an equal run\n\
359 : "), DEFAULT_TMPDIR);
360 0 : fputs (_("\
361 : -z, --zero-terminated end lines with 0 byte, not newline\n\
362 : "), stdout);
363 0 : fputs (HELP_OPTION_DESCRIPTION, stdout);
364 0 : fputs (VERSION_OPTION_DESCRIPTION, stdout);
365 0 : fputs (_("\
366 : \n\
367 : POS is F[.C][OPTS], where F is the field number and C the character position\n\
368 : in the field; both are origin 1. If neither -t nor -b is in effect, characters\n\
369 : in a field are counted from the beginning of the preceding whitespace. OPTS is\n\
370 : one or more single-letter ordering options, which override global ordering\n\
371 : options for that key. If no key is given, use the entire line as the key.\n\
372 : \n\
373 : SIZE may be followed by the following multiplicative suffixes:\n\
374 : "), stdout);
375 0 : fputs (_("\
376 : % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\
377 : \n\
378 : With no FILE, or when FILE is -, read standard input.\n\
379 : \n\
380 : *** WARNING ***\n\
381 : The locale specified by the environment affects sort order.\n\
382 : Set LC_ALL=C to get the traditional sort order that uses\n\
383 : native byte values.\n\
384 : "), stdout );
385 0 : emit_bug_reporting_address ();
386 : }
387 :
388 7 : exit (status);
389 : }
390 :
391 : /* For long options that have no equivalent short option, use a
392 : non-character as a pseudo short option, starting with CHAR_MAX + 1. */
393 : enum
394 : {
395 : CHECK_OPTION = CHAR_MAX + 1,
396 : COMPRESS_PROGRAM_OPTION,
397 : RANDOM_SOURCE_OPTION,
398 : SORT_OPTION
399 : };
400 :
401 : static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uy:z";
402 :
403 : static struct option const long_options[] =
404 : {
405 : {"ignore-leading-blanks", no_argument, NULL, 'b'},
406 : {"check", optional_argument, NULL, CHECK_OPTION},
407 : {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},
408 : {"dictionary-order", no_argument, NULL, 'd'},
409 : {"ignore-case", no_argument, NULL, 'f'},
410 : {"general-numeric-sort", no_argument, NULL, 'g'},
411 : {"ignore-nonprinting", no_argument, NULL, 'i'},
412 : {"key", required_argument, NULL, 'k'},
413 : {"merge", no_argument, NULL, 'm'},
414 : {"month-sort", no_argument, NULL, 'M'},
415 : {"numeric-sort", no_argument, NULL, 'n'},
416 : {"random-sort", no_argument, NULL, 'R'},
417 : {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
418 : {"sort", required_argument, NULL, SORT_OPTION},
419 : {"output", required_argument, NULL, 'o'},
420 : {"reverse", no_argument, NULL, 'r'},
421 : {"stable", no_argument, NULL, 's'},
422 : {"buffer-size", required_argument, NULL, 'S'},
423 : {"field-separator", required_argument, NULL, 't'},
424 : {"temporary-directory", required_argument, NULL, 'T'},
425 : {"unique", no_argument, NULL, 'u'},
426 : {"zero-terminated", no_argument, NULL, 'z'},
427 : {GETOPT_HELP_OPTION_DECL},
428 : {GETOPT_VERSION_OPTION_DECL},
429 : {NULL, 0, NULL, 0},
430 : };
431 :
432 : static char const *const check_args[] =
433 : {
434 : "quiet", "silent", "diagnose-first", NULL
435 : };
436 : static char const check_types[] =
437 : {
438 : 'C', 'C', 'c'
439 : };
440 : ARGMATCH_VERIFY (check_args, check_types);
441 :
442 : static char const *const sort_args[] =
443 : {
444 : "general-numeric", "month", "numeric", "random", NULL
445 : };
446 : static char const sort_types[] =
447 : {
448 : 'g', 'M', 'n', 'R'
449 : };
450 : ARGMATCH_VERIFY (sort_args, sort_types);
451 :
452 : /* The set of signals that are caught. */
453 : static sigset_t caught_signals;
454 :
455 : /* Critical section status. */
456 : struct cs_status
457 : {
458 : bool valid;
459 : sigset_t sigs;
460 : };
461 :
462 : /* Enter a critical section. */
463 : static struct cs_status
464 0 : cs_enter (void)
465 : {
466 : struct cs_status status;
467 0 : status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);
468 0 : return status;
469 : }
470 :
471 : /* Leave a critical section. */
472 : static void
473 0 : cs_leave (struct cs_status status)
474 : {
475 0 : if (status.valid)
476 : {
477 : /* Ignore failure when restoring the signal mask. */
478 0 : sigprocmask (SIG_SETMASK, &status.sigs, NULL);
479 : }
480 0 : }
481 :
482 : /* The list of temporary files. */
483 : struct tempnode
484 : {
485 : struct tempnode *volatile next;
486 : pid_t pid; /* If compressed, the pid of compressor, else zero */
487 : char name[1]; /* Actual size is 1 + file name length. */
488 : };
489 : static struct tempnode *volatile temphead;
490 : static struct tempnode *volatile *temptail = &temphead;
491 :
492 : struct sortfile
493 : {
494 : char const *name;
495 : pid_t pid; /* If compressed, the pid of compressor, else zero */
496 : };
497 :
498 : /* A table where we store compression process states. We clean up all
499 : processes in a timely manner so as not to exhaust system resources,
500 : so we store the info on whether the process is still running, or has
501 : been reaped here. */
502 : static Hash_table *proctab;
503 :
504 : enum { INIT_PROCTAB_SIZE = 47 };
505 :
506 : enum procstate { ALIVE, ZOMBIE };
507 :
508 : /* A proctab entry. The COUNT field is there in case we fork a new
509 : compression process that has the same PID as an old zombie process
510 : that is still in the table (because the process to decompress the
511 : temp file it was associated with hasn't started yet). */
512 : struct procnode
513 : {
514 : pid_t pid;
515 : enum procstate state;
516 : size_t count;
517 : };
518 :
519 : static size_t
520 0 : proctab_hasher (const void *entry, size_t tabsize)
521 : {
522 0 : const struct procnode *node = entry;
523 0 : return node->pid % tabsize;
524 : }
525 :
526 : static bool
527 0 : proctab_comparator (const void *e1, const void *e2)
528 : {
529 0 : const struct procnode *n1 = e1, *n2 = e2;
530 0 : return n1->pid == n2->pid;
531 : }
532 :
533 : /* The total number of forked processes (compressors and decompressors)
534 : that have not been reaped yet. */
535 : static size_t nprocs;
536 :
537 : /* The number of child processes we'll allow before we try to reap some. */
538 : enum { MAX_PROCS_BEFORE_REAP = 2 };
539 :
540 : /* If 0 < PID, wait for the child process with that PID to exit.
541 : If PID is -1, clean up a random child process which has finished and
542 : return the process ID of that child. If PID is -1 and no processes
543 : have quit yet, return 0 without waiting. */
544 :
545 : static pid_t
546 0 : reap (pid_t pid)
547 : {
548 : int status;
549 0 : pid_t cpid = waitpid (pid, &status, pid < 0 ? WNOHANG : 0);
550 :
551 0 : if (cpid < 0)
552 0 : error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
553 : compress_program);
554 0 : else if (0 < cpid)
555 : {
556 0 : if (! WIFEXITED (status) || WEXITSTATUS (status))
557 0 : error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
558 : compress_program);
559 0 : --nprocs;
560 : }
561 :
562 0 : return cpid;
563 : }
564 :
565 : /* Add the PID of a running compression process to proctab, or update
566 : the entry COUNT and STATE fields if it's already there. This also
567 : creates the table for us the first time it's called. */
568 :
569 : static void
570 0 : register_proc (pid_t pid)
571 : {
572 : struct procnode test, *node;
573 :
574 0 : if (! proctab)
575 : {
576 0 : proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,
577 : proctab_hasher,
578 : proctab_comparator,
579 : free);
580 0 : if (! proctab)
581 0 : xalloc_die ();
582 : }
583 :
584 0 : test.pid = pid;
585 0 : node = hash_lookup (proctab, &test);
586 0 : if (node)
587 : {
588 0 : node->state = ALIVE;
589 0 : ++node->count;
590 : }
591 : else
592 : {
593 0 : node = xmalloc (sizeof *node);
594 0 : node->pid = pid;
595 0 : node->state = ALIVE;
596 0 : node->count = 1;
597 0 : hash_insert (proctab, node);
598 : }
599 0 : }
600 :
601 : /* This is called when we reap a random process. We don't know
602 : whether we have reaped a compression process or a decompression
603 : process until we look in the table. If there's an ALIVE entry for
604 : it, then we have reaped a compression process, so change the state
605 : to ZOMBIE. Otherwise, it's a decompression processes, so ignore it. */
606 :
607 : static void
608 0 : update_proc (pid_t pid)
609 : {
610 : struct procnode test, *node;
611 :
612 0 : test.pid = pid;
613 0 : node = hash_lookup (proctab, &test);
614 0 : if (node)
615 0 : node->state = ZOMBIE;
616 0 : }
617 :
618 : /* This is for when we need to wait for a compression process to exit.
619 : If it has a ZOMBIE entry in the table then it's already dead and has
620 : been reaped. Note that if there's an ALIVE entry for it, it still may
621 : already have died and been reaped if a second process was created with
622 : the same PID. This is probably exceedingly rare, but to be on the safe
623 : side we will have to wait for any compression process with this PID. */
624 :
625 : static void
626 0 : wait_proc (pid_t pid)
627 : {
628 : struct procnode test, *node;
629 :
630 0 : test.pid = pid;
631 0 : node = hash_lookup (proctab, &test);
632 0 : if (node->state == ALIVE)
633 0 : reap (pid);
634 :
635 0 : node->state = ZOMBIE;
636 0 : if (! --node->count)
637 : {
638 0 : hash_delete (proctab, node);
639 0 : free (node);
640 : }
641 0 : }
642 :
643 : /* Keep reaping finished children as long as there are more to reap.
644 : This doesn't block waiting for any of them, it only reaps those
645 : that are already dead. */
646 :
647 : static void
648 0 : reap_some (void)
649 : {
650 : pid_t pid;
651 :
652 0 : while (0 < nprocs && (pid = reap (-1)))
653 0 : update_proc (pid);
654 0 : }
655 :
656 : /* Clean up any remaining temporary files. */
657 :
658 : static void
659 0 : cleanup (void)
660 : {
661 : struct tempnode const *node;
662 :
663 0 : for (node = temphead; node; node = node->next)
664 0 : unlink (node->name);
665 0 : temphead = NULL;
666 0 : }
667 :
668 : /* Cleanup actions to take when exiting. */
669 :
670 : static void
671 41 : exit_cleanup (void)
672 : {
673 41 : if (temphead)
674 : {
675 : /* Clean up any remaining temporary files in a critical section so
676 : that a signal handler does not try to clean them too. */
677 0 : struct cs_status cs = cs_enter ();
678 0 : cleanup ();
679 0 : cs_leave (cs);
680 : }
681 :
682 41 : close_stdout ();
683 41 : }
684 :
685 : /* Create a new temporary file, returning its newly allocated tempnode.
686 : Store into *PFD the file descriptor open for writing. */
687 :
688 : static struct tempnode *
689 0 : create_temp_file (int *pfd)
690 : {
691 : static char const slashbase[] = "/sortXXXXXX";
692 : static size_t temp_dir_index;
693 : int fd;
694 : int saved_errno;
695 0 : char const *temp_dir = temp_dirs[temp_dir_index];
696 0 : size_t len = strlen (temp_dir);
697 0 : struct tempnode *node =
698 0 : xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);
699 0 : char *file = node->name;
700 : struct cs_status cs;
701 :
702 0 : memcpy (file, temp_dir, len);
703 0 : memcpy (file + len, slashbase, sizeof slashbase);
704 0 : node->next = NULL;
705 0 : node->pid = 0;
706 0 : if (++temp_dir_index == temp_dir_count)
707 0 : temp_dir_index = 0;
708 :
709 : /* Create the temporary file in a critical section, to avoid races. */
710 0 : cs = cs_enter ();
711 0 : fd = mkstemp (file);
712 0 : if (0 <= fd)
713 : {
714 0 : *temptail = node;
715 0 : temptail = &node->next;
716 : }
717 0 : saved_errno = errno;
718 0 : cs_leave (cs);
719 0 : errno = saved_errno;
720 :
721 0 : if (fd < 0)
722 0 : die (_("cannot create temporary file"), file);
723 :
724 0 : *pfd = fd;
725 0 : return node;
726 : }
727 :
728 : /* Return a stream for FILE, opened with mode HOW. A null FILE means
729 : standard output; HOW should be "w". When opening for input, "-"
730 : means standard input. To avoid confusion, do not return file
731 : descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
732 : opening an ordinary FILE. */
733 :
734 : static FILE *
735 57 : xfopen (const char *file, const char *how)
736 : {
737 : FILE *fp;
738 :
739 57 : if (!file)
740 13 : fp = stdout;
741 44 : else if (STREQ (file, "-") && *how == 'r')
742 : {
743 24 : have_read_stdin = true;
744 24 : fp = stdin;
745 : }
746 : else
747 : {
748 20 : fp = fopen (file, how);
749 20 : if (! fp)
750 8 : die (_("open failed"), file);
751 : }
752 :
753 49 : return fp;
754 : }
755 :
756 : /* Close FP, whose name is FILE, and report any errors. */
757 :
758 : static void
759 36 : xfclose (FILE *fp, char const *file)
760 : {
761 36 : switch (fileno (fp))
762 : {
763 22 : case STDIN_FILENO:
764 : /* Allow reading stdin from tty more than once. */
765 22 : if (feof (fp))
766 22 : clearerr (fp);
767 22 : break;
768 :
769 13 : case STDOUT_FILENO:
770 : /* Don't close stdout just yet. close_stdout does that. */
771 13 : if (fflush (fp) != 0)
772 0 : die (_("fflush failed"), file);
773 13 : break;
774 :
775 1 : default:
776 1 : if (fclose (fp) != 0)
777 0 : die (_("close failed"), file);
778 1 : break;
779 : }
780 36 : }
781 :
782 : static void
783 0 : dup2_or_die (int oldfd, int newfd)
784 : {
785 0 : if (dup2 (oldfd, newfd) < 0)
786 0 : error (SORT_FAILURE, errno, _("dup2 failed"));
787 0 : }
788 :
789 : /* Fork a child process for piping to and do common cleanup. The
790 : TRIES parameter tells us how many times to try to fork before
791 : giving up. Return the PID of the child or -1 if fork failed. */
792 :
793 : static pid_t
794 0 : pipe_fork (int pipefds[2], size_t tries)
795 : {
796 : #if HAVE_WORKING_FORK
797 : struct tempnode *saved_temphead;
798 : int saved_errno;
799 0 : unsigned int wait_retry = 1;
800 : pid_t pid IF_LINT (= -1);
801 : struct cs_status cs;
802 :
803 0 : if (pipe (pipefds) < 0)
804 0 : return -1;
805 :
806 0 : while (tries--)
807 : {
808 : /* This is so the child process won't delete our temp files
809 : if it receives a signal before exec-ing. */
810 0 : cs = cs_enter ();
811 0 : saved_temphead = temphead;
812 0 : temphead = NULL;
813 :
814 0 : pid = fork ();
815 0 : saved_errno = errno;
816 0 : if (pid)
817 0 : temphead = saved_temphead;
818 :
819 0 : cs_leave (cs);
820 0 : errno = saved_errno;
821 :
822 0 : if (0 <= pid || errno != EAGAIN)
823 : break;
824 : else
825 : {
826 0 : sleep (wait_retry);
827 0 : wait_retry *= 2;
828 0 : reap_some ();
829 : }
830 : }
831 :
832 0 : if (pid < 0)
833 : {
834 0 : close (pipefds[0]);
835 0 : close (pipefds[1]);
836 : }
837 0 : else if (pid == 0)
838 : {
839 0 : close (STDIN_FILENO);
840 0 : close (STDOUT_FILENO);
841 : }
842 : else
843 0 : ++nprocs;
844 :
845 0 : return pid;
846 :
847 : #else /* ! HAVE_WORKING_FORK */
848 : return -1;
849 : #endif
850 : }
851 :
852 : /* Create a temporary file and start a compression program to filter output
853 : to that file. Set *PFP to the file handle and if *PPID is non-NULL,
854 : set it to the PID of the newly-created process. */
855 :
856 : static char *
857 0 : create_temp (FILE **pfp, pid_t *ppid)
858 : {
859 : int tempfd;
860 0 : struct tempnode *node = create_temp_file (&tempfd);
861 0 : char *name = node->name;
862 :
863 0 : if (compress_program)
864 : {
865 : int pipefds[2];
866 :
867 0 : node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);
868 0 : if (0 < node->pid)
869 : {
870 0 : close (tempfd);
871 0 : close (pipefds[0]);
872 0 : tempfd = pipefds[1];
873 :
874 0 : register_proc (node->pid);
875 : }
876 0 : else if (node->pid == 0)
877 : {
878 0 : close (pipefds[1]);
879 0 : dup2_or_die (tempfd, STDOUT_FILENO);
880 0 : close (tempfd);
881 0 : dup2_or_die (pipefds[0], STDIN_FILENO);
882 0 : close (pipefds[0]);
883 :
884 0 : if (execlp (compress_program, compress_program, (char *) NULL) < 0)
885 0 : error (SORT_FAILURE, errno, _("couldn't execute %s"),
886 : compress_program);
887 : }
888 : else
889 0 : node->pid = 0;
890 : }
891 :
892 0 : *pfp = fdopen (tempfd, "w");
893 0 : if (! *pfp)
894 0 : die (_("couldn't create temporary file"), name);
895 :
896 0 : if (ppid)
897 0 : *ppid = node->pid;
898 :
899 0 : return name;
900 : }
901 :
902 : /* Open a compressed temp file and start a decompression process through
903 : which to filter the input. PID must be the valid processes ID of the
904 : process used to compress the file. */
905 :
906 : static FILE *
907 0 : open_temp (const char *name, pid_t pid)
908 : {
909 : int tempfd, pipefds[2];
910 : pid_t child_pid;
911 : FILE *fp;
912 :
913 0 : wait_proc (pid);
914 :
915 0 : tempfd = open (name, O_RDONLY);
916 0 : if (tempfd < 0)
917 0 : die (_("couldn't open temporary file"), name);
918 :
919 0 : child_pid = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);
920 0 : if (0 < child_pid)
921 : {
922 0 : close (tempfd);
923 0 : close (pipefds[1]);
924 : }
925 0 : else if (child_pid == 0)
926 : {
927 0 : close (pipefds[0]);
928 0 : dup2_or_die (tempfd, STDIN_FILENO);
929 0 : close (tempfd);
930 0 : dup2_or_die (pipefds[1], STDOUT_FILENO);
931 0 : close (pipefds[1]);
932 :
933 0 : if (execlp (compress_program, compress_program, "-d", (char *) NULL) < 0)
934 0 : error (SORT_FAILURE, errno, _("couldn't execute %s -d"),
935 : compress_program);
936 : }
937 : else
938 0 : error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),
939 : compress_program);
940 :
941 0 : fp = fdopen (pipefds[0], "r");
942 0 : if (! fp)
943 0 : die (_("couldn't create temporary file"), name);
944 :
945 0 : return fp;
946 : }
947 :
948 : static void
949 102 : write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)
950 : {
951 102 : if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
952 0 : die (_("write failed"), output_file);
953 102 : }
954 :
955 : /* Append DIR to the array of temporary directory names. */
956 : static void
957 34 : add_temp_dir (char const *dir)
958 : {
959 34 : if (temp_dir_count == temp_dir_alloc)
960 34 : temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);
961 :
962 34 : temp_dirs[temp_dir_count++] = dir;
963 34 : }
964 :
965 : /* Remove NAME from the list of temporary files. */
966 :
967 : static void
968 0 : zaptemp (const char *name)
969 : {
970 : struct tempnode *volatile *pnode;
971 : struct tempnode *node;
972 : struct tempnode *next;
973 : int unlink_status;
974 0 : int unlink_errno = 0;
975 : struct cs_status cs;
976 :
977 0 : for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
978 0 : continue;
979 :
980 : /* Unlink the temporary file in a critical section to avoid races. */
981 0 : next = node->next;
982 0 : cs = cs_enter ();
983 0 : unlink_status = unlink (name);
984 0 : unlink_errno = errno;
985 0 : *pnode = next;
986 0 : cs_leave (cs);
987 :
988 0 : if (unlink_status != 0)
989 0 : error (0, unlink_errno, _("warning: cannot remove: %s"), name);
990 0 : if (! next)
991 0 : temptail = pnode;
992 0 : free (node);
993 0 : }
994 :
995 : #if HAVE_NL_LANGINFO
996 :
997 : static int
998 : struct_month_cmp (const void *m1, const void *m2)
999 : {
1000 : struct month const *month1 = m1;
1001 : struct month const *month2 = m2;
1002 : return strcmp (month1->name, month2->name);
1003 : }
1004 :
1005 : #endif
1006 :
1007 : /* Initialize the character class tables. */
1008 :
1009 : static void
1010 41 : inittables (void)
1011 : {
1012 : size_t i;
1013 :
1014 10537 : for (i = 0; i < UCHAR_LIM; ++i)
1015 : {
1016 10496 : blanks[i] = !! isblank (i);
1017 10496 : nonprinting[i] = ! isprint (i);
1018 10496 : nondictionary[i] = ! isalnum (i) && ! isblank (i);
1019 10496 : fold_toupper[i] = toupper (i);
1020 : }
1021 :
1022 : #if HAVE_NL_LANGINFO
1023 : /* If we're not in the "C" locale, read different names for months. */
1024 : if (hard_LC_TIME)
1025 : {
1026 : for (i = 0; i < MONTHS_PER_YEAR; i++)
1027 : {
1028 : char const *s;
1029 : size_t s_len;
1030 : size_t j;
1031 : char *name;
1032 :
1033 : s = (char *) nl_langinfo (ABMON_1 + i);
1034 : s_len = strlen (s);
1035 : monthtab[i].name = name = xmalloc (s_len + 1);
1036 : monthtab[i].val = i + 1;
1037 :
1038 : for (j = 0; j < s_len; j++)
1039 : name[j] = fold_toupper[to_uchar (s[j])];
1040 : name[j] = '\0';
1041 : }
1042 : qsort ((void *) monthtab, MONTHS_PER_YEAR,
1043 : sizeof *monthtab, struct_month_cmp);
1044 : }
1045 : #endif
1046 41 : }
1047 :
1048 : /* Specify the amount of main memory to use when sorting. */
1049 : static void
1050 0 : specify_sort_size (int oi, char c, char const *s)
1051 : {
1052 : uintmax_t n;
1053 : char *suffix;
1054 0 : enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");
1055 :
1056 : /* The default unit is KiB. */
1057 0 : if (e == LONGINT_OK && ISDIGIT (suffix[-1]))
1058 : {
1059 0 : if (n <= UINTMAX_MAX / 1024)
1060 0 : n *= 1024;
1061 : else
1062 0 : e = LONGINT_OVERFLOW;
1063 : }
1064 :
1065 : /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */
1066 0 : if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])
1067 0 : switch (suffix[0])
1068 : {
1069 0 : case 'b':
1070 0 : e = LONGINT_OK;
1071 0 : break;
1072 :
1073 0 : case '%':
1074 : {
1075 0 : double mem = physmem_total () * n / 100;
1076 :
1077 : /* Use "<", not "<=", to avoid problems with rounding. */
1078 0 : if (mem < UINTMAX_MAX)
1079 : {
1080 0 : n = mem;
1081 0 : e = LONGINT_OK;
1082 : }
1083 : else
1084 0 : e = LONGINT_OVERFLOW;
1085 : }
1086 0 : break;
1087 : }
1088 :
1089 0 : if (e == LONGINT_OK)
1090 : {
1091 : /* If multiple sort sizes are specified, take the maximum, so
1092 : that option order does not matter. */
1093 0 : if (n < sort_size)
1094 0 : return;
1095 :
1096 0 : sort_size = n;
1097 0 : if (sort_size == n)
1098 : {
1099 0 : sort_size = MAX (sort_size, MIN_SORT_SIZE);
1100 0 : return;
1101 : }
1102 :
1103 0 : e = LONGINT_OVERFLOW;
1104 : }
1105 :
1106 0 : xstrtol_fatal (e, oi, c, long_options, s);
1107 : }
1108 :
1109 : /* Return the default sort size. */
1110 : static size_t
1111 26 : default_sort_size (void)
1112 : {
1113 : /* Let MEM be available memory or 1/8 of total memory, whichever
1114 : is greater. */
1115 26 : double avail = physmem_available ();
1116 26 : double total = physmem_total ();
1117 26 : double mem = MAX (avail, total / 8);
1118 : struct rlimit rlimit;
1119 :
1120 : /* Let SIZE be MEM, but no more than the maximum object size or
1121 : system resource limits. Avoid the MIN macro here, as it is not
1122 : quite right when only one argument is floating point. Don't
1123 : bother to check for values like RLIM_INFINITY since in practice
1124 : they are not much less than SIZE_MAX. */
1125 26 : size_t size = SIZE_MAX;
1126 26 : if (mem < size)
1127 26 : size = mem;
1128 26 : if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
1129 0 : size = rlimit.rlim_cur;
1130 : #ifdef RLIMIT_AS
1131 26 : if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
1132 0 : size = rlimit.rlim_cur;
1133 : #endif
1134 :
1135 : /* Leave a large safety margin for the above limits, as failure can
1136 : occur when they are exceeded. */
1137 26 : size /= 2;
1138 :
1139 : #ifdef RLIMIT_RSS
1140 : /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
1141 : Exceeding RSS is not fatal, but can be quite slow. */
1142 26 : if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
1143 0 : size = rlimit.rlim_cur / 16 * 15;
1144 : #endif
1145 :
1146 : /* Use no less than the minimum. */
1147 26 : return MAX (size, MIN_SORT_SIZE);
1148 : }
1149 :
1150 : /* Return the sort buffer size to use with the input files identified
1151 : by FPS and FILES, which are alternate names of the same files.
1152 : NFILES gives the number of input files; NFPS may be less. Assume
1153 : that each input line requires LINE_BYTES extra bytes' worth of line
1154 : information. Do not exceed the size bound specified by the user
1155 : (or a default size bound, if the user does not specify one). */
1156 :
1157 : static size_t
1158 26 : sort_buffer_size (FILE *const *fps, size_t nfps,
1159 : char *const *files, size_t nfiles,
1160 : size_t line_bytes)
1161 : {
1162 : /* A bound on the input size. If zero, the bound hasn't been
1163 : determined yet. */
1164 : static size_t size_bound;
1165 :
1166 : /* In the worst case, each input byte is a newline. */
1167 26 : size_t worst_case_per_input_byte = line_bytes + 1;
1168 :
1169 : /* Keep enough room for one extra input line and an extra byte.
1170 : This extra room might be needed when preparing to read EOF. */
1171 26 : size_t size = worst_case_per_input_byte + 1;
1172 :
1173 : size_t i;
1174 :
1175 126 : for (i = 0; i < nfiles; i++)
1176 : {
1177 : struct stat st;
1178 : off_t file_size;
1179 : size_t worst_case;
1180 :
1181 66 : if ((i < nfps ? fstat (fileno (fps[i]), &st)
1182 25 : : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)
1183 3 : : stat (files[i], &st))
1184 94 : != 0)
1185 3 : die (_("stat failed"), files[i]);
1186 :
1187 37 : if (S_ISREG (st.st_mode))
1188 24 : file_size = st.st_size;
1189 : else
1190 : {
1191 : /* The file has unknown size. If the user specified a sort
1192 : buffer size, use that; otherwise, guess the size. */
1193 13 : if (sort_size)
1194 0 : return sort_size;
1195 13 : file_size = INPUT_FILE_SIZE_GUESS;
1196 : }
1197 :
1198 37 : if (! size_bound)
1199 : {
1200 26 : size_bound = sort_size;
1201 26 : if (! size_bound)
1202 26 : size_bound = default_sort_size ();
1203 : }
1204 :
1205 : /* Add the amount of memory needed to represent the worst case
1206 : where the input consists entirely of newlines followed by a
1207 : single non-newline. Check for overflow. */
1208 37 : worst_case = file_size * worst_case_per_input_byte + 1;
1209 37 : if (file_size != worst_case / worst_case_per_input_byte
1210 37 : || size_bound - size <= worst_case)
1211 0 : return size_bound;
1212 37 : size += worst_case;
1213 : }
1214 :
1215 23 : return size;
1216 : }
1217 :
1218 : /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
1219 : must be at least sizeof (struct line). Allocate ALLOC bytes
1220 : initially. */
1221 :
1222 : static void
1223 23 : initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
1224 : {
1225 : /* Ensure that the line array is properly aligned. If the desired
1226 : size cannot be allocated, repeatedly halve it until allocation
1227 : succeeds. The smaller allocation may hurt overall performance,
1228 : but that's better than failing. */
1229 : for (;;)
1230 : {
1231 23 : alloc += sizeof (struct line) - alloc % sizeof (struct line);
1232 23 : buf->buf = malloc (alloc);
1233 23 : if (buf->buf)
1234 23 : break;
1235 0 : alloc /= 2;
1236 0 : if (alloc <= line_bytes + 1)
1237 0 : xalloc_die ();
1238 : }
1239 :
1240 23 : buf->line_bytes = line_bytes;
1241 23 : buf->alloc = alloc;
1242 23 : buf->used = buf->left = buf->nlines = 0;
1243 23 : buf->eof = false;
1244 23 : }
1245 :
1246 : /* Return one past the limit of the line array. */
1247 :
1248 : static inline struct line *
1249 69 : buffer_linelim (struct buffer const *buf)
1250 : {
1251 69 : return (struct line *) (buf->buf + buf->alloc);
1252 : }
1253 :
1254 : /* Return a pointer to the first character of the field specified
1255 : by KEY in LINE. */
1256 :
1257 : static char *
1258 0 : begfield (const struct line *line, const struct keyfield *key)
1259 : {
1260 0 : char *ptr = line->text, *lim = ptr + line->length - 1;
1261 0 : size_t sword = key->sword;
1262 0 : size_t schar = key->schar;
1263 : size_t remaining_bytes;
1264 :
1265 : /* The leading field separator itself is included in a field when -t
1266 : is absent. */
1267 :
1268 0 : if (tab != TAB_DEFAULT)
1269 0 : while (ptr < lim && sword--)
1270 : {
1271 0 : while (ptr < lim && *ptr != tab)
1272 0 : ++ptr;
1273 0 : if (ptr < lim)
1274 0 : ++ptr;
1275 : }
1276 : else
1277 0 : while (ptr < lim && sword--)
1278 : {
1279 0 : while (ptr < lim && blanks[to_uchar (*ptr)])
1280 0 : ++ptr;
1281 0 : while (ptr < lim && !blanks[to_uchar (*ptr)])
1282 0 : ++ptr;
1283 : }
1284 :
1285 0 : if (key->skipsblanks)
1286 0 : while (ptr < lim && blanks[to_uchar (*ptr)])
1287 0 : ++ptr;
1288 :
1289 : /* Advance PTR by SCHAR (if possible), but no further than LIM. */
1290 0 : remaining_bytes = lim - ptr;
1291 0 : if (schar < remaining_bytes)
1292 0 : ptr += schar;
1293 : else
1294 0 : ptr = lim;
1295 :
1296 0 : return ptr;
1297 : }
1298 :
1299 : /* Return the limit of (a pointer to the first character after) the field
1300 : in LINE specified by KEY. */
1301 :
1302 : static char *
1303 0 : limfield (const struct line *line, const struct keyfield *key)
1304 : {
1305 0 : char *ptr = line->text, *lim = ptr + line->length - 1;
1306 0 : size_t eword = key->eword, echar = key->echar;
1307 : size_t remaining_bytes;
1308 :
1309 : /* Move PTR past EWORD fields or to one past the last byte on LINE,
1310 : whichever comes first. If there are more than EWORD fields, leave
1311 : PTR pointing at the beginning of the field having zero-based index,
1312 : EWORD. If a delimiter character was specified (via -t), then that
1313 : `beginning' is the first character following the delimiting TAB.
1314 : Otherwise, leave PTR pointing at the first `blank' character after
1315 : the preceding field. */
1316 0 : if (tab != TAB_DEFAULT)
1317 0 : while (ptr < lim && eword--)
1318 : {
1319 0 : while (ptr < lim && *ptr != tab)
1320 0 : ++ptr;
1321 0 : if (ptr < lim && (eword | echar))
1322 0 : ++ptr;
1323 : }
1324 : else
1325 0 : while (ptr < lim && eword--)
1326 : {
1327 0 : while (ptr < lim && blanks[to_uchar (*ptr)])
1328 0 : ++ptr;
1329 0 : while (ptr < lim && !blanks[to_uchar (*ptr)])
1330 0 : ++ptr;
1331 : }
1332 :
1333 : #ifdef POSIX_UNSPECIFIED
1334 : /* The following block of code makes GNU sort incompatible with
1335 : standard Unix sort, so it's ifdef'd out for now.
1336 : The POSIX spec isn't clear on how to interpret this.
1337 : FIXME: request clarification.
1338 :
1339 : From: kwzh@gnu.ai.mit.edu (Karl Heuer)
1340 : Date: Thu, 30 May 96 12:20:41 -0400
1341 : [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
1342 :
1343 : [...]I believe I've found another bug in `sort'.
1344 :
1345 : $ cat /tmp/sort.in
1346 : a b c 2 d
1347 : pq rs 1 t
1348 : $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
1349 : a b c 2 d
1350 : pq rs 1 t
1351 : $ /bin/sort -k1.7,1.7 </tmp/sort.in
1352 : pq rs 1 t
1353 : a b c 2 d
1354 :
1355 : Unix sort produced the answer I expected: sort on the single character
1356 : in column 7. GNU sort produced different results, because it disagrees
1357 : on the interpretation of the key-end spec "M.N". Unix sort reads this
1358 : as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
1359 : "skip M-1 fields, then either N-1 characters or the rest of the current
1360 : field, whichever comes first". This extra clause applies only to
1361 : key-ends, not key-starts.
1362 : */
1363 :
1364 : /* Make LIM point to the end of (one byte past) the current field. */
1365 : if (tab != TAB_DEFAULT)
1366 : {
1367 : char *newlim;
1368 : newlim = memchr (ptr, tab, lim - ptr);
1369 : if (newlim)
1370 : lim = newlim;
1371 : }
1372 : else
1373 : {
1374 : char *newlim;
1375 : newlim = ptr;
1376 : while (newlim < lim && blanks[to_uchar (*newlim)])
1377 : ++newlim;
1378 : while (newlim < lim && !blanks[to_uchar (*newlim)])
1379 : ++newlim;
1380 : lim = newlim;
1381 : }
1382 : #endif
1383 :
1384 : /* If we're ignoring leading blanks when computing the End
1385 : of the field, don't start counting bytes until after skipping
1386 : past any leading blanks. */
1387 0 : if (key->skipeblanks)
1388 0 : while (ptr < lim && blanks[to_uchar (*ptr)])
1389 0 : ++ptr;
1390 :
1391 : /* Advance PTR by ECHAR (if possible), but no further than LIM. */
1392 0 : remaining_bytes = lim - ptr;
1393 0 : if (echar < remaining_bytes)
1394 0 : ptr += echar;
1395 : else
1396 0 : ptr = lim;
1397 :
1398 0 : return ptr;
1399 : }
1400 :
1401 : /* Fill BUF reading from FP, moving buf->left bytes from the end
1402 : of buf->buf to the beginning first. If EOF is reached and the
1403 : file wasn't terminated by a newline, supply one. Set up BUF's line
1404 : table too. FILE is the name of the file corresponding to FP.
1405 : Return true if some input was read. */
1406 :
1407 : static bool
1408 33 : fillbuf (struct buffer *buf, FILE *fp, char const *file)
1409 : {
1410 33 : struct keyfield const *key = keylist;
1411 33 : char eol = eolchar;
1412 33 : size_t line_bytes = buf->line_bytes;
1413 33 : size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
1414 :
1415 33 : if (buf->eof)
1416 0 : return false;
1417 :
1418 33 : if (buf->used != buf->left)
1419 : {
1420 0 : memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
1421 0 : buf->used = buf->left;
1422 0 : buf->nlines = 0;
1423 : }
1424 :
1425 : for (;;)
1426 0 : {
1427 33 : char *ptr = buf->buf + buf->used;
1428 33 : struct line *linelim = buffer_linelim (buf);
1429 33 : struct line *line = linelim - buf->nlines;
1430 33 : size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
1431 33 : char *line_start = buf->nlines ? line->text + line->length : buf->buf;
1432 :
1433 66 : while (line_bytes + 1 < avail)
1434 : {
1435 : /* Read as many bytes as possible, but do not read so many
1436 : bytes that there might not be enough room for the
1437 : corresponding line array. The worst case is when the
1438 : rest of the input file consists entirely of newlines,
1439 : except that the last byte is not a newline. */
1440 33 : size_t readsize = (avail - 1) / (line_bytes + 1);
1441 33 : size_t bytes_read = fread (ptr, 1, readsize, fp);
1442 33 : char *ptrlim = ptr + bytes_read;
1443 : char *p;
1444 33 : avail -= bytes_read;
1445 :
1446 33 : if (bytes_read != readsize)
1447 : {
1448 33 : if (ferror (fp))
1449 10 : die (_("read failed"), file);
1450 23 : if (feof (fp))
1451 : {
1452 23 : buf->eof = true;
1453 23 : if (buf->buf == ptrlim)
1454 0 : return false;
1455 23 : if (ptrlim[-1] != eol)
1456 1 : *ptrlim++ = eol;
1457 : }
1458 : }
1459 :
1460 : /* Find and record each line in the just-read input. */
1461 148 : while ((p = memchr (ptr, eol, ptrlim - ptr)))
1462 : {
1463 102 : ptr = p + 1;
1464 102 : line--;
1465 102 : line->text = line_start;
1466 102 : line->length = ptr - line_start;
1467 102 : mergesize = MAX (mergesize, line->length);
1468 102 : avail -= line_bytes;
1469 :
1470 102 : if (key)
1471 : {
1472 : /* Precompute the position of the first key for
1473 : efficiency. */
1474 0 : line->keylim = (key->eword == SIZE_MAX
1475 : ? p
1476 0 : : limfield (line, key));
1477 :
1478 0 : if (key->sword != SIZE_MAX)
1479 0 : line->keybeg = begfield (line, key);
1480 : else
1481 : {
1482 0 : if (key->skipsblanks)
1483 0 : while (blanks[to_uchar (*line_start)])
1484 0 : line_start++;
1485 0 : line->keybeg = line_start;
1486 : }
1487 : }
1488 :
1489 102 : line_start = ptr;
1490 : }
1491 :
1492 23 : ptr = ptrlim;
1493 23 : if (buf->eof)
1494 23 : break;
1495 : }
1496 :
1497 23 : buf->used = ptr - buf->buf;
1498 23 : buf->nlines = buffer_linelim (buf) - line;
1499 23 : if (buf->nlines != 0)
1500 : {
1501 23 : buf->left = ptr - line_start;
1502 23 : merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
1503 23 : return true;
1504 : }
1505 :
1506 : {
1507 : /* The current input line is too long to fit in the buffer.
1508 : Double the buffer size and try again, keeping it properly
1509 : aligned. */
1510 0 : size_t line_alloc = buf->alloc / sizeof (struct line);
1511 0 : buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
1512 0 : buf->alloc = line_alloc * sizeof (struct line);
1513 : }
1514 : }
1515 : }
1516 :
1517 : /* Compare strings A and B as numbers without explicitly converting them to
1518 : machine numbers. Comparatively slow for short strings, but asymptotically
1519 : hideously fast. */
1520 :
1521 : static int
1522 0 : numcompare (const char *a, const char *b)
1523 : {
1524 0 : while (blanks[to_uchar (*a)])
1525 0 : a++;
1526 0 : while (blanks[to_uchar (*b)])
1527 0 : b++;
1528 :
1529 0 : return strnumcmp (a, b, decimal_point, thousands_sep);
1530 : }
1531 :
1532 : static int
1533 0 : general_numcompare (const char *sa, const char *sb)
1534 : {
1535 : /* FIXME: add option to warn about failed conversions. */
1536 : /* FIXME: maybe add option to try expensive FP conversion
1537 : only if A and B can't be compared more cheaply/accurately. */
1538 :
1539 : char *ea;
1540 : char *eb;
1541 0 : double a = strtod (sa, &ea);
1542 0 : double b = strtod (sb, &eb);
1543 :
1544 : /* Put conversion errors at the start of the collating sequence. */
1545 0 : if (sa == ea)
1546 0 : return sb == eb ? 0 : -1;
1547 0 : if (sb == eb)
1548 0 : return 1;
1549 :
1550 : /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
1551 : conversion errors but before numbers; sort them by internal
1552 : bit-pattern, for lack of a more portable alternative. */
1553 0 : return (a < b ? -1
1554 0 : : a > b ? 1
1555 0 : : a == b ? 0
1556 0 : : b == b ? -1
1557 0 : : a == a ? 1
1558 0 : : memcmp ((char *) &a, (char *) &b, sizeof a));
1559 : }
1560 :
1561 : /* Return an integer in 1..12 of the month name MONTH with length LEN.
1562 : Return 0 if the name in S is not recognized. */
1563 :
1564 : static int
1565 0 : getmonth (char const *month, size_t len)
1566 : {
1567 0 : size_t lo = 0;
1568 0 : size_t hi = MONTHS_PER_YEAR;
1569 0 : char const *monthlim = month + len;
1570 :
1571 : for (;;)
1572 : {
1573 0 : if (month == monthlim)
1574 0 : return 0;
1575 0 : if (!blanks[to_uchar (*month)])
1576 0 : break;
1577 0 : ++month;
1578 : }
1579 :
1580 : do
1581 : {
1582 0 : size_t ix = (lo + hi) / 2;
1583 0 : char const *m = month;
1584 0 : char const *n = monthtab[ix].name;
1585 :
1586 0 : for (;; m++, n++)
1587 : {
1588 0 : if (!*n)
1589 0 : return monthtab[ix].val;
1590 0 : if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))
1591 : {
1592 0 : hi = ix;
1593 0 : break;
1594 : }
1595 0 : else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))
1596 : {
1597 0 : lo = ix + 1;
1598 0 : break;
1599 : }
1600 : }
1601 : }
1602 0 : while (lo < hi);
1603 :
1604 0 : return 0;
1605 : }
1606 :
1607 : /* A source of random data. */
1608 : static struct randread_source *randread_source;
1609 :
1610 : /* Return the Ith randomly-generated state. The caller must invoke
1611 : random_state (H) for all H less than I before invoking random_state
1612 : (I). */
1613 :
1614 : static struct md5_ctx
1615 0 : random_state (size_t i)
1616 : {
1617 : /* An array of states resulting from the random data, and counts of
1618 : its used and allocated members. */
1619 : static struct md5_ctx *state;
1620 : static size_t used;
1621 : static size_t allocated;
1622 :
1623 0 : struct md5_ctx *s = &state[i];
1624 :
1625 0 : if (used <= i)
1626 : {
1627 : unsigned char buf[MD5_DIGEST_SIZE];
1628 :
1629 0 : used++;
1630 :
1631 0 : if (allocated <= i)
1632 : {
1633 0 : state = X2NREALLOC (state, &allocated);
1634 0 : s = &state[i];
1635 : }
1636 :
1637 0 : randread (randread_source, buf, sizeof buf);
1638 0 : md5_init_ctx (s);
1639 0 : md5_process_bytes (buf, sizeof buf, s);
1640 : }
1641 :
1642 0 : return *s;
1643 : }
1644 :
1645 : /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
1646 : with length LENGTHB. Return negative if less, zero if equal,
1647 : positive if greater. */
1648 :
1649 : static int
1650 0 : cmp_hashes (char const *texta, size_t lena,
1651 : char const *textb, size_t lenb)
1652 : {
1653 : /* Try random hashes until a pair of hashes disagree. But if the
1654 : first pair of random hashes agree, check whether the keys are
1655 : identical and if so report no difference. */
1656 : int diff;
1657 : size_t i;
1658 0 : for (i = 0; ; i++)
1659 0 : {
1660 : uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
1661 : struct md5_ctx s[2];
1662 0 : s[0] = s[1] = random_state (i);
1663 0 : md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
1664 0 : md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
1665 0 : diff = memcmp (dig[0], dig[1], sizeof dig[0]);
1666 0 : if (diff != 0)
1667 0 : break;
1668 0 : if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0)
1669 0 : break;
1670 : }
1671 :
1672 0 : return diff;
1673 : }
1674 :
1675 : /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
1676 : using one or more random hash functions. */
1677 :
1678 : static int
1679 0 : compare_random (char *restrict texta, size_t lena,
1680 : char *restrict textb, size_t lenb)
1681 : {
1682 : int diff;
1683 :
1684 0 : if (! hard_LC_COLLATE)
1685 0 : diff = cmp_hashes (texta, lena, textb, lenb);
1686 : else
1687 : {
1688 : /* Transform the text into the basis of comparison, so that byte
1689 : strings that would otherwise considered to be equal are
1690 : considered equal here even if their bytes differ. */
1691 :
1692 0 : char *buf = NULL;
1693 : char stackbuf[4000];
1694 0 : size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
1695 0 : bool a_fits = tlena <= sizeof stackbuf;
1696 0 : size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
1697 : (a_fits ? sizeof stackbuf - tlena : 0),
1698 : textb, lenb);
1699 :
1700 0 : if (a_fits && tlena + tlenb <= sizeof stackbuf)
1701 0 : buf = stackbuf;
1702 : else
1703 : {
1704 : /* Adding 1 to the buffer size lets xmemxfrm run a bit
1705 : faster by avoiding the need for an extra buffer copy. */
1706 0 : buf = xmalloc (tlena + tlenb + 1);
1707 0 : xmemxfrm (buf, tlena + 1, texta, lena);
1708 0 : xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
1709 : }
1710 :
1711 0 : diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
1712 :
1713 0 : if (buf != stackbuf)
1714 0 : free (buf);
1715 : }
1716 :
1717 0 : return diff;
1718 : }
1719 :
1720 : /* Compare two lines A and B trying every key in sequence until there
1721 : are no more keys or a difference is found. */
1722 :
1723 : static int
1724 0 : keycompare (const struct line *a, const struct line *b)
1725 : {
1726 0 : struct keyfield const *key = keylist;
1727 :
1728 : /* For the first iteration only, the key positions have been
1729 : precomputed for us. */
1730 0 : char *texta = a->keybeg;
1731 0 : char *textb = b->keybeg;
1732 0 : char *lima = a->keylim;
1733 0 : char *limb = b->keylim;
1734 :
1735 : int diff;
1736 :
1737 : for (;;)
1738 0 : {
1739 0 : char const *translate = key->translate;
1740 0 : bool const *ignore = key->ignore;
1741 :
1742 : /* Find the lengths. */
1743 0 : size_t lena = lima <= texta ? 0 : lima - texta;
1744 0 : size_t lenb = limb <= textb ? 0 : limb - textb;
1745 :
1746 : /* Actually compare the fields. */
1747 :
1748 0 : if (key->random)
1749 0 : diff = compare_random (texta, lena, textb, lenb);
1750 0 : else if (key->numeric | key->general_numeric)
1751 : {
1752 0 : char savea = *lima, saveb = *limb;
1753 :
1754 0 : *lima = *limb = '\0';
1755 0 : diff = ((key->numeric ? numcompare : general_numcompare)
1756 : (texta, textb));
1757 0 : *lima = savea, *limb = saveb;
1758 : }
1759 0 : else if (key->month)
1760 0 : diff = getmonth (texta, lena) - getmonth (textb, lenb);
1761 : /* Sorting like this may become slow, so in a simple locale the user
1762 : can select a faster sort that is similar to ascii sort. */
1763 0 : else if (hard_LC_COLLATE)
1764 : {
1765 0 : if (ignore || translate)
1766 0 : {
1767 : char buf[4000];
1768 0 : size_t size = lena + 1 + lenb + 1;
1769 0 : char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
1770 0 : char *copy_b = copy_a + lena + 1;
1771 : size_t new_len_a, new_len_b, i;
1772 :
1773 : /* Ignore and/or translate chars before comparing. */
1774 0 : for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)
1775 : {
1776 0 : if (i < lena)
1777 : {
1778 0 : copy_a[new_len_a] = (translate
1779 0 : ? translate[to_uchar (texta[i])]
1780 0 : : texta[i]);
1781 0 : if (!ignore || !ignore[to_uchar (texta[i])])
1782 0 : ++new_len_a;
1783 : }
1784 0 : if (i < lenb)
1785 : {
1786 0 : copy_b[new_len_b] = (translate
1787 0 : ? translate[to_uchar (textb[i])]
1788 0 : : textb [i]);
1789 0 : if (!ignore || !ignore[to_uchar (textb[i])])
1790 0 : ++new_len_b;
1791 : }
1792 : }
1793 :
1794 0 : diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
1795 :
1796 0 : if (sizeof buf < size)
1797 0 : free (copy_a);
1798 : }
1799 0 : else if (lena == 0)
1800 0 : diff = - NONZERO (lenb);
1801 0 : else if (lenb == 0)
1802 0 : goto greater;
1803 : else
1804 0 : diff = xmemcoll (texta, lena, textb, lenb);
1805 : }
1806 0 : else if (ignore)
1807 : {
1808 : #define CMP_WITH_IGNORE(A, B) \
1809 : do \
1810 : { \
1811 : for (;;) \
1812 : { \
1813 : while (texta < lima && ignore[to_uchar (*texta)]) \
1814 : ++texta; \
1815 : while (textb < limb && ignore[to_uchar (*textb)]) \
1816 : ++textb; \
1817 : if (! (texta < lima && textb < limb)) \
1818 : break; \
1819 : diff = to_uchar (A) - to_uchar (B); \
1820 : if (diff) \
1821 : goto not_equal; \
1822 : ++texta; \
1823 : ++textb; \
1824 : } \
1825 : \
1826 : diff = (texta < lima) - (textb < limb); \
1827 : } \
1828 : while (0)
1829 :
1830 0 : if (translate)
1831 0 : CMP_WITH_IGNORE (translate[to_uchar (*texta)],
1832 : translate[to_uchar (*textb)]);
1833 : else
1834 0 : CMP_WITH_IGNORE (*texta, *textb);
1835 : }
1836 0 : else if (lena == 0)
1837 0 : diff = - NONZERO (lenb);
1838 0 : else if (lenb == 0)
1839 0 : goto greater;
1840 : else
1841 : {
1842 0 : if (translate)
1843 : {
1844 0 : while (texta < lima && textb < limb)
1845 : {
1846 0 : diff = (to_uchar (translate[to_uchar (*texta++)])
1847 0 : - to_uchar (translate[to_uchar (*textb++)]));
1848 0 : if (diff)
1849 0 : goto not_equal;
1850 : }
1851 : }
1852 : else
1853 : {
1854 0 : diff = memcmp (texta, textb, MIN (lena, lenb));
1855 0 : if (diff)
1856 0 : goto not_equal;
1857 : }
1858 0 : diff = lena < lenb ? -1 : lena != lenb;
1859 : }
1860 :
1861 0 : if (diff)
1862 0 : goto not_equal;
1863 :
1864 0 : key = key->next;
1865 0 : if (! key)
1866 0 : break;
1867 :
1868 : /* Find the beginning and limit of the next field. */
1869 0 : if (key->eword != SIZE_MAX)
1870 0 : lima = limfield (a, key), limb = limfield (b, key);
1871 : else
1872 0 : lima = a->text + a->length - 1, limb = b->text + b->length - 1;
1873 :
1874 0 : if (key->sword != SIZE_MAX)
1875 0 : texta = begfield (a, key), textb = begfield (b, key);
1876 : else
1877 : {
1878 0 : texta = a->text, textb = b->text;
1879 0 : if (key->skipsblanks)
1880 : {
1881 0 : while (texta < lima && blanks[to_uchar (*texta)])
1882 0 : ++texta;
1883 0 : while (textb < limb && blanks[to_uchar (*textb)])
1884 0 : ++textb;
1885 : }
1886 : }
1887 : }
1888 :
1889 0 : return 0;
1890 :
1891 0 : greater:
1892 0 : diff = 1;
1893 0 : not_equal:
1894 0 : return key->reverse ? -diff : diff;
1895 : }
1896 :
1897 : /* Compare two lines A and B, returning negative, zero, or positive
1898 : depending on whether A compares less than, equal to, or greater than B. */
1899 :
1900 : static int
1901 165 : compare (const struct line *a, const struct line *b)
1902 : {
1903 : int diff;
1904 : size_t alen, blen;
1905 :
1906 : /* First try to compare on the specified keys (if any).
1907 : The only two cases with no key at all are unadorned sort,
1908 : and unadorned sort -r. */
1909 165 : if (keylist)
1910 : {
1911 0 : diff = keycompare (a, b);
1912 0 : if (diff | unique | stable)
1913 0 : return diff;
1914 : }
1915 :
1916 : /* If the keys all compare equal (or no keys were specified)
1917 : fall through to the default comparison. */
1918 165 : alen = a->length - 1, blen = b->length - 1;
1919 :
1920 165 : if (alen == 0)
1921 139 : diff = - NONZERO (blen);
1922 26 : else if (blen == 0)
1923 23 : diff = 1;
1924 3 : else if (hard_LC_COLLATE)
1925 0 : diff = xmemcoll (a->text, alen, b->text, blen);
1926 3 : else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
1927 2 : diff = alen < blen ? -1 : alen != blen;
1928 :
1929 165 : return reverse ? -diff : diff;
1930 : }
1931 :
1932 : /* Check that the lines read from FILE_NAME come in order. Return
1933 : true if they are in order. If CHECKONLY == 'c', also print a
1934 : diagnostic (FILE_NAME, line number, contents of line) to stderr if
1935 : they are not in order. */
1936 :
1937 : static bool
1938 0 : check (char const *file_name, char checkonly)
1939 : {
1940 0 : FILE *fp = xfopen (file_name, "r");
1941 : struct buffer buf; /* Input buffer. */
1942 : struct line temp; /* Copy of previous line. */
1943 0 : size_t alloc = 0;
1944 0 : uintmax_t line_number = 0;
1945 0 : struct keyfield const *key = keylist;
1946 0 : bool nonunique = ! unique;
1947 0 : bool ordered = true;
1948 :
1949 0 : initbuf (&buf, sizeof (struct line),
1950 : MAX (merge_buffer_size, sort_size));
1951 0 : temp.text = NULL;
1952 :
1953 0 : while (fillbuf (&buf, fp, file_name))
1954 : {
1955 0 : struct line const *line = buffer_linelim (&buf);
1956 0 : struct line const *linebase = line - buf.nlines;
1957 :
1958 : /* Make sure the line saved from the old buffer contents is
1959 : less than or equal to the first line of the new buffer. */
1960 0 : if (alloc && nonunique <= compare (&temp, line - 1))
1961 : {
1962 0 : found_disorder:
1963 : {
1964 0 : if (checkonly == 'c')
1965 : {
1966 0 : struct line const *disorder_line = line - 1;
1967 0 : uintmax_t disorder_line_number =
1968 0 : buffer_linelim (&buf) - disorder_line + line_number;
1969 : char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];
1970 0 : fprintf (stderr, _("%s: %s:%s: disorder: "),
1971 : program_name, file_name,
1972 : umaxtostr (disorder_line_number, hr_buf));
1973 0 : write_bytes (disorder_line->text, disorder_line->length,
1974 : stderr, _("standard error"));
1975 : }
1976 :
1977 0 : ordered = false;
1978 0 : break;
1979 : }
1980 : }
1981 :
1982 : /* Compare each line in the buffer with its successor. */
1983 0 : while (linebase < --line)
1984 0 : if (nonunique <= compare (line, line - 1))
1985 0 : goto found_disorder;
1986 :
1987 0 : line_number += buf.nlines;
1988 :
1989 : /* Save the last line of the buffer. */
1990 0 : if (alloc < line->length)
1991 : {
1992 : do
1993 : {
1994 0 : alloc *= 2;
1995 0 : if (! alloc)
1996 : {
1997 0 : alloc = line->length;
1998 0 : break;
1999 : }
2000 : }
2001 0 : while (alloc < line->length);
2002 :
2003 0 : temp.text = xrealloc (temp.text, alloc);
2004 : }
2005 0 : memcpy (temp.text, line->text, line->length);
2006 0 : temp.length = line->length;
2007 0 : if (key)
2008 : {
2009 0 : temp.keybeg = temp.text + (line->keybeg - line->text);
2010 0 : temp.keylim = temp.text + (line->keylim - line->text);
2011 : }
2012 : }
2013 :
2014 0 : xfclose (fp, file_name);
2015 0 : free (buf.buf);
2016 0 : free (temp.text);
2017 0 : return ordered;
2018 : }
2019 :
2020 : /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
2021 : files (all of which are at the start of the FILES array), and
2022 : NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
2023 : Close input and output files before returning.
2024 : OUTPUT_FILE gives the name of the output file. If it is NULL,
2025 : the output file is standard output. If OFP is NULL, the output
2026 : file has not been opened yet (or written to, if standard output). */
2027 :
2028 : static void
2029 0 : mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
2030 : FILE *ofp, char const *output_file)
2031 : {
2032 : FILE *fps[NMERGE]; /* Input streams for each file. */
2033 : struct buffer buffer[NMERGE]; /* Input buffers for each file. */
2034 : struct line saved; /* Saved line storage for unique check. */
2035 0 : struct line const *savedline = NULL;
2036 : /* &saved if there is a saved line. */
2037 0 : size_t savealloc = 0; /* Size allocated for the saved line. */
2038 : struct line const *cur[NMERGE]; /* Current line in each line table. */
2039 : struct line const *base[NMERGE]; /* Base of each line table. */
2040 : size_t ord[NMERGE]; /* Table representing a permutation of fps,
2041 : such that cur[ord[0]] is the smallest line
2042 : and will be next output. */
2043 : size_t i;
2044 : size_t j;
2045 : size_t t;
2046 0 : struct keyfield const *key = keylist;
2047 0 : saved.text = NULL;
2048 :
2049 : /* Read initial lines from each input file. */
2050 0 : for (i = 0; i < nfiles; )
2051 : {
2052 0 : fps[i] = (files[i].pid
2053 0 : ? open_temp (files[i].name, files[i].pid)
2054 0 : : xfopen (files[i].name, "r"));
2055 0 : initbuf (&buffer[i], sizeof (struct line),
2056 0 : MAX (merge_buffer_size, sort_size / nfiles));
2057 0 : if (fillbuf (&buffer[i], fps[i], files[i].name))
2058 : {
2059 0 : struct line const *linelim = buffer_linelim (&buffer[i]);
2060 0 : cur[i] = linelim - 1;
2061 0 : base[i] = linelim - buffer[i].nlines;
2062 0 : i++;
2063 : }
2064 : else
2065 : {
2066 : /* fps[i] is empty; eliminate it from future consideration. */
2067 0 : xfclose (fps[i], files[i].name);
2068 0 : if (i < ntemps)
2069 : {
2070 0 : ntemps--;
2071 0 : zaptemp (files[i].name);
2072 : }
2073 0 : free (buffer[i].buf);
2074 0 : --nfiles;
2075 0 : for (j = i; j < nfiles; ++j)
2076 0 : files[j] = files[j + 1];
2077 : }
2078 : }
2079 :
2080 0 : if (! ofp)
2081 0 : ofp = xfopen (output_file, "w");
2082 :
2083 : /* Set up the ord table according to comparisons among input lines.
2084 : Since this only reorders two items if one is strictly greater than
2085 : the other, it is stable. */
2086 0 : for (i = 0; i < nfiles; ++i)
2087 0 : ord[i] = i;
2088 0 : for (i = 1; i < nfiles; ++i)
2089 0 : if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
2090 0 : t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
2091 :
2092 : /* Repeatedly output the smallest line until no input remains. */
2093 0 : while (nfiles)
2094 : {
2095 0 : struct line const *smallest = cur[ord[0]];
2096 :
2097 : /* If uniquified output is turned on, output only the first of
2098 : an identical series of lines. */
2099 0 : if (unique)
2100 : {
2101 0 : if (savedline && compare (savedline, smallest))
2102 : {
2103 0 : savedline = NULL;
2104 0 : write_bytes (saved.text, saved.length, ofp, output_file);
2105 : }
2106 0 : if (!savedline)
2107 : {
2108 0 : savedline = &saved;
2109 0 : if (savealloc < smallest->length)
2110 : {
2111 : do
2112 0 : if (! savealloc)
2113 : {
2114 0 : savealloc = smallest->length;
2115 0 : break;
2116 : }
2117 0 : while ((savealloc *= 2) < smallest->length);
2118 :
2119 0 : saved.text = xrealloc (saved.text, savealloc);
2120 : }
2121 0 : saved.length = smallest->length;
2122 0 : memcpy (saved.text, smallest->text, saved.length);
2123 0 : if (key)
2124 : {
2125 0 : saved.keybeg =
2126 0 : saved.text + (smallest->keybeg - smallest->text);
2127 0 : saved.keylim =
2128 0 : saved.text + (smallest->keylim - smallest->text);
2129 : }
2130 : }
2131 : }
2132 : else
2133 0 : write_bytes (smallest->text, smallest->length, ofp, output_file);
2134 :
2135 : /* Check if we need to read more lines into core. */
2136 0 : if (base[ord[0]] < smallest)
2137 0 : cur[ord[0]] = smallest - 1;
2138 : else
2139 : {
2140 0 : if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
2141 : {
2142 0 : struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
2143 0 : cur[ord[0]] = linelim - 1;
2144 0 : base[ord[0]] = linelim - buffer[ord[0]].nlines;
2145 : }
2146 : else
2147 : {
2148 : /* We reached EOF on fps[ord[0]]. */
2149 0 : for (i = 1; i < nfiles; ++i)
2150 0 : if (ord[i] > ord[0])
2151 0 : --ord[i];
2152 0 : --nfiles;
2153 0 : xfclose (fps[ord[0]], files[ord[0]].name);
2154 0 : if (ord[0] < ntemps)
2155 : {
2156 0 : ntemps--;
2157 0 : zaptemp (files[ord[0]].name);
2158 : }
2159 0 : free (buffer[ord[0]].buf);
2160 0 : for (i = ord[0]; i < nfiles; ++i)
2161 : {
2162 0 : fps[i] = fps[i + 1];
2163 0 : files[i] = files[i + 1];
2164 0 : buffer[i] = buffer[i + 1];
2165 0 : cur[i] = cur[i + 1];
2166 0 : base[i] = base[i + 1];
2167 : }
2168 0 : for (i = 0; i < nfiles; ++i)
2169 0 : ord[i] = ord[i + 1];
2170 0 : continue;
2171 : }
2172 : }
2173 :
2174 : /* The new line just read in may be larger than other lines
2175 : already in main memory; push it back in the queue until we
2176 : encounter a line larger than it. Optimize for the common
2177 : case where the new line is smallest. */
2178 : {
2179 0 : size_t lo = 1;
2180 0 : size_t hi = nfiles;
2181 0 : size_t probe = lo;
2182 0 : size_t ord0 = ord[0];
2183 : size_t count_of_smaller_lines;
2184 :
2185 0 : while (lo < hi)
2186 : {
2187 0 : int cmp = compare (cur[ord0], cur[ord[probe]]);
2188 0 : if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
2189 0 : hi = probe;
2190 : else
2191 0 : lo = probe + 1;
2192 0 : probe = (lo + hi) / 2;
2193 : }
2194 :
2195 0 : count_of_smaller_lines = lo - 1;
2196 0 : for (j = 0; j < count_of_smaller_lines; j++)
2197 0 : ord[j] = ord[j + 1];
2198 0 : ord[count_of_smaller_lines] = ord0;
2199 : }
2200 :
2201 : /* Free up some resources every once in a while. */
2202 0 : if (MAX_PROCS_BEFORE_REAP < nprocs)
2203 0 : reap_some ();
2204 : }
2205 :
2206 0 : if (unique && savedline)
2207 : {
2208 0 : write_bytes (saved.text, saved.length, ofp, output_file);
2209 0 : free (saved.text);
2210 : }
2211 :
2212 0 : xfclose (ofp, output_file);
2213 0 : }
2214 :
2215 : /* Merge into T the two sorted arrays of lines LO (with NLO members)
2216 : and HI (with NHI members). T, LO, and HI point just past their
2217 : respective arrays, and the arrays are in reverse order. NLO and
2218 : NHI must be positive, and HI - NHI must equal T - (NLO + NHI). */
2219 :
2220 : static inline void
2221 119 : mergelines (struct line *t,
2222 : struct line const *lo, size_t nlo,
2223 : struct line const *hi, size_t nhi)
2224 : {
2225 : for (;;)
2226 195 : if (compare (lo - 1, hi - 1) <= 0)
2227 : {
2228 98 : *--t = *--lo;
2229 98 : if (! --nlo)
2230 : {
2231 : /* HI - NHI equalled T - (NLO + NHI) when this function
2232 : began. Therefore HI must equal T now, and there is no
2233 : need to copy from HI to T. */
2234 38 : return;
2235 : }
2236 : }
2237 : else
2238 : {
2239 21 : *--t = *--hi;
2240 21 : if (! --nhi)
2241 : {
2242 : do
2243 6 : *--t = *--lo;
2244 6 : while (--nlo);
2245 :
2246 5 : return;
2247 : }
2248 : }
2249 : }
2250 :
2251 : /* Sort the array LINES with NLINES members, using TEMP for temporary space.
2252 : NLINES must be at least 2.
2253 : The input and output arrays are in reverse order, and LINES and
2254 : TEMP point just past the end of their respective arrays.
2255 :
2256 : Use a recursive divide-and-conquer algorithm, in the style
2257 : suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
2258 : the optimization suggested by exercise 5.2.4-10; this requires room
2259 : for only 1.5*N lines, rather than the usual 2*N lines. Knuth
2260 : writes that this memory optimization was originally published by
2261 : D. A. Bell, Comp J. 1 (1958), 75. */
2262 :
2263 : static void
2264 50 : sortlines (struct line *lines, size_t nlines, struct line *temp)
2265 : {
2266 50 : if (nlines == 2)
2267 : {
2268 22 : if (0 < compare (&lines[-1], &lines[-2]))
2269 : {
2270 1 : struct line tmp = lines[-1];
2271 1 : lines[-1] = lines[-2];
2272 1 : lines[-2] = tmp;
2273 : }
2274 : }
2275 : else
2276 : {
2277 28 : size_t nlo = nlines / 2;
2278 28 : size_t nhi = nlines - nlo;
2279 28 : struct line *lo = lines;
2280 28 : struct line *hi = lines - nlo;
2281 28 : struct line *sorted_lo = temp;
2282 :
2283 28 : sortlines (hi, nhi, temp);
2284 28 : if (1 < nlo)
2285 24 : sortlines_temp (lo, nlo, sorted_lo);
2286 : else
2287 4 : sorted_lo[-1] = lo[-1];
2288 :
2289 28 : mergelines (lines, sorted_lo, nlo, hi, nhi);
2290 : }
2291 50 : }
2292 :
2293 : /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP
2294 : rather than sorting in place. */
2295 :
2296 : static void
2297 39 : sortlines_temp (struct line *lines, size_t nlines, struct line *temp)
2298 : {
2299 39 : if (nlines == 2)
2300 : {
2301 : /* Declare `swap' as int, not bool, to work around a bug
2302 : <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>
2303 : in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
2304 24 : int swap = (0 < compare (&lines[-1], &lines[-2]));
2305 24 : temp[-1] = lines[-1 - swap];
2306 24 : temp[-2] = lines[-2 + swap];
2307 : }
2308 : else
2309 : {
2310 15 : size_t nlo = nlines / 2;
2311 15 : size_t nhi = nlines - nlo;
2312 15 : struct line *lo = lines;
2313 15 : struct line *hi = lines - nlo;
2314 15 : struct line *sorted_hi = temp - nlo;
2315 :
2316 15 : sortlines_temp (hi, nhi, sorted_hi);
2317 15 : if (1 < nlo)
2318 9 : sortlines (lo, nlo, temp);
2319 :
2320 15 : mergelines (temp, lo, nlo, sorted_hi, nhi);
2321 : }
2322 39 : }
2323 :
2324 : /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is
2325 : the same as OUTFILE. If found, merge the found instances (and perhaps
2326 : some other files) into a temporary file so that it can in turn be
2327 : merged into OUTFILE without destroying OUTFILE before it is completely
2328 : read. Return the new value of NFILES, which differs from the old if
2329 : some merging occurred.
2330 :
2331 : This test ensures that an otherwise-erroneous use like
2332 : "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
2333 : It's not clear that POSIX requires this nicety.
2334 : Detect common error cases, but don't try to catch obscure cases like
2335 : "cat ... FILE ... | sort -m -o FILE"
2336 : where traditional "sort" doesn't copy the input and where
2337 : people should know that they're getting into trouble anyway.
2338 : Catching these obscure cases would slow down performance in
2339 : common cases. */
2340 :
2341 : static size_t
2342 0 : avoid_trashing_input (struct sortfile *files, size_t ntemps,
2343 : size_t nfiles, char const *outfile)
2344 : {
2345 : size_t i;
2346 0 : bool got_outstat = false;
2347 : struct stat outstat;
2348 :
2349 0 : for (i = ntemps; i < nfiles; i++)
2350 : {
2351 0 : bool is_stdin = STREQ (files[i].name, "-");
2352 : bool same;
2353 : struct stat instat;
2354 :
2355 0 : if (outfile && STREQ (outfile, files[i].name) && !is_stdin)
2356 0 : same = true;
2357 : else
2358 : {
2359 0 : if (! got_outstat)
2360 : {
2361 0 : if ((outfile
2362 0 : ? stat (outfile, &outstat)
2363 0 : : fstat (STDOUT_FILENO, &outstat))
2364 0 : != 0)
2365 0 : break;
2366 0 : got_outstat = true;
2367 : }
2368 :
2369 0 : same = (((is_stdin
2370 0 : ? fstat (STDIN_FILENO, &instat)
2371 0 : : stat (files[i].name, &instat))
2372 0 : == 0)
2373 0 : && SAME_INODE (instat, outstat));
2374 : }
2375 :
2376 0 : if (same)
2377 : {
2378 : FILE *tftp;
2379 : pid_t pid;
2380 0 : char *temp = create_temp (&tftp, &pid);
2381 0 : mergefps (&files[i],0, nfiles - i, tftp, temp);
2382 0 : files[i].name = temp;
2383 0 : files[i].pid = pid;
2384 0 : return i + 1;
2385 : }
2386 : }
2387 :
2388 0 : return nfiles;
2389 : }
2390 :
2391 : /* Merge the input FILES. NTEMPS is the number of files at the
2392 : start of FILES that are temporary; it is zero at the top level.
2393 : NFILES is the total number of files. Put the output in
2394 : OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */
2395 :
2396 : static void
2397 0 : merge (struct sortfile *files, size_t ntemps, size_t nfiles,
2398 : char const *output_file)
2399 : {
2400 0 : while (NMERGE < nfiles)
2401 : {
2402 : /* Number of input files processed so far. */
2403 : size_t in;
2404 :
2405 : /* Number of output files generated so far. */
2406 : size_t out;
2407 :
2408 : /* nfiles % NMERGE; this counts input files that are left over
2409 : after all full-sized merges have been done. */
2410 : size_t remainder;
2411 :
2412 : /* Number of easily-available slots at the next loop iteration. */
2413 : size_t cheap_slots;
2414 :
2415 : /* Do as many NMERGE-size merges as possible. */
2416 0 : for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE)
2417 : {
2418 : FILE *tfp;
2419 : pid_t pid;
2420 0 : char *temp = create_temp (&tfp, &pid);
2421 0 : size_t nt = MIN (ntemps, NMERGE);
2422 0 : ntemps -= nt;
2423 0 : mergefps (&files[in], nt, NMERGE, tfp, temp);
2424 0 : files[out].name = temp;
2425 0 : files[out].pid = pid;
2426 : }
2427 :
2428 0 : remainder = nfiles - in;
2429 0 : cheap_slots = NMERGE - out % NMERGE;
2430 :
2431 0 : if (cheap_slots < remainder)
2432 : {
2433 : /* So many files remain that they can't all be put into the last
2434 : NMERGE-sized output window. Do one more merge. Merge as few
2435 : files as possible, to avoid needless I/O. */
2436 0 : size_t nshortmerge = remainder - cheap_slots + 1;
2437 : FILE *tfp;
2438 : pid_t pid;
2439 0 : char *temp = create_temp (&tfp, &pid);
2440 0 : size_t nt = MIN (ntemps, nshortmerge);
2441 0 : ntemps -= nt;
2442 0 : mergefps (&files[in], nt, nshortmerge, tfp, temp);
2443 0 : files[out].name = temp;
2444 0 : files[out++].pid = pid;
2445 0 : in += nshortmerge;
2446 : }
2447 :
2448 : /* Put the remaining input files into the last NMERGE-sized output
2449 : window, so they will be merged in the next pass. */
2450 0 : memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);
2451 0 : ntemps += out;
2452 0 : nfiles -= in - out;
2453 : }
2454 :
2455 0 : nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);
2456 0 : mergefps (files, ntemps, nfiles, NULL, output_file);
2457 0 : }
2458 :
2459 : /* Sort NFILES FILES onto OUTPUT_FILE. */
2460 :
2461 : static void
2462 34 : sort (char * const *files, size_t nfiles, char const *output_file)
2463 : {
2464 : struct buffer buf;
2465 34 : size_t ntemps = 0;
2466 34 : bool output_file_created = false;
2467 :
2468 34 : buf.alloc = 0;
2469 :
2470 78 : while (nfiles)
2471 : {
2472 : char const *temp_output;
2473 44 : char const *file = *files;
2474 44 : FILE *fp = xfopen (file, "r");
2475 : FILE *tfp;
2476 36 : size_t bytes_per_line = (2 * sizeof (struct line)
2477 : - sizeof (struct line) / 2);
2478 :
2479 36 : if (! buf.alloc)
2480 26 : initbuf (&buf, bytes_per_line,
2481 : sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
2482 33 : buf.eof = false;
2483 33 : files++;
2484 33 : nfiles--;
2485 :
2486 66 : while (fillbuf (&buf, fp, file))
2487 : {
2488 : struct line *line;
2489 : struct line *linebase;
2490 :
2491 23 : if (buf.eof && nfiles
2492 20 : && (bytes_per_line + 1
2493 10 : < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
2494 : {
2495 : /* End of file, but there is more input and buffer room.
2496 : Concatenate the next input file; this is faster in
2497 : the usual case. */
2498 10 : buf.left = buf.used;
2499 10 : break;
2500 : }
2501 :
2502 13 : line = buffer_linelim (&buf);
2503 13 : linebase = line - buf.nlines;
2504 13 : if (1 < buf.nlines)
2505 13 : sortlines (line, buf.nlines, linebase);
2506 13 : if (buf.eof && !nfiles && !ntemps && !buf.left)
2507 : {
2508 13 : xfclose (fp, file);
2509 13 : tfp = xfopen (output_file, "w");
2510 13 : temp_output = output_file;
2511 13 : output_file_created = true;
2512 : }
2513 : else
2514 : {
2515 0 : ++ntemps;
2516 0 : temp_output = create_temp (&tfp, NULL);
2517 : }
2518 :
2519 : do
2520 : {
2521 102 : line--;
2522 102 : write_bytes (line->text, line->length, tfp, temp_output);
2523 102 : if (unique)
2524 0 : while (linebase < line && compare (line, line - 1) == 0)
2525 0 : line--;
2526 : }
2527 102 : while (linebase < line);
2528 :
2529 13 : xfclose (tfp, temp_output);
2530 :
2531 : /* Free up some resources every once in a while. */
2532 13 : if (MAX_PROCS_BEFORE_REAP < nprocs)
2533 0 : reap_some ();
2534 :
2535 13 : if (output_file_created)
2536 26 : goto finish;
2537 : }
2538 10 : xfclose (fp, file);
2539 : }
2540 :
2541 0 : finish:
2542 13 : free (buf.buf);
2543 :
2544 13 : if (! output_file_created)
2545 : {
2546 : size_t i;
2547 0 : struct tempnode *node = temphead;
2548 0 : struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
2549 0 : for (i = 0; node; i++)
2550 : {
2551 0 : tempfiles[i].name = node->name;
2552 0 : tempfiles[i].pid = node->pid;
2553 0 : node = node->next;
2554 : }
2555 0 : merge (tempfiles, ntemps, ntemps, output_file);
2556 0 : free (tempfiles);
2557 : }
2558 13 : }
2559 :
2560 : /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
2561 :
2562 : static void
2563 8 : insertkey (struct keyfield *key_arg)
2564 : {
2565 : struct keyfield **p;
2566 8 : struct keyfield *key = xmemdup (key_arg, sizeof *key);
2567 :
2568 8 : for (p = &keylist; *p; p = &(*p)->next)
2569 0 : continue;
2570 8 : *p = key;
2571 8 : key->next = NULL;
2572 8 : }
2573 :
2574 : /* Report a bad field specification SPEC, with extra info MSGID. */
2575 :
2576 : static void badfieldspec (char const *, char const *)
2577 : ATTRIBUTE_NORETURN;
2578 : static void
2579 0 : badfieldspec (char const *spec, char const *msgid)
2580 : {
2581 0 : error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
2582 : _(msgid), quote (spec));
2583 0 : abort ();
2584 : }
2585 :
2586 : /* Report incompatible options. */
2587 :
2588 : static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
2589 : static void
2590 0 : incompatible_options (char const *opts)
2591 : {
2592 0 : error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);
2593 0 : abort ();
2594 : }
2595 :
2596 : /* Check compatibility of ordering options. */
2597 :
2598 : static void
2599 34 : check_ordering_compatibility (void)
2600 : {
2601 : struct keyfield const *key;
2602 :
2603 42 : for (key = keylist; key; key = key->next)
2604 16 : if ((1 < (key->random + key->numeric + key->general_numeric + key->month
2605 8 : + !!key->ignore))
2606 8 : || (key->random && key->translate))
2607 : {
2608 : char opts[7];
2609 0 : char *p = opts;
2610 0 : if (key->ignore == nondictionary)
2611 0 : *p++ = 'd';
2612 0 : if (key->translate)
2613 0 : *p++ = 'f';
2614 0 : if (key->general_numeric)
2615 0 : *p++ = 'g';
2616 0 : if (key->ignore == nonprinting)
2617 0 : *p++ = 'i';
2618 0 : if (key->month)
2619 0 : *p++ = 'M';
2620 0 : if (key->numeric)
2621 0 : *p++ = 'n';
2622 0 : if (key->random)
2623 0 : *p++ = 'R';
2624 0 : *p = '\0';
2625 0 : incompatible_options (opts);
2626 : }
2627 34 : }
2628 :
2629 : /* Parse the leading integer in STRING and store the resulting value
2630 : (which must fit into size_t) into *VAL. Return the address of the
2631 : suffix after the integer. If the value is too large, silently
2632 : substitute SIZE_MAX. If MSGID is NULL, return NULL after
2633 : failure; otherwise, report MSGID and exit on failure. */
2634 :
2635 : static char const *
2636 3 : parse_field_count (char const *string, size_t *val, char const *msgid)
2637 : {
2638 : char *suffix;
2639 : uintmax_t n;
2640 :
2641 3 : switch (xstrtoumax (string, &suffix, 10, &n, ""))
2642 : {
2643 0 : case LONGINT_OK:
2644 : case LONGINT_INVALID_SUFFIX_CHAR:
2645 0 : *val = n;
2646 0 : if (*val == n)
2647 0 : break;
2648 : /* Fall through. */
2649 : case LONGINT_OVERFLOW:
2650 : case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
2651 0 : *val = SIZE_MAX;
2652 0 : break;
2653 :
2654 3 : case LONGINT_INVALID:
2655 3 : if (msgid)
2656 0 : error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
2657 : _(msgid), quote (string));
2658 3 : return NULL;
2659 : }
2660 :
2661 0 : return suffix;
2662 : }
2663 :
2664 : /* Handle interrupts and hangups. */
2665 :
2666 : static void
2667 0 : sighandler (int sig)
2668 : {
2669 : if (! SA_NOCLDSTOP)
2670 : signal (sig, SIG_IGN);
2671 :
2672 0 : cleanup ();
2673 :
2674 0 : signal (sig, SIG_DFL);
2675 0 : raise (sig);
2676 0 : }
2677 :
2678 : /* Set the ordering options for KEY specified in S.
2679 : Return the address of the first character in S that
2680 : is not a valid ordering option.
2681 : BLANKTYPE is the kind of blanks that 'b' should skip. */
2682 :
2683 : static char *
2684 8 : set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)
2685 : {
2686 24 : while (*s)
2687 : {
2688 8 : switch (*s)
2689 : {
2690 8 : case 'b':
2691 8 : if (blanktype == bl_start || blanktype == bl_both)
2692 8 : key->skipsblanks = true;
2693 8 : if (blanktype == bl_end || blanktype == bl_both)
2694 8 : key->skipeblanks = true;
2695 8 : break;
2696 0 : case 'd':
2697 0 : key->ignore = nondictionary;
2698 0 : break;
2699 0 : case 'f':
2700 0 : key->translate = fold_toupper;
2701 0 : break;
2702 0 : case 'g':
2703 0 : key->general_numeric = true;
2704 0 : break;
2705 0 : case 'i':
2706 : /* Option order should not matter, so don't let -i override
2707 : -d. -d implies -i, but -i does not imply -d. */
2708 0 : if (! key->ignore)
2709 0 : key->ignore = nonprinting;
2710 0 : break;
2711 0 : case 'M':
2712 0 : key->month = true;
2713 0 : break;
2714 0 : case 'n':
2715 0 : key->numeric = true;
2716 0 : break;
2717 0 : case 'R':
2718 0 : key->random = true;
2719 0 : break;
2720 0 : case 'r':
2721 0 : key->reverse = true;
2722 0 : break;
2723 0 : default:
2724 0 : return (char *) s;
2725 : }
2726 8 : ++s;
2727 : }
2728 8 : return (char *) s;
2729 : }
2730 :
2731 : static struct keyfield *
2732 3 : key_init (struct keyfield *key)
2733 : {
2734 3 : memset (key, 0, sizeof *key);
2735 3 : key->eword = SIZE_MAX;
2736 3 : return key;
2737 : }
2738 :
2739 : int
2740 41 : main (int argc, char **argv)
2741 : {
2742 : struct keyfield *key;
2743 : struct keyfield key_buf;
2744 : struct keyfield gkey;
2745 : char const *s;
2746 41 : int c = 0;
2747 41 : char checkonly = 0;
2748 41 : bool mergeonly = false;
2749 41 : char *random_source = NULL;
2750 41 : bool need_random = false;
2751 41 : size_t nfiles = 0;
2752 41 : bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
2753 41 : bool obsolete_usage = (posix2_version () < 200112);
2754 : char **files;
2755 41 : char const *outfile = NULL;
2756 :
2757 : initialize_main (&argc, &argv);
2758 41 : program_name = argv[0];
2759 41 : setlocale (LC_ALL, "");
2760 : bindtextdomain (PACKAGE, LOCALEDIR);
2761 : textdomain (PACKAGE);
2762 :
2763 41 : initialize_exit_failure (SORT_FAILURE);
2764 :
2765 41 : hard_LC_COLLATE = hard_locale (LC_COLLATE);
2766 : #if HAVE_NL_LANGINFO
2767 : hard_LC_TIME = hard_locale (LC_TIME);
2768 : #endif
2769 :
2770 : /* Get locale's representation of the decimal point. */
2771 : {
2772 41 : struct lconv const *locale = localeconv ();
2773 :
2774 : /* If the locale doesn't define a decimal point, or if the decimal
2775 : point is multibyte, use the C locale's decimal point. FIXME:
2776 : add support for multibyte decimal points. */
2777 41 : decimal_point = to_uchar (locale->decimal_point[0]);
2778 41 : if (! decimal_point || locale->decimal_point[1])
2779 0 : decimal_point = '.';
2780 :
2781 : /* FIXME: add support for multibyte thousands separators. */
2782 41 : thousands_sep = to_uchar (*locale->thousands_sep);
2783 41 : if (! thousands_sep || locale->thousands_sep[1])
2784 41 : thousands_sep = -1;
2785 : }
2786 :
2787 41 : have_read_stdin = false;
2788 41 : inittables ();
2789 :
2790 : {
2791 : size_t i;
2792 : static int const sig[] =
2793 : {
2794 : /* The usual suspects. */
2795 : SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,
2796 : #ifdef SIGPOLL
2797 : SIGPOLL,
2798 : #endif
2799 : #ifdef SIGPROF
2800 : SIGPROF,
2801 : #endif
2802 : #ifdef SIGVTALRM
2803 : SIGVTALRM,
2804 : #endif
2805 : #ifdef SIGXCPU
2806 : SIGXCPU,
2807 : #endif
2808 : #ifdef SIGXFSZ
2809 : SIGXFSZ,
2810 : #endif
2811 : };
2812 : enum { nsigs = sizeof sig / sizeof sig[0] };
2813 :
2814 : #if SA_NOCLDSTOP
2815 : struct sigaction act;
2816 :
2817 41 : sigemptyset (&caught_signals);
2818 492 : for (i = 0; i < nsigs; i++)
2819 : {
2820 451 : sigaction (sig[i], NULL, &act);
2821 451 : if (act.sa_handler != SIG_IGN)
2822 451 : sigaddset (&caught_signals, sig[i]);
2823 : }
2824 :
2825 41 : act.sa_handler = sighandler;
2826 41 : act.sa_mask = caught_signals;
2827 41 : act.sa_flags = 0;
2828 :
2829 492 : for (i = 0; i < nsigs; i++)
2830 451 : if (sigismember (&caught_signals, sig[i]))
2831 451 : sigaction (sig[i], &act, NULL);
2832 : #else
2833 : for (i = 0; i < nsigs; i++)
2834 : if (signal (sig[i], SIG_IGN) != SIG_IGN)
2835 : {
2836 : signal (sig[i], sighandler);
2837 : siginterrupt (sig[i], 1);
2838 : }
2839 : #endif
2840 : }
2841 :
2842 : /* The signal mask is known, so it is safe to invoke exit_cleanup. */
2843 41 : atexit (exit_cleanup);
2844 :
2845 41 : gkey.sword = gkey.eword = SIZE_MAX;
2846 41 : gkey.ignore = NULL;
2847 41 : gkey.translate = NULL;
2848 41 : gkey.numeric = gkey.general_numeric = gkey.random = false;
2849 41 : gkey.month = gkey.reverse = false;
2850 41 : gkey.skipsblanks = gkey.skipeblanks = false;
2851 :
2852 41 : files = xnmalloc (argc, sizeof *files);
2853 :
2854 : for (;;)
2855 62 : {
2856 : /* Parse an operand as a file after "--" was seen; or if
2857 : pedantic and a file was seen, unless the POSIX version
2858 : predates 1003.1-2001 and -c was not seen and the operand is
2859 : "-o FILE" or "-oFILE". */
2860 103 : int oi = -1;
2861 :
2862 103 : if (c == -1
2863 76 : || (posixly_correct && nfiles != 0
2864 0 : && ! (obsolete_usage
2865 0 : && ! checkonly
2866 0 : && optind != argc
2867 0 : && argv[optind][0] == '-' && argv[optind][1] == 'o'
2868 0 : && (argv[optind][2] || optind + 1 != argc)))
2869 76 : || ((c = getopt_long (argc, argv, short_options,
2870 : long_options, &oi))
2871 : == -1))
2872 : {
2873 61 : if (argc <= optind)
2874 34 : break;
2875 27 : files[nfiles++] = argv[optind++];
2876 : }
2877 42 : else switch (c)
2878 : {
2879 27 : case 1:
2880 27 : key = NULL;
2881 27 : if (optarg[0] == '+')
2882 : {
2883 26 : bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
2884 16 : && ISDIGIT (argv[optind][1]));
2885 9 : obsolete_usage |= minus_pos_usage & ~posixly_correct;
2886 9 : if (obsolete_usage)
2887 : {
2888 : /* Treat +POS1 [-POS2] as a key if possible; but silently
2889 : treat an operand as a file if it is not a valid +POS1. */
2890 3 : key = key_init (&key_buf);
2891 3 : s = parse_field_count (optarg + 1, &key->sword, NULL);
2892 3 : if (s && *s == '.')
2893 0 : s = parse_field_count (s + 1, &key->schar, NULL);
2894 3 : if (! (key->sword | key->schar))
2895 3 : key->sword = SIZE_MAX;
2896 3 : if (! s || *set_ordering (s, key, bl_start))
2897 3 : key = NULL;
2898 : else
2899 : {
2900 0 : if (minus_pos_usage)
2901 : {
2902 0 : char const *optarg1 = argv[optind++];
2903 0 : s = parse_field_count (optarg1 + 1, &key->eword,
2904 : N_("invalid number after `-'"));
2905 0 : if (*s == '.')
2906 0 : s = parse_field_count (s + 1, &key->echar,
2907 : N_("invalid number after `.'"));
2908 0 : if (*set_ordering (s, key, bl_end))
2909 0 : badfieldspec (optarg1,
2910 : N_("stray character in field spec"));
2911 : }
2912 0 : insertkey (key);
2913 : }
2914 : }
2915 : }
2916 27 : if (! key)
2917 27 : files[nfiles++] = optarg;
2918 27 : break;
2919 :
2920 0 : case SORT_OPTION:
2921 0 : c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
2922 : /* Fall through. */
2923 8 : case 'b':
2924 : case 'd':
2925 : case 'f':
2926 : case 'g':
2927 : case 'i':
2928 : case 'M':
2929 : case 'n':
2930 : case 'r':
2931 : case 'R':
2932 : {
2933 : char str[2];
2934 8 : str[0] = c;
2935 8 : str[1] = '\0';
2936 8 : set_ordering (str, &gkey, bl_both);
2937 : }
2938 8 : break;
2939 :
2940 0 : case CHECK_OPTION:
2941 0 : c = (optarg
2942 0 : ? XARGMATCH ("--check", optarg, check_args, check_types)
2943 0 : : 'c');
2944 : /* Fall through. */
2945 0 : case 'c':
2946 : case 'C':
2947 0 : if (checkonly && checkonly != c)
2948 0 : incompatible_options ("cC");
2949 0 : checkonly = c;
2950 0 : break;
2951 :
2952 0 : case COMPRESS_PROGRAM_OPTION:
2953 0 : if (compress_program && !STREQ (compress_program, optarg))
2954 0 : error (SORT_FAILURE, 0, _("multiple compress programs specified"));
2955 0 : compress_program = optarg;
2956 0 : break;
2957 :
2958 0 : case 'k':
2959 0 : key = key_init (&key_buf);
2960 :
2961 : /* Get POS1. */
2962 0 : s = parse_field_count (optarg, &key->sword,
2963 : N_("invalid number at field start"));
2964 0 : if (! key->sword--)
2965 : {
2966 : /* Provoke with `sort -k0' */
2967 0 : badfieldspec (optarg, N_("field number is zero"));
2968 : }
2969 0 : if (*s == '.')
2970 : {
2971 0 : s = parse_field_count (s + 1, &key->schar,
2972 : N_("invalid number after `.'"));
2973 0 : if (! key->schar--)
2974 : {
2975 : /* Provoke with `sort -k1.0' */
2976 0 : badfieldspec (optarg, N_("character offset is zero"));
2977 : }
2978 : }
2979 0 : if (! (key->sword | key->schar))
2980 0 : key->sword = SIZE_MAX;
2981 0 : s = set_ordering (s, key, bl_start);
2982 0 : if (*s != ',')
2983 : {
2984 0 : key->eword = SIZE_MAX;
2985 0 : key->echar = 0;
2986 : }
2987 : else
2988 : {
2989 : /* Get POS2. */
2990 0 : s = parse_field_count (s + 1, &key->eword,
2991 : N_("invalid number after `,'"));
2992 0 : if (! key->eword--)
2993 : {
2994 : /* Provoke with `sort -k1,0' */
2995 0 : badfieldspec (optarg, N_("field number is zero"));
2996 : }
2997 0 : if (*s == '.')
2998 0 : s = parse_field_count (s + 1, &key->echar,
2999 : N_("invalid number after `.'"));
3000 : else
3001 : {
3002 : /* `-k 2,3' is equivalent to `+1 -3'. */
3003 0 : key->eword++;
3004 : }
3005 0 : s = set_ordering (s, key, bl_end);
3006 : }
3007 0 : if (*s)
3008 0 : badfieldspec (optarg, N_("stray character in field spec"));
3009 0 : insertkey (key);
3010 0 : break;
3011 :
3012 0 : case 'm':
3013 0 : mergeonly = true;
3014 0 : break;
3015 :
3016 0 : case 'o':
3017 0 : if (outfile && !STREQ (outfile, optarg))
3018 0 : error (SORT_FAILURE, 0, _("multiple output files specified"));
3019 0 : outfile = optarg;
3020 0 : break;
3021 :
3022 0 : case RANDOM_SOURCE_OPTION:
3023 0 : if (random_source && !STREQ (random_source, optarg))
3024 0 : error (SORT_FAILURE, 0, _("multiple random sources specified"));
3025 0 : random_source = optarg;
3026 0 : break;
3027 :
3028 0 : case 's':
3029 0 : stable = true;
3030 0 : break;
3031 :
3032 0 : case 'S':
3033 0 : specify_sort_size (oi, c, optarg);
3034 0 : break;
3035 :
3036 0 : case 't':
3037 : {
3038 0 : char newtab = optarg[0];
3039 0 : if (! newtab)
3040 0 : error (SORT_FAILURE, 0, _("empty tab"));
3041 0 : if (optarg[1])
3042 : {
3043 0 : if (STREQ (optarg, "\\0"))
3044 0 : newtab = '\0';
3045 : else
3046 : {
3047 : /* Provoke with `sort -txx'. Complain about
3048 : "multi-character tab" instead of "multibyte tab", so
3049 : that the diagnostic's wording does not need to be
3050 : changed once multibyte characters are supported. */
3051 0 : error (SORT_FAILURE, 0, _("multi-character tab %s"),
3052 : quote (optarg));
3053 : }
3054 : }
3055 0 : if (tab != TAB_DEFAULT && tab != newtab)
3056 0 : error (SORT_FAILURE, 0, _("incompatible tabs"));
3057 0 : tab = newtab;
3058 : }
3059 0 : break;
3060 :
3061 0 : case 'T':
3062 0 : add_temp_dir (optarg);
3063 0 : break;
3064 :
3065 0 : case 'u':
3066 0 : unique = true;
3067 0 : break;
3068 :
3069 0 : case 'y':
3070 : /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
3071 : through Solaris 7. It is also accepted by many non-Solaris
3072 : "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
3073 : -y is marked as obsolete starting with Solaris 8 (1999), but is
3074 : still accepted as of Solaris 10 prerelease (2004).
3075 :
3076 : Solaris 2.5.1 "sort -y 100" reads the input file "100", but
3077 : emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
3078 : and which in general ignores the argument after "-y" if it
3079 : consists entirely of digits (it can even be empty). */
3080 0 : if (optarg == argv[optind - 1])
3081 : {
3082 : char const *p;
3083 0 : for (p = optarg; ISDIGIT (*p); p++)
3084 0 : continue;
3085 0 : optind -= (*p != '\0');
3086 : }
3087 0 : break;
3088 :
3089 0 : case 'z':
3090 0 : eolchar = 0;
3091 0 : break;
3092 :
3093 0 : case_GETOPT_HELP_CHAR;
3094 :
3095 0 : case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
3096 :
3097 7 : default:
3098 7 : usage (SORT_FAILURE);
3099 : }
3100 : }
3101 :
3102 : /* Inheritance of global options to individual keys. */
3103 34 : for (key = keylist; key; key = key->next)
3104 : {
3105 0 : if (! (key->ignore || key->translate
3106 0 : || (key->skipsblanks | key->reverse
3107 0 : | key->skipeblanks | key->month | key->numeric
3108 0 : | key->general_numeric
3109 0 : | key->random)))
3110 : {
3111 0 : key->ignore = gkey.ignore;
3112 0 : key->translate = gkey.translate;
3113 0 : key->skipsblanks = gkey.skipsblanks;
3114 0 : key->skipeblanks = gkey.skipeblanks;
3115 0 : key->month = gkey.month;
3116 0 : key->numeric = gkey.numeric;
3117 0 : key->general_numeric = gkey.general_numeric;
3118 0 : key->random = gkey.random;
3119 0 : key->reverse = gkey.reverse;
3120 : }
3121 :
3122 0 : need_random |= key->random;
3123 : }
3124 :
3125 34 : if (!keylist && (gkey.ignore || gkey.translate
3126 68 : || (gkey.skipsblanks | gkey.skipeblanks | gkey.month
3127 34 : | gkey.numeric | gkey.general_numeric
3128 34 : | gkey.random)))
3129 : {
3130 8 : insertkey (&gkey);
3131 8 : need_random |= gkey.random;
3132 : }
3133 :
3134 34 : check_ordering_compatibility ();
3135 :
3136 34 : reverse = gkey.reverse;
3137 :
3138 34 : if (need_random)
3139 : {
3140 0 : randread_source = randread_new (random_source, MD5_DIGEST_SIZE);
3141 0 : if (! randread_source)
3142 0 : die (_("open failed"), random_source);
3143 : }
3144 :
3145 34 : if (temp_dir_count == 0)
3146 : {
3147 34 : char const *tmp_dir = getenv ("TMPDIR");
3148 34 : add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
3149 : }
3150 :
3151 34 : if (nfiles == 0)
3152 : {
3153 : static char *minus = "-";
3154 3 : nfiles = 1;
3155 3 : free (files);
3156 3 : files = −
3157 : }
3158 :
3159 34 : if (checkonly)
3160 : {
3161 0 : if (nfiles > 1)
3162 0 : error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
3163 0 : quote (files[1]), checkonly);
3164 :
3165 0 : if (outfile)
3166 : {
3167 : static char opts[] = {0, 'o', 0};
3168 0 : opts[0] = checkonly;
3169 0 : incompatible_options (opts);
3170 : }
3171 :
3172 : /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
3173 : input is not properly sorted. */
3174 0 : exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
3175 : }
3176 :
3177 34 : if (mergeonly)
3178 : {
3179 0 : struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);
3180 : size_t i;
3181 :
3182 0 : for (i = 0; i < nfiles; ++i)
3183 0 : sortfiles[i].name = files[i];
3184 :
3185 0 : merge (sortfiles, 0, nfiles, outfile);
3186 : IF_LINT (free (sortfiles));
3187 : }
3188 : else
3189 34 : sort (files, nfiles, outfile);
3190 :
3191 13 : if (have_read_stdin && fclose (stdin) == EOF)
3192 0 : die (_("close failed"), "-");
3193 :
3194 13 : exit (EXIT_SUCCESS);
3195 : }
|