Line data Source code
1 : /* uniq -- remove duplicate lines from a sorted file
2 : Copyright (C) 86, 91, 1995-2007 Free Software Foundation, Inc.
3 :
4 : This program is free software: you can redistribute it and/or modify
5 : it under the terms of the GNU General Public License as published by
6 : the Free Software Foundation, either version 3 of the License, or
7 : (at your option) any later version.
8 :
9 : This program is distributed in the hope that it will be useful,
10 : but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : GNU General Public License for more details.
13 :
14 : You should have received a copy of the GNU General Public License
15 : along with this program. If not, see <http://www.gnu.org/licenses/>. */
16 :
17 : /* Written by Richard Stallman and David MacKenzie. */
18 :
19 : #include <config.h>
20 :
21 : #include <stdio.h>
22 : #include <getopt.h>
23 : #include <sys/types.h>
24 :
25 : #include "system.h"
26 : #include "argmatch.h"
27 : #include "linebuffer.h"
28 : #include "error.h"
29 : #include "hard-locale.h"
30 : #include "posixver.h"
31 : #include "quote.h"
32 : #include "xmemcoll.h"
33 : #include "xstrtol.h"
34 : #include "memcasecmp.h"
35 :
/* Official program name, without any installation prefix such as `g'.  */
#define PROGRAM_NAME "uniq"

/* Author list, as a comma-separated sequence of string literals.  */
#define AUTHORS "Richard Stallman", "David MacKenzie"

/* Exchange the two `struct linebuffer *' lvalues A and B.
   Both arguments are evaluated more than once, so they must be free
   of side effects.  */
#define SWAP_LINES(A, B)			\
  do						\
    {						\
      struct linebuffer *swap_tmp = (A);	\
      (A) = (B);				\
      (B) = swap_tmp;				\
    }						\
  while (0)
50 :
/* Name under which this program was invoked (argv[0]).  */
char *program_name;

/* Result of hard_locale (LC_COLLATE); when true, lines are compared
   with xmemcoll rather than memcmp.  */
static bool hard_LC_COLLATE;

/* Comparison window.  Fields are skipped first, then characters, and
   at most CHECK_CHARS characters of what remains are compared.  */
static size_t skip_fields;	/* -f: number of leading fields to skip.  */
static size_t skip_chars;	/* -s: number of chars to skip after the fields.  */
static size_t check_chars;	/* -w: number of chars to compare.  */

/* If true, ignore case when comparing (-i).  */
static bool ignore_case;

/* Whether a repeat count is printed in front of each output line.  */
enum countmode
{
  count_occurrences,	/* -c  Print count before output lines.  */
  count_none		/* Default.  Do not print counts.  */
};

/* Whether and how to precede the output lines with a count of the
   number of times they occurred in the input.  */
static enum countmode countmode;

/* Which lines to output: unique lines, the first of a group of
   repeated lines, and the second and subsequent lines of a group of
   repeated lines.  */
static bool output_unique;
static bool output_first_repeated;
static bool output_later_repeated;
85 :
/* How groups of duplicate lines are delimited in --all-repeated output.  */
enum delimit_method
{
  DM_NONE,	/* --all-repeated[=none]: no delimiters are output.  */
  DM_PREPEND,	/* --all-repeated=prepend: a delimiter precedes every group.  */
  DM_SEPARATE	/* --all-repeated=separate: delimiters go between groups.  */
};

/* Argument spellings accepted by --all-repeated, NULL-terminated.
   Entry I corresponds to delimit_method_map[I].  */
static char const *const delimit_method_string[] =
{
  "none",
  "prepend",
  "separate",
  NULL
};

/* Method selected by the spelling at the same index in
   delimit_method_string.  */
static enum delimit_method const delimit_method_map[] =
{
  DM_NONE,
  DM_PREPEND,
  DM_SEPARATE
};

/* Select whether/how to delimit groups of duplicate lines.  */
static enum delimit_method delimit_groups;
110 :
111 : static struct option const longopts[] =
112 : {
113 : {"count", no_argument, NULL, 'c'},
114 : {"repeated", no_argument, NULL, 'd'},
115 : {"all-repeated", optional_argument, NULL, 'D'},
116 : {"ignore-case", no_argument, NULL, 'i'},
117 : {"unique", no_argument, NULL, 'u'},
118 : {"skip-fields", required_argument, NULL, 'f'},
119 : {"skip-chars", required_argument, NULL, 's'},
120 : {"check-chars", required_argument, NULL, 'w'},
121 : {"zero-terminated", no_argument, NULL, 'z'},
122 : {GETOPT_HELP_OPTION_DECL},
123 : {GETOPT_VERSION_OPTION_DECL},
124 : {NULL, 0, NULL, 0}
125 : };
126 :
/* Print usage information and exit with STATUS.
   When STATUS is not EXIT_SUCCESS, only a one-line "Try --help" hint
   is written to stderr.  Otherwise the full help text is written to
   stdout, followed by emit_bug_reporting_address ().
   This function does not return: it always ends in exit (STATUS).  */
127 : void
128 29 : usage (int status)
129 : {
130 29 : if (status != EXIT_SUCCESS)
131 28 : fprintf (stderr, _("Try `%s --help' for more information.\n"),
132 : program_name);
133 : else
134 : {
135 1 : printf (_("\
136 : Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
137 : "),
138 : program_name);
139 1 : fputs (_("\
140 : Discard all but one of successive identical lines from INPUT (or\n\
141 : standard input), writing to OUTPUT (or standard output).\n\
142 : \n\
143 : "), stdout);
144 1 : fputs (_("\
145 : Mandatory arguments to long options are mandatory for short options too.\n\
146 : "), stdout);
147 1 : fputs (_("\
148 : -c, --count prefix lines by the number of occurrences\n\
149 : -d, --repeated only print duplicate lines\n\
150 : "), stdout);
151 1 : fputs (_("\
152 : -D, --all-repeated[=delimit-method] print all duplicate lines\n\
153 : delimit-method={none(default),prepend,separate}\n\
154 : Delimiting is done with blank lines.\n\
155 : -f, --skip-fields=N avoid comparing the first N fields\n\
156 : -i, --ignore-case ignore differences in case when comparing\n\
157 : -s, --skip-chars=N avoid comparing the first N characters\n\
158 : -u, --unique only print unique lines\n\
159 : -z, --zero-terminated end lines with 0 byte, not newline\n\
160 : "), stdout);
161 1 : fputs (_("\
162 : -w, --check-chars=N compare no more than N characters in lines\n\
163 : "), stdout);
164 1 : fputs (HELP_OPTION_DESCRIPTION, stdout);
165 1 : fputs (VERSION_OPTION_DESCRIPTION, stdout);
166 1 : fputs (_("\
167 : \n\
168 : A field is a run of whitespace, then non-whitespace characters.\n\
169 : Fields are skipped before chars.\n\
170 : "), stdout);
171 1 : fputs (_("\
172 : \n\
173 : Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
174 : You may want to sort the input first, or use `sort -u' without `uniq'.\n\
175 : "), stdout);
176 1 : emit_bug_reporting_address ();
177 : }
/* Both branches end here; STATUS becomes the process exit code.  */
178 29 : exit (status);
179 : }
180 :
181 : /* Convert OPT to size_t, reporting an error using MSGID if OPT is
182 : invalid. Silently convert too-large values to SIZE_MAX. */
183 :
184 : static size_t
185 15 : size_opt (char const *opt, char const *msgid)
186 : {
187 : unsigned long int size;
188 : verify (SIZE_MAX <= ULONG_MAX);
189 :
190 15 : switch (xstrtoul (opt, NULL, 10, &size, ""))
191 : {
192 4 : case LONGINT_OK:
193 : case LONGINT_OVERFLOW:
194 4 : break;
195 :
196 11 : default:
197 11 : error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
198 : }
199 :
200 4 : return MIN (size, SIZE_MAX);
201 : }
202 :
203 : /* Given a linebuffer LINE,
204 : return a pointer to the beginning of the line's field to be compared. */
205 :
206 : static char *
207 340 : find_field (const struct linebuffer *line)
208 : {
209 : size_t count;
210 340 : char *lp = line->buffer;
211 340 : size_t size = line->length - 1;
212 340 : size_t i = 0;
213 :
214 349 : for (count = 0; count < skip_fields && i < size; count++)
215 : {
216 22 : while (i < size && isblank (lp[i]))
217 4 : i++;
218 25 : while (i < size && !isblank (lp[i]))
219 7 : i++;
220 : }
221 :
222 340 : for (count = 0; count < skip_chars && i < size; count++)
223 0 : i++;
224 :
225 340 : return lp + i;
226 : }
227 :
228 : /* Return false if two strings OLD and NEW match, true if not.
229 : OLD and NEW point not to the beginnings of the lines
230 : but rather to the beginnings of the fields to compare.
231 : OLDLEN and NEWLEN are their lengths. */
232 :
233 : static bool
234 293 : different (char *old, char *new, size_t oldlen, size_t newlen)
235 : {
236 293 : if (check_chars < oldlen)
237 0 : oldlen = check_chars;
238 293 : if (check_chars < newlen)
239 0 : newlen = check_chars;
240 :
241 293 : if (ignore_case)
242 : {
243 : /* FIXME: This should invoke strcoll somehow. */
244 21 : return oldlen != newlen || memcasecmp (old, new, oldlen);
245 : }
246 272 : else if (hard_LC_COLLATE)
247 0 : return xmemcoll (old, oldlen, new, newlen) != 0;
248 : else
249 272 : return oldlen != newlen || memcmp (old, new, oldlen);
250 : }
251 :
252 : /* Output the line in linebuffer LINE to standard output
253 : provided that the switches say it should be output.
254 : MATCH is true if the line matches the previous line.
255 : If requested, print the number of times it occurred, as well;
256 : LINECOUNT + 1 is the number of times that the line occurred. */
257 :
258 : static void
259 83 : writeline (struct linebuffer const *line,
260 : bool match, uintmax_t linecount)
261 : {
262 159 : if (! (linecount == 0 ? output_unique
263 76 : : !match ? output_first_repeated
264 : : output_later_repeated))
265 7 : return;
266 :
267 76 : if (countmode == count_occurrences)
268 4 : printf ("%7" PRIuMAX " ", linecount + 1);
269 :
270 76 : fwrite (line->buffer, sizeof (char), line->length, stdout);
271 : }
272 :
273 : /* Process input file INFILE with output to OUTFILE.
274 : If either is "-", use the standard I/O stream for it instead. */
275 :
276 : static void
277 54 : check_file (const char *infile, const char *outfile, char delimiter)
278 : {
279 : struct linebuffer lb1, lb2;
280 : struct linebuffer *thisline, *prevline;
281 :
282 54 : if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
283 6 : error (EXIT_FAILURE, errno, "%s", infile);
284 48 : if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
285 0 : error (EXIT_FAILURE, errno, "%s", outfile);
286 :
287 48 : thisline = &lb1;
288 48 : prevline = &lb2;
289 :
290 48 : initbuffer (thisline);
291 48 : initbuffer (prevline);
292 :
293 : /* The duplication in the following `if' and `else' blocks is an
294 : optimization to distinguish the common case (in which none of
295 : the following options has been specified: --count, -repeated,
296 : --all-repeated, --unique) from the others. In the common case,
297 : this optimization lets uniq output each different line right away,
298 : without waiting to see if the next one is different. */
299 :
300 48 : if (output_unique && output_first_repeated && countmode == count_none)
301 31 : {
302 : char *prevfield IF_LINT (= NULL);
303 : size_t prevlen IF_LINT (= 0);
304 :
305 272 : while (!feof (stdin))
306 : {
307 : char *thisfield;
308 : size_t thislen;
309 240 : if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
310 30 : break;
311 210 : thisfield = find_field (thisline);
312 210 : thislen = thisline->length - 1 - (thisfield - thisline->buffer);
313 210 : if (prevline->length == 0
314 180 : || different (thisfield, prevfield, thislen, prevlen))
315 : {
316 41 : fwrite (thisline->buffer, sizeof (char),
317 : thisline->length, stdout);
318 :
319 41 : SWAP_LINES (prevline, thisline);
320 41 : prevfield = thisfield;
321 41 : prevlen = thislen;
322 : }
323 : }
324 : }
325 : else
326 : {
327 : char *prevfield;
328 : size_t prevlen;
329 17 : uintmax_t match_count = 0;
330 17 : bool first_delimiter = true;
331 :
332 17 : if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
333 0 : goto closefiles;
334 17 : prevfield = find_field (prevline);
335 17 : prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
336 :
337 147 : while (!feof (stdin))
338 : {
339 : bool match;
340 : char *thisfield;
341 : size_t thislen;
342 129 : if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
343 : {
344 16 : if (ferror (stdin))
345 0 : goto closefiles;
346 16 : break;
347 : }
348 113 : thisfield = find_field (thisline);
349 113 : thislen = thisline->length - 1 - (thisfield - thisline->buffer);
350 113 : match = !different (thisfield, prevfield, thislen, prevlen);
351 113 : match_count += match;
352 :
353 113 : if (match_count == UINTMAX_MAX)
354 : {
355 : if (count_occurrences)
356 : error (EXIT_FAILURE, 0, _("too many repeated lines"));
357 0 : match_count--;
358 : }
359 :
360 113 : if (delimit_groups != DM_NONE)
361 : {
362 39 : if (!match)
363 : {
364 5 : if (match_count) /* a previous match */
365 2 : first_delimiter = false; /* Only used when DM_SEPARATE */
366 : }
367 34 : else if (match_count == 1)
368 : {
369 8 : if ((delimit_groups == DM_PREPEND)
370 3 : || (delimit_groups == DM_SEPARATE
371 3 : && !first_delimiter))
372 6 : putchar (delimiter);
373 : }
374 : }
375 :
376 113 : if (!match || output_later_repeated)
377 : {
378 66 : writeline (prevline, match, match_count);
379 66 : SWAP_LINES (prevline, thisline);
380 66 : prevfield = thisfield;
381 66 : prevlen = thislen;
382 66 : if (!match)
383 11 : match_count = 0;
384 : }
385 : }
386 :
387 17 : writeline (prevline, false, match_count);
388 : }
389 :
390 48 : closefiles:
391 48 : if (ferror (stdin) || fclose (stdin) != 0)
392 2 : error (EXIT_FAILURE, 0, _("error reading %s"), infile);
393 :
394 : /* stdout is handled via the atexit-invoked close_stdout function. */
395 :
396 46 : free (lb1.buffer);
397 46 : free (lb2.buffer);
398 46 : }
399 :
/* Which syntax, if any, most recently set the number of fields to
   skip.  main uses this to decide whether an obsolete digit option
   extends the current number or starts a fresh one.  */
enum Skip_field_option_type
{
  SFO_NONE,	/* No skip-fields option seen yet.  */
  SFO_OBSOLETE,	/* Set digit by digit through the obsolete -N form.  */
  SFO_NEW	/* Set through -f N / --skip-fields=N.  */
};
406 :
407 : int
408 96 : main (int argc, char **argv)
409 : {
410 96 : int optc = 0;
411 96 : bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
412 96 : enum Skip_field_option_type skip_field_option_type = SFO_NONE;
413 96 : int nfiles = 0;
414 : char const *file[2];
415 96 : char delimiter = '\n'; /* change with --zero-terminated, -z */
416 :
417 96 : file[0] = file[1] = "-";
418 : initialize_main (&argc, &argv);
419 96 : program_name = argv[0];
420 96 : setlocale (LC_ALL, "");
421 : bindtextdomain (PACKAGE, LOCALEDIR);
422 : textdomain (PACKAGE);
423 96 : hard_LC_COLLATE = hard_locale (LC_COLLATE);
424 :
425 96 : atexit (close_stdout);
426 :
427 96 : skip_chars = 0;
428 96 : skip_fields = 0;
429 96 : check_chars = SIZE_MAX;
430 96 : output_unique = output_first_repeated = true;
431 96 : output_later_repeated = false;
432 96 : countmode = count_none;
433 96 : delimit_groups = DM_NONE;
434 :
435 : for (;;)
436 : {
437 : /* Parse an operand with leading "+" as a file after "--" was
438 : seen; or if pedantic and a file was seen; or if not
439 : obsolete. */
440 :
441 336 : if (optc == -1
442 194 : || (posixly_correct && nfiles != 0)
443 194 : || ((optc = getopt_long (argc, argv,
444 : "-0123456789Dcdf:is:uw:z", longopts, NULL))
445 : == -1))
446 : {
447 77 : if (argc <= optind)
448 55 : break;
449 22 : if (nfiles == 2)
450 : {
451 0 : error (0, 0, _("extra operand %s"), quote (argv[optind]));
452 0 : usage (EXIT_FAILURE);
453 : }
454 22 : file[nfiles++] = argv[optind++];
455 : }
456 139 : else switch (optc)
457 : {
458 70 : case 1:
459 : {
460 : unsigned long int size;
461 70 : if (optarg[0] == '+'
462 32 : && posix2_version () < 200112
463 0 : && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
464 : && size <= SIZE_MAX)
465 0 : skip_chars = size;
466 70 : else if (nfiles == 2)
467 : {
468 16 : error (0, 0, _("extra operand %s"), quote (optarg));
469 16 : usage (EXIT_FAILURE);
470 : }
471 : else
472 54 : file[nfiles++] = optarg;
473 : }
474 54 : break;
475 :
476 15 : case '0':
477 : case '1':
478 : case '2':
479 : case '3':
480 : case '4':
481 : case '5':
482 : case '6':
483 : case '7':
484 : case '8':
485 : case '9':
486 : {
487 15 : if (skip_field_option_type == SFO_NEW)
488 1 : skip_fields = 0;
489 :
490 15 : if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
491 0 : skip_fields = SIZE_MAX;
492 :
493 15 : skip_field_option_type = SFO_OBSOLETE;
494 : }
495 15 : break;
496 :
497 4 : case 'c':
498 4 : countmode = count_occurrences;
499 4 : break;
500 :
501 5 : case 'd':
502 5 : output_unique = false;
503 5 : break;
504 :
505 11 : case 'D':
506 11 : output_unique = false;
507 11 : output_later_repeated = true;
508 11 : if (optarg == NULL)
509 2 : delimit_groups = DM_NONE;
510 : else
511 9 : delimit_groups = XARGMATCH ("--all-repeated", optarg,
512 : delimit_method_string,
513 : delimit_method_map);
514 10 : break;
515 :
516 4 : case 'f':
517 4 : skip_field_option_type = SFO_NEW;
518 4 : skip_fields = size_opt (optarg,
519 : N_("invalid number of fields to skip"));
520 2 : break;
521 :
522 5 : case 'i':
523 5 : ignore_case = true;
524 5 : break;
525 :
526 2 : case 's':
527 2 : skip_chars = size_opt (optarg,
528 : N_("invalid number of bytes to skip"));
529 1 : break;
530 :
531 1 : case 'u':
532 1 : output_first_repeated = false;
533 1 : break;
534 :
535 9 : case 'w':
536 9 : check_chars = size_opt (optarg,
537 : N_("invalid number of bytes to compare"));
538 1 : break;
539 :
540 0 : case 'z':
541 0 : delimiter = '\0';
542 0 : break;
543 :
544 1 : case_GETOPT_HELP_CHAR;
545 :
546 2 : case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
547 :
548 10 : default:
549 10 : usage (EXIT_FAILURE);
550 : }
551 : }
552 :
553 55 : if (countmode == count_occurrences && output_later_repeated)
554 : {
555 1 : error (0, 0,
556 : _("printing all duplicated lines and repeat counts is meaningless"));
557 1 : usage (EXIT_FAILURE);
558 : }
559 :
560 54 : check_file (file[0], file[1], delimiter);
561 :
562 46 : exit (EXIT_SUCCESS);
563 : }
|