LCOV - code coverage report
Current view: top level - src - uniq.c (source / functions) Hit Total Coverage
Test: coreutils.info Lines: 183 199 92.0 %
Date: 2018-01-30 Functions: 7 7 100.0 %

          Line data    Source code
       1             : /* uniq -- remove duplicate lines from a sorted file
       2             :    Copyright (C) 86, 91, 1995-2007 Free Software Foundation, Inc.
       3             : 
       4             :    This program is free software: you can redistribute it and/or modify
       5             :    it under the terms of the GNU General Public License as published by
       6             :    the Free Software Foundation, either version 3 of the License, or
       7             :    (at your option) any later version.
       8             : 
       9             :    This program is distributed in the hope that it will be useful,
      10             :    but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12             :    GNU General Public License for more details.
      13             : 
      14             :    You should have received a copy of the GNU General Public License
      15             :    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
      16             : 
      17             : /* Written by Richard Stallman and David MacKenzie. */
      18             : 
      19             : #include <config.h>
      20             : 
      21             : #include <stdio.h>
      22             : #include <getopt.h>
      23             : #include <sys/types.h>
      24             : 
      25             : #include "system.h"
      26             : #include "argmatch.h"
      27             : #include "linebuffer.h"
      28             : #include "error.h"
      29             : #include "hard-locale.h"
      30             : #include "posixver.h"
      31             : #include "quote.h"
      32             : #include "xmemcoll.h"
      33             : #include "xstrtol.h"
      34             : #include "memcasecmp.h"
      35             : 
      36             : /* The official name of this program (e.g., no `g' prefix).  */
      37             : #define PROGRAM_NAME "uniq"
      38             : 
      39             : #define AUTHORS "Richard Stallman", "David MacKenzie"
      40             : 
      41             : #define SWAP_LINES(A, B)                        \
      42             :   do                                            \
      43             :     {                                           \
      44             :       struct linebuffer *_tmp;                  \
      45             :       _tmp = (A);                               \
      46             :       (A) = (B);                                \
      47             :       (B) = _tmp;                               \
      48             :     }                                           \
      49             :   while (0)
      50             : 
      51             : /* The name this program was run with. */
      52             : char *program_name;
      53             : 
      54             : /* True if the LC_COLLATE locale is hard.  */
      55             : static bool hard_LC_COLLATE;
      56             : 
      57             : /* Number of fields to skip on each line when doing comparisons. */
      58             : static size_t skip_fields;
      59             : 
      60             : /* Number of chars to skip after skipping any fields. */
      61             : static size_t skip_chars;
      62             : 
      63             : /* Number of chars to compare. */
      64             : static size_t check_chars;
      65             : 
      66             : enum countmode
      67             : {
      68             :   count_occurrences,            /* -c Print count before output lines. */
      69             :   count_none                    /* Default.  Do not print counts. */
      70             : };
      71             : 
      72             : /* Whether and how to precede the output lines with a count of the number of
      73             :    times they occurred in the input. */
      74             : static enum countmode countmode;
      75             : 
      76             : /* Which lines to output: unique lines, the first of a group of
      77             :    repeated lines, and the second and subsequent lines of a group of
      78             :    repeated lines.  */
      79             : static bool output_unique;
      80             : static bool output_first_repeated;
      81             : static bool output_later_repeated;
      82             : 
      83             : /* If true, ignore case when comparing.  */
      84             : static bool ignore_case;
      85             : 
      86             : enum delimit_method
      87             : {
      88             :   /* No delimiters output.  --all-repeated[=none] */
      89             :   DM_NONE,
      90             : 
      91             :   /* Delimiter precedes all groups.  --all-repeated=prepend */
      92             :   DM_PREPEND,
      93             : 
      94             :   /* Delimit all groups.  --all-repeated=separate */
      95             :   DM_SEPARATE
      96             : };
      97             : 
      98             : static char const *const delimit_method_string[] =
      99             : {
     100             :   "none", "prepend", "separate", NULL
     101             : };
     102             : 
     103             : static enum delimit_method const delimit_method_map[] =
     104             : {
     105             :   DM_NONE, DM_PREPEND, DM_SEPARATE
     106             : };
     107             : 
     108             : /* Select whether/how to delimit groups of duplicate lines.  */
     109             : static enum delimit_method delimit_groups;
     110             : 
     111             : static struct option const longopts[] =
     112             : {
     113             :   {"count", no_argument, NULL, 'c'},
     114             :   {"repeated", no_argument, NULL, 'd'},
     115             :   {"all-repeated", optional_argument, NULL, 'D'},
     116             :   {"ignore-case", no_argument, NULL, 'i'},
     117             :   {"unique", no_argument, NULL, 'u'},
     118             :   {"skip-fields", required_argument, NULL, 'f'},
     119             :   {"skip-chars", required_argument, NULL, 's'},
     120             :   {"check-chars", required_argument, NULL, 'w'},
     121             :   {"zero-terminated", no_argument, NULL, 'z'},
     122             :   {GETOPT_HELP_OPTION_DECL},
     123             :   {GETOPT_VERSION_OPTION_DECL},
     124             :   {NULL, 0, NULL, 0}
     125             : };
     126             : 
     127             : void
     128          29 : usage (int status)
     129             : {
     130          29 :   if (status != EXIT_SUCCESS)
     131          28 :     fprintf (stderr, _("Try `%s --help' for more information.\n"),
     132             :              program_name);
     133             :   else
     134             :     {
     135           1 :       printf (_("\
     136             : Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
     137             : "),
     138             :               program_name);
     139           1 :       fputs (_("\
     140             : Discard all but one of successive identical lines from INPUT (or\n\
     141             : standard input), writing to OUTPUT (or standard output).\n\
     142             : \n\
     143             : "), stdout);
     144           1 :      fputs (_("\
     145             : Mandatory arguments to long options are mandatory for short options too.\n\
     146             : "), stdout);
     147           1 :      fputs (_("\
     148             :   -c, --count           prefix lines by the number of occurrences\n\
     149             :   -d, --repeated        only print duplicate lines\n\
     150             : "), stdout);
     151           1 :      fputs (_("\
     152             :   -D, --all-repeated[=delimit-method]  print all duplicate lines\n\
     153             :                         delimit-method={none(default),prepend,separate}\n\
     154             :                         Delimiting is done with blank lines.\n\
     155             :   -f, --skip-fields=N   avoid comparing the first N fields\n\
     156             :   -i, --ignore-case     ignore differences in case when comparing\n\
     157             :   -s, --skip-chars=N    avoid comparing the first N characters\n\
     158             :   -u, --unique          only print unique lines\n\
     159             :   -z, --zero-terminated  end lines with 0 byte, not newline\n\
     160             : "), stdout);
     161           1 :      fputs (_("\
     162             :   -w, --check-chars=N   compare no more than N characters in lines\n\
     163             : "), stdout);
     164           1 :      fputs (HELP_OPTION_DESCRIPTION, stdout);
     165           1 :      fputs (VERSION_OPTION_DESCRIPTION, stdout);
     166           1 :      fputs (_("\
     167             : \n\
     168             : A field is a run of whitespace, then non-whitespace characters.\n\
     169             : Fields are skipped before chars.\n\
     170             : "), stdout);
     171           1 :      fputs (_("\
     172             : \n\
     173             : Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
     174             : You may want to sort the input first, or use `sort -u' without `uniq'.\n\
     175             : "), stdout);
     176           1 :       emit_bug_reporting_address ();
     177             :     }
     178          29 :   exit (status);
     179             : }
     180             : 
     181             : /* Convert OPT to size_t, reporting an error using MSGID if OPT is
     182             :    invalid.  Silently convert too-large values to SIZE_MAX.  */
     183             : 
     184             : static size_t
     185          15 : size_opt (char const *opt, char const *msgid)
     186             : {
     187             :   unsigned long int size;
     188             :   verify (SIZE_MAX <= ULONG_MAX);
     189             : 
     190          15 :   switch (xstrtoul (opt, NULL, 10, &size, ""))
     191             :     {
     192           4 :     case LONGINT_OK:
     193             :     case LONGINT_OVERFLOW:
     194           4 :       break;
     195             : 
     196          11 :     default:
     197          11 :       error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
     198             :     }
     199             : 
     200           4 :   return MIN (size, SIZE_MAX);
     201             : }
     202             : 
     203             : /* Given a linebuffer LINE,
     204             :    return a pointer to the beginning of the line's field to be compared. */
     205             : 
     206             : static char *
     207         340 : find_field (const struct linebuffer *line)
     208             : {
     209             :   size_t count;
     210         340 :   char *lp = line->buffer;
     211         340 :   size_t size = line->length - 1;
     212         340 :   size_t i = 0;
     213             : 
     214         349 :   for (count = 0; count < skip_fields && i < size; count++)
     215             :     {
     216          22 :       while (i < size && isblank (lp[i]))
     217           4 :         i++;
     218          25 :       while (i < size && !isblank (lp[i]))
     219           7 :         i++;
     220             :     }
     221             : 
     222         340 :   for (count = 0; count < skip_chars && i < size; count++)
     223           0 :     i++;
     224             : 
     225         340 :   return lp + i;
     226             : }
     227             : 
     228             : /* Return false if two strings OLD and NEW match, true if not.
     229             :    OLD and NEW point not to the beginnings of the lines
     230             :    but rather to the beginnings of the fields to compare.
     231             :    OLDLEN and NEWLEN are their lengths. */
     232             : 
     233             : static bool
     234         293 : different (char *old, char *new, size_t oldlen, size_t newlen)
     235             : {
     236         293 :   if (check_chars < oldlen)
     237           0 :     oldlen = check_chars;
     238         293 :   if (check_chars < newlen)
     239           0 :     newlen = check_chars;
     240             : 
     241         293 :   if (ignore_case)
     242             :     {
     243             :       /* FIXME: This should invoke strcoll somehow.  */
     244          21 :       return oldlen != newlen || memcasecmp (old, new, oldlen);
     245             :     }
     246         272 :   else if (hard_LC_COLLATE)
     247           0 :     return xmemcoll (old, oldlen, new, newlen) != 0;
     248             :   else
     249         272 :     return oldlen != newlen || memcmp (old, new, oldlen);
     250             : }
     251             : 
     252             : /* Output the line in linebuffer LINE to standard output
     253             :    provided that the switches say it should be output.
     254             :    MATCH is true if the line matches the previous line.
     255             :    If requested, print the number of times it occurred, as well;
     256             :    LINECOUNT + 1 is the number of times that the line occurred. */
     257             : 
     258             : static void
     259          83 : writeline (struct linebuffer const *line,
     260             :            bool match, uintmax_t linecount)
     261             : {
     262         159 :   if (! (linecount == 0 ? output_unique
     263          76 :          : !match ? output_first_repeated
     264             :          : output_later_repeated))
     265           7 :     return;
     266             : 
     267          76 :   if (countmode == count_occurrences)
     268           4 :     printf ("%7" PRIuMAX " ", linecount + 1);
     269             : 
     270          76 :   fwrite (line->buffer, sizeof (char), line->length, stdout);
     271             : }
     272             : 
     273             : /* Process input file INFILE with output to OUTFILE.
     274             :    If either is "-", use the standard I/O stream for it instead. */
     275             : 
     276             : static void
     277          54 : check_file (const char *infile, const char *outfile, char delimiter)
     278             : {
     279             :   struct linebuffer lb1, lb2;
     280             :   struct linebuffer *thisline, *prevline;
     281             : 
     282          54 :   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
     283           6 :     error (EXIT_FAILURE, errno, "%s", infile);
     284          48 :   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
     285           0 :     error (EXIT_FAILURE, errno, "%s", outfile);
     286             : 
     287          48 :   thisline = &lb1;
     288          48 :   prevline = &lb2;
     289             : 
     290          48 :   initbuffer (thisline);
     291          48 :   initbuffer (prevline);
     292             : 
     293             :   /* The duplication in the following `if' and `else' blocks is an
     294             :      optimization to distinguish the common case (in which none of
     295             :      the following options has been specified: --count, --repeated,
     296             :      --all-repeated, --unique) from the others.  In the common case,
     297             :      this optimization lets uniq output each different line right away,
     298             :      without waiting to see if the next one is different.  */
     299             : 
     300          48 :   if (output_unique && output_first_repeated && countmode == count_none)
     301          31 :     {
     302             :       char *prevfield IF_LINT (= NULL);
     303             :       size_t prevlen IF_LINT (= 0);
     304             : 
     305         272 :       while (!feof (stdin))
     306             :         {
     307             :           char *thisfield;
     308             :           size_t thislen;
     309         240 :           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
     310          30 :             break;
     311         210 :           thisfield = find_field (thisline);
     312         210 :           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
     313         210 :           if (prevline->length == 0
     314         180 :               || different (thisfield, prevfield, thislen, prevlen))
     315             :             {
     316          41 :               fwrite (thisline->buffer, sizeof (char),
     317             :                       thisline->length, stdout);
     318             : 
     319          41 :               SWAP_LINES (prevline, thisline);
     320          41 :               prevfield = thisfield;
     321          41 :               prevlen = thislen;
     322             :             }
     323             :         }
     324             :     }
     325             :   else
     326             :     {
     327             :       char *prevfield;
     328             :       size_t prevlen;
     329          17 :       uintmax_t match_count = 0;
     330          17 :       bool first_delimiter = true;
     331             : 
     332          17 :       if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
     333           0 :         goto closefiles;
     334          17 :       prevfield = find_field (prevline);
     335          17 :       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
     336             : 
     337         147 :       while (!feof (stdin))
     338             :         {
     339             :           bool match;
     340             :           char *thisfield;
     341             :           size_t thislen;
     342         129 :           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
     343             :             {
     344          16 :               if (ferror (stdin))
     345           0 :                 goto closefiles;
     346          16 :               break;
     347             :             }
     348         113 :           thisfield = find_field (thisline);
     349         113 :           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
     350         113 :           match = !different (thisfield, prevfield, thislen, prevlen);
     351         113 :           match_count += match;
     352             : 
     353         113 :           if (match_count == UINTMAX_MAX)
     354             :             {
     355             :               if (count_occurrences)
     356             :                 error (EXIT_FAILURE, 0, _("too many repeated lines"));
     357           0 :               match_count--;
     358             :             }
     359             : 
     360         113 :           if (delimit_groups != DM_NONE)
     361             :             {
     362          39 :               if (!match)
     363             :                 {
     364           5 :                   if (match_count) /* a previous match */
     365           2 :                     first_delimiter = false; /* Only used when DM_SEPARATE */
     366             :                 }
     367          34 :               else if (match_count == 1)
     368             :                 {
     369           8 :                   if ((delimit_groups == DM_PREPEND)
     370           3 :                       || (delimit_groups == DM_SEPARATE
     371           3 :                           && !first_delimiter))
     372           6 :                     putchar (delimiter);
     373             :                 }
     374             :             }
     375             : 
     376         113 :           if (!match || output_later_repeated)
     377             :             {
     378          66 :               writeline (prevline, match, match_count);
     379          66 :               SWAP_LINES (prevline, thisline);
     380          66 :               prevfield = thisfield;
     381          66 :               prevlen = thislen;
     382          66 :               if (!match)
     383          11 :                 match_count = 0;
     384             :             }
     385             :         }
     386             : 
     387          17 :       writeline (prevline, false, match_count);
     388             :     }
     389             : 
     390          48 :  closefiles:
     391          48 :   if (ferror (stdin) || fclose (stdin) != 0)
     392           2 :     error (EXIT_FAILURE, 0, _("error reading %s"), infile);
     393             : 
     394             :   /* stdout is handled via the atexit-invoked close_stdout function.  */
     395             : 
     396          46 :   free (lb1.buffer);
     397          46 :   free (lb2.buffer);
     398          46 : }
     399             : 
     400             : enum Skip_field_option_type
     401             :   {
     402             :     SFO_NONE,
     403             :     SFO_OBSOLETE,
     404             :     SFO_NEW
     405             :   };
     406             : 
     407             : int
     408          96 : main (int argc, char **argv)
     409             : {
     410          96 :   int optc = 0;
     411          96 :   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
     412          96 :   enum Skip_field_option_type skip_field_option_type = SFO_NONE;
     413          96 :   int nfiles = 0;
     414             :   char const *file[2];
     415          96 :   char delimiter = '\n';        /* change with --zero-terminated, -z */
     416             : 
     417          96 :   file[0] = file[1] = "-";
     418             :   initialize_main (&argc, &argv);
     419          96 :   program_name = argv[0];
     420          96 :   setlocale (LC_ALL, "");
     421             :   bindtextdomain (PACKAGE, LOCALEDIR);
     422             :   textdomain (PACKAGE);
     423          96 :   hard_LC_COLLATE = hard_locale (LC_COLLATE);
     424             : 
     425          96 :   atexit (close_stdout);
     426             : 
     427          96 :   skip_chars = 0;
     428          96 :   skip_fields = 0;
     429          96 :   check_chars = SIZE_MAX;
     430          96 :   output_unique = output_first_repeated = true;
     431          96 :   output_later_repeated = false;
     432          96 :   countmode = count_none;
     433          96 :   delimit_groups = DM_NONE;
     434             : 
     435             :   for (;;)
     436             :     {
     437             :       /* Parse an operand with leading "+" as a file after "--" was
     438             :          seen; or if pedantic and a file was seen; or if not
     439             :          obsolete.  */
     440             : 
     441         336 :       if (optc == -1
     442         194 :           || (posixly_correct && nfiles != 0)
     443         194 :           || ((optc = getopt_long (argc, argv,
     444             :                                    "-0123456789Dcdf:is:uw:z", longopts, NULL))
     445             :               == -1))
     446             :         {
     447          77 :           if (argc <= optind)
     448          55 :             break;
     449          22 :           if (nfiles == 2)
     450             :             {
     451           0 :               error (0, 0, _("extra operand %s"), quote (argv[optind]));
     452           0 :               usage (EXIT_FAILURE);
     453             :             }
     454          22 :           file[nfiles++] = argv[optind++];
     455             :         }
     456         139 :       else switch (optc)
     457             :         {
     458          70 :         case 1:
     459             :           {
     460             :             unsigned long int size;
     461          70 :             if (optarg[0] == '+'
     462          32 :                 && posix2_version () < 200112
     463           0 :                 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
     464             :                 && size <= SIZE_MAX)
     465           0 :               skip_chars = size;
     466          70 :             else if (nfiles == 2)
     467             :               {
     468          16 :                 error (0, 0, _("extra operand %s"), quote (optarg));
     469          16 :                 usage (EXIT_FAILURE);
     470             :               }
     471             :             else
     472          54 :               file[nfiles++] = optarg;
     473             :           }
     474          54 :           break;
     475             : 
     476          15 :         case '0':
     477             :         case '1':
     478             :         case '2':
     479             :         case '3':
     480             :         case '4':
     481             :         case '5':
     482             :         case '6':
     483             :         case '7':
     484             :         case '8':
     485             :         case '9':
     486             :           {
     487          15 :             if (skip_field_option_type == SFO_NEW)
     488           1 :               skip_fields = 0;
     489             : 
     490          15 :             if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
     491           0 :               skip_fields = SIZE_MAX;
     492             : 
     493          15 :             skip_field_option_type = SFO_OBSOLETE;
     494             :           }
     495          15 :           break;
     496             : 
     497           4 :         case 'c':
     498           4 :           countmode = count_occurrences;
     499           4 :           break;
     500             : 
     501           5 :         case 'd':
     502           5 :           output_unique = false;
     503           5 :           break;
     504             : 
     505          11 :         case 'D':
     506          11 :           output_unique = false;
     507          11 :           output_later_repeated = true;
     508          11 :           if (optarg == NULL)
     509           2 :             delimit_groups = DM_NONE;
     510             :           else
     511           9 :             delimit_groups = XARGMATCH ("--all-repeated", optarg,
     512             :                                         delimit_method_string,
     513             :                                         delimit_method_map);
     514          10 :           break;
     515             : 
     516           4 :         case 'f':
     517           4 :           skip_field_option_type = SFO_NEW;
     518           4 :           skip_fields = size_opt (optarg,
     519             :                                   N_("invalid number of fields to skip"));
     520           2 :           break;
     521             : 
     522           5 :         case 'i':
     523           5 :           ignore_case = true;
     524           5 :           break;
     525             : 
     526           2 :         case 's':
     527           2 :           skip_chars = size_opt (optarg,
     528             :                                  N_("invalid number of bytes to skip"));
     529           1 :           break;
     530             : 
     531           1 :         case 'u':
     532           1 :           output_first_repeated = false;
     533           1 :           break;
     534             : 
     535           9 :         case 'w':
     536           9 :           check_chars = size_opt (optarg,
     537             :                                   N_("invalid number of bytes to compare"));
     538           1 :           break;
     539             : 
     540           0 :         case 'z':
     541           0 :           delimiter = '\0';
     542           0 :           break;
     543             : 
     544           1 :         case_GETOPT_HELP_CHAR;
     545             : 
     546           2 :         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
     547             : 
     548          10 :         default:
     549          10 :           usage (EXIT_FAILURE);
     550             :         }
     551             :     }
     552             : 
     553          55 :   if (countmode == count_occurrences && output_later_repeated)
     554             :     {
     555           1 :       error (0, 0,
     556             :            _("printing all duplicated lines and repeat counts is meaningless"));
     557           1 :       usage (EXIT_FAILURE);
     558             :     }
     559             : 
     560          54 :   check_file (file[0], file[1], delimiter);
     561             : 
     562          46 :   exit (EXIT_SUCCESS);
     563             : }

Generated by: LCOV version 1.10