Line data Source code
1 : /* join - join lines of two files on a common field
2 : Copyright (C) 91, 1995-2006, 2008 Free Software Foundation, Inc.
3 :
4 : This program is free software: you can redistribute it and/or modify
5 : it under the terms of the GNU General Public License as published by
6 : the Free Software Foundation, either version 3 of the License, or
7 : (at your option) any later version.
8 :
9 : This program is distributed in the hope that it will be useful,
10 : but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : GNU General Public License for more details.
13 :
14 : You should have received a copy of the GNU General Public License
15 : along with this program. If not, see <http://www.gnu.org/licenses/>.
16 :
17 : Written by Mike Haertel, mike@gnu.ai.mit.edu. */
18 :
19 : #include <config.h>
20 :
21 : #include <assert.h>
22 : #include <sys/types.h>
23 : #include <getopt.h>
24 :
25 : #include "system.h"
26 : #include "error.h"
27 : #include "hard-locale.h"
28 : #include "linebuffer.h"
29 : #include "memcasecmp.h"
30 : #include "quote.h"
31 : #include "stdio--.h"
32 : #include "xmemcoll.h"
33 : #include "xstrtol.h"
34 : #include "argmatch.h"
35 :
36 : /* The official name of this program (e.g., no `g' prefix). */
37 : #define PROGRAM_NAME "join"
38 :
39 : #define AUTHORS "Mike Haertel"
40 :
/* NOTE(review): presumably renames this file's static join() so that it
   cannot clash with a `join' symbol declared by a system header -- confirm. */
41 : #define join system_join
42 :
43 : /* An element of the list identifying which fields to print for each
44 : output line. */
45 : struct outlist
46 : {
47 : /* File number: 0, 1, or 2. 0 means use the join field.
48 : 1 means use the first file argument, 2 the second. */
49 : int file;
50 :
51 : /* Field index (zero-based), specified only when FILE is 1 or 2. */
52 : size_t field;
53 :
54 : struct outlist *next; /* Next element, or NULL at the end of the list. */
55 : };
56 :
57 : /* A field of a line. The text is not NUL-terminated; LEN delimits it. */
58 : struct field
59 : {
60 : char *beg; /* First character in field. */
61 : size_t len; /* The length of the field. */
62 : };
63 :
64 : /* A line read from an input file. */
65 : struct line
66 : {
67 : struct linebuffer buf; /* The line itself. */
68 : size_t nfields; /* Number of elements in `fields'. */
69 : size_t nfields_allocated; /* Number of elements allocated for `fields'. */
70 : struct field *fields; /* Field table; each entry points into `buf'. */
71 : };
72 :
73 : /* One or more consecutive lines read from a file that all have the
74 : same join field value. */
75 : struct seq
76 : {
77 : size_t count; /* Elements used in `lines'. */
78 : size_t alloc; /* Elements allocated in `lines'. */
79 : struct line *lines; /* Array of lines, grown on demand by getseq. */
80 : };
81 :
82 : /* The name this program was run with. */
83 : char *program_name;
84 :
85 : /* A private copy of the previous line read from each file, kept for the sort-order check; NULL until a line has been read. */
86 : static struct line *prevline[2];
87 :
88 : /* True if the LC_COLLATE locale is hard. */
89 : static bool hard_LC_COLLATE;
90 :
91 : /* If true, print unpairable lines in file 1 or 2. */
92 : static bool print_unpairables_1, print_unpairables_2;
93 :
94 : /* If true, print pairable lines. */
95 : static bool print_pairables;
96 :
97 : /* If true, we have seen at least one unpairable line. */
98 : static bool seen_unpairable;
99 :
100 : /* If true, we have warned about disorder in that file. */
101 : static bool issued_disorder_warning[2];
102 :
103 : /* Empty output field filler (the -e argument), or NULL if none was given. */
104 : static char const *empty_filler;
105 :
106 : /* Field to join on; SIZE_MAX means they haven't been determined yet. */
107 : static size_t join_field_1 = SIZE_MAX;
108 : static size_t join_field_2 = SIZE_MAX;
109 :
110 : /* List of fields to print. This is a dummy head node; the first real element is outlist_head.next. */
111 : static struct outlist outlist_head;
112 :
113 : /* Last element in `outlist', where a new element can be added. */
114 : static struct outlist *outlist_end = &outlist_head;
115 :
116 : /* Tab character separating fields. If negative, fields are separated
117 : by any nonempty string of blanks, otherwise by exactly one
118 : tab character whose value (when cast to unsigned char) equals TAB. */
119 : static int tab = -1;
120 :
121 : /* How strictly to check that the input is correctly ordered; consulted by check_order and join. */
122 : static enum
123 : {
124 : CHECK_ORDER_DEFAULT,
125 : CHECK_ORDER_ENABLED,
126 : CHECK_ORDER_DISABLED
127 : } check_input_order;
128 : /* getopt_long return codes for the long options that have no single-letter form. */
129 : enum
130 : {
131 : CHECK_ORDER_OPTION = CHAR_MAX + 1,
132 : NOCHECK_ORDER_OPTION
133 : };
134 :
135 :
136 : static struct option const longopts[] =
137 : {
138 : {"ignore-case", no_argument, NULL, 'i'},
139 : {"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
140 : {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
141 : {GETOPT_HELP_OPTION_DECL},
142 : {GETOPT_VERSION_OPTION_DECL},
143 : {NULL, 0, NULL, 0}
144 : };
145 :
146 : /* An empty line (no fields) passed to prjoin in place of the missing partner when printing an unpairable line. */
147 : static struct line uni_blank;
148 :
149 : /* If true, ignore case when comparing join fields. */
150 : static bool ignore_case;
151 : /* Print usage information and exit with STATUS: a one-line hint on stderr when STATUS is not EXIT_SUCCESS, otherwise the full help text on stdout. Never returns. */
152 : void
153 49 : usage (int status)
154 : {
155 49 : if (status != EXIT_SUCCESS)
156 47 : fprintf (stderr, _("Try `%s --help' for more information.\n"),
157 : program_name);
158 : else
159 : {
160 2 : printf (_("\
161 : Usage: %s [OPTION]... FILE1 FILE2\n\
162 : "),
163 : program_name);
164 2 : fputs (_("\
165 : For each pair of input lines with identical join fields, write a line to\n\
166 : standard output. The default join field is the first, delimited\n\
167 : by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
168 : \n\
169 : -a FILENUM print unpairable lines coming from file FILENUM, where\n\
170 : FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\
171 : -e EMPTY replace missing input fields with EMPTY\n\
172 : "), stdout);
173 2 : fputs (_("\
174 : -i, --ignore-case ignore differences in case when comparing fields\n\
175 : -j FIELD equivalent to `-1 FIELD -2 FIELD'\n\
176 : -o FORMAT obey FORMAT while constructing output line\n\
177 : -t CHAR use CHAR as input and output field separator\n\
178 : "), stdout);
179 2 : fputs (_("\
180 : -v FILENUM like -a FILENUM, but suppress joined output lines\n\
181 : -1 FIELD join on this FIELD of file 1\n\
182 : -2 FIELD join on this FIELD of file 2\n\
183 : --check-order check that the input is correctly sorted, even\n\
184 : if all input lines are pairable\n\
185 : --nocheck-order do not check that the input is correctly sorted\n\
186 : "), stdout);
187 2 : fputs (HELP_OPTION_DESCRIPTION, stdout);
188 2 : fputs (VERSION_OPTION_DESCRIPTION, stdout);
189 2 : fputs (_("\
190 : \n\
191 : Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
192 : else fields are separated by CHAR. Any FIELD is a field number counted\n\
193 : from 1. FORMAT is one or more comma or blank separated specifications,\n\
194 : each being `FILENUM.FIELD' or `0'. Default FORMAT outputs the join field,\n\
195 : the remaining fields from FILE1, the remaining fields from FILE2, all\n\
196 : separated by CHAR.\n\
197 : \n\
198 : Important: FILE1 and FILE2 must be sorted on the join fields.\n\
199 : E.g., use `sort -k 1b,1' if `join' has no options.\n\
200 : If the input is not sorted and some lines cannot be joined, a\n\
201 : warning message will be given.\n\
202 : "), stdout);
203 2 : emit_bug_reporting_address ();
204 : }
205 49 : exit (status);
206 : }
207 :
208 : /* Record a field in LINE, with location FIELD and size LEN. */
209 :
210 : static void
211 48 : extract_field (struct line *line, char *field, size_t len)
212 : {
213 48 : if (line->nfields >= line->nfields_allocated)
214 : {
215 44 : line->fields = X2NREALLOC (line->fields, &line->nfields_allocated);
216 : }
217 48 : line->fields[line->nfields].beg = field;
218 48 : line->fields[line->nfields].len = len;
219 48 : ++(line->nfields);
220 48 : }
221 :
222 : /* Fill in the `fields' structure in LINE. */
223 :
224 : static void
225 313 : xfields (struct line *line)
226 : {
227 313 : char *ptr = line->buf.buffer;
228 313 : char const *lim = ptr + line->buf.length - 1;
229 :
230 313 : if (ptr == lim)
231 267 : return;
232 :
233 46 : if (0 <= tab)
234 : {
235 : char *sep;
236 0 : for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
237 0 : extract_field (line, ptr, sep - ptr);
238 : }
239 : else
240 : {
241 : /* Skip leading blanks before the first field. */
242 92 : while (isblank (to_uchar (*ptr)))
243 2 : if (++ptr == lim)
244 2 : return;
245 :
246 : do
247 : {
248 : char *sep;
249 48 : for (sep = ptr + 1; sep != lim && ! isblank (to_uchar (*sep)); sep++)
250 2 : continue;
251 46 : extract_field (line, ptr, sep - ptr);
252 46 : if (sep == lim)
253 42 : return;
254 5 : for (ptr = sep + 1; ptr != lim && isblank (to_uchar (*ptr)); ptr++)
255 1 : continue;
256 : }
257 4 : while (ptr != lim);
258 : }
259 :
260 2 : extract_field (line, ptr, lim - ptr);
261 : }
262 :
263 : static struct line *
264 313 : dup_line (const struct line *old)
265 : {
266 313 : struct line *newline = xmalloc (sizeof *newline);
267 : size_t i;
268 :
269 : /* Duplicate the buffer. */
270 313 : initbuffer (&newline->buf);
271 313 : newline->buf.buffer = xmalloc (old->buf.size);
272 313 : newline->buf.size = old->buf.size;
273 313 : memcpy (newline->buf.buffer, old->buf.buffer, old->buf.length);
274 313 : newline->buf.length = old->buf.length;
275 :
276 : /* Duplicate the field positions. */
277 313 : newline->fields = xnmalloc (old->nfields_allocated, sizeof *newline->fields);
278 313 : newline->nfields = old->nfields;
279 313 : newline->nfields_allocated = old->nfields_allocated;
280 :
281 361 : for (i = 0; i < old->nfields; i++)
282 : {
283 48 : newline->fields[i].len = old->fields[i].len;
284 96 : newline->fields[i].beg = newline->buf.buffer + (old->fields[i].beg
285 48 : - old->buf.buffer);
286 : }
287 313 : return newline;
288 : }
289 :
290 : static void
291 558 : freeline (struct line *line)
292 : {
293 558 : free (line->fields);
294 558 : free (line->buf.buffer);
295 558 : line->buf.buffer = NULL;
296 558 : }
297 :
298 : /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
299 : >0 if it compares greater; 0 if it compares equal.
300 : Report an error and exit if the comparison fails.
301 : Use join fields JF_1 and JF_2 respectively. */
302 :
303 : static int
304 340 : keycmp (struct line const *line1, struct line const *line2,
305 : size_t jf_1, size_t jf_2)
306 : {
307 : /* Start of field to compare in each file. */
308 : char *beg1;
309 : char *beg2;
310 :
311 : size_t len1;
312 : size_t len2; /* Length of fields to compare. */
313 : int diff;
314 :
315 340 : if (jf_1 < line1->nfields)
316 : {
317 98 : beg1 = line1->fields[jf_1].beg;
318 98 : len1 = line1->fields[jf_1].len;
319 : }
320 : else
321 : {
322 242 : beg1 = NULL;
323 242 : len1 = 0;
324 : }
325 :
326 340 : if (jf_2 < line2->nfields)
327 : {
328 68 : beg2 = line2->fields[jf_2].beg;
329 68 : len2 = line2->fields[jf_2].len;
330 : }
331 : else
332 : {
333 272 : beg2 = NULL;
334 272 : len2 = 0;
335 : }
336 :
337 340 : if (len1 == 0)
338 242 : return len2 == 0 ? 0 : -1;
339 98 : if (len2 == 0)
340 79 : return 1;
341 :
342 19 : if (ignore_case)
343 : {
344 : /* FIXME: ignore_case does not work with NLS (in particular,
345 : with multibyte chars). */
346 3 : diff = memcasecmp (beg1, beg2, MIN (len1, len2));
347 : }
348 : else
349 : {
350 16 : if (hard_LC_COLLATE)
351 0 : return xmemcoll (beg1, len1, beg2, len2);
352 16 : diff = memcmp (beg1, beg2, MIN (len1, len2));
353 : }
354 :
355 19 : if (diff)
356 3 : return diff;
357 16 : return len1 < len2 ? -1 : len1 != len2;
358 : }
359 :
360 : /* Check that successive input lines PREV and CURRENT from input file
361 : WHATFILE are presented in order, unless the user may be relying on
362 : the GNU extension that input lines may be out of order if no input
363 : lines are unpairable.
364 :
365 : If the user specified --nocheck-order, the check is not made.
366 : If the user specified --check-order, the problem is fatal.
367 : Otherwise (the default), the message is simply a warning.
368 :
369 : A message is printed at most once per input file. */
370 :
371 : static void
372 245 : check_order (const struct line *prev,
373 : const struct line *current,
374 : int whatfile)
375 : {
376 245 : if (check_input_order != CHECK_ORDER_DISABLED
377 231 : && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable))
378 : {
379 95 : if (!issued_disorder_warning[whatfile-1])
380 : {
381 71 : size_t join_field = whatfile == 1 ? join_field_1 : join_field_2;
382 71 : if (keycmp (prev, current, join_field, join_field) > 0)
383 : {
384 14 : error ((check_input_order == CHECK_ORDER_ENABLED
385 : ? EXIT_FAILURE : 0),
386 : 0, _("File %d is not in sorted order"), whatfile);
387 :
388 : /* If we get to here, the message was just a warning, but we
389 : want only to issue it once. */
390 14 : issued_disorder_warning[whatfile-1] = true;
391 : }
392 : }
393 : }
394 245 : }
395 :
396 : /* Read a line from FP into LINE and split it into fields.
397 : Return true if successful. */
398 :
399 : static bool
400 363 : get_line (FILE *fp, struct line *line, int which)
401 : {
402 363 : initbuffer (&line->buf);
403 :
404 363 : if (! readlinebuffer (&line->buf, fp))
405 : {
406 50 : if (ferror (fp))
407 25 : error (EXIT_FAILURE, errno, _("read error"));
408 25 : free (line->buf.buffer);
409 25 : line->buf.buffer = NULL;
410 25 : return false;
411 : }
412 :
413 313 : line->nfields_allocated = 0;
414 313 : line->nfields = 0;
415 313 : line->fields = NULL;
416 313 : xfields (line);
417 :
418 313 : if (prevline[which - 1])
419 : {
420 245 : check_order (prevline[which - 1], line, which);
421 245 : freeline (prevline[which - 1]);
422 245 : free (prevline[which - 1]);
423 : }
424 313 : prevline[which - 1] = dup_line (line);
425 313 : return true;
426 : }
427 :
428 : static void
429 132 : free_prevline (void)
430 : {
431 : size_t i;
432 :
433 396 : for (i = 0; i < ARRAY_CARDINALITY (prevline); i++)
434 : {
435 264 : if (prevline[i])
436 68 : freeline (prevline[i]);
437 264 : free (prevline[i]);
438 264 : prevline[i] = NULL;
439 : }
440 132 : }
441 :
442 : static void
443 82 : initseq (struct seq *seq)
444 : {
445 82 : seq->count = 0;
446 82 : seq->alloc = 0;
447 82 : seq->lines = NULL;
448 82 : }
449 :
450 : /* Read a line from FP and add it to SEQ. Return true if successful. */
451 :
452 : static bool
453 342 : getseq (FILE *fp, struct seq *seq, int whichfile)
454 : {
455 342 : if (seq->count == seq->alloc)
456 188 : seq->lines = X2NREALLOC (seq->lines, &seq->alloc);
457 :
458 342 : if (get_line (fp, &seq->lines[seq->count], whichfile))
459 : {
460 296 : ++seq->count;
461 296 : return true;
462 : }
463 23 : return false;
464 : }
465 :
466 : /* Read a line from FP and add it to SEQ, as the first item if FIRST is
467 : true, else as the next. */
468 : static bool
469 260 : advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile)
470 : {
471 260 : if (first)
472 : {
473 80 : freeline (&seq->lines[0]);
474 80 : seq->count = 0;
475 : }
476 260 : return getseq (fp, seq, whichfile);
477 : }
478 :
479 : static void
480 32 : delseq (struct seq *seq)
481 : {
482 : size_t i;
483 44 : for (i = 0; i < seq->count; i++)
484 12 : if (seq->lines[i].buf.buffer)
485 2 : freeline (&seq->lines[i]);
486 32 : free (seq->lines);
487 32 : }
488 :
489 :
490 : /* Print field N of LINE if it exists and is nonempty, otherwise
491 : `empty_filler' if it is nonempty. */
492 :
493 : static void
494 258 : prfield (size_t n, struct line const *line)
495 : {
496 : size_t len;
497 :
498 258 : if (n < line->nfields)
499 : {
500 13 : len = line->fields[n].len;
501 13 : if (len)
502 13 : fwrite (line->fields[n].beg, 1, len, stdout);
503 0 : else if (empty_filler)
504 0 : fputs (empty_filler, stdout);
505 : }
506 245 : else if (empty_filler)
507 0 : fputs (empty_filler, stdout);
508 258 : }
509 :
510 : /* Print the join of LINE1 and LINE2. */
511 :
512 : static void
513 258 : prjoin (struct line const *line1, struct line const *line2)
514 : {
515 : const struct outlist *outlist;
516 258 : char output_separator = tab < 0 ? ' ' : tab;
517 :
518 258 : outlist = outlist_head.next;
519 258 : if (outlist)
520 : {
521 : const struct outlist *o;
522 :
523 0 : o = outlist;
524 : while (1)
525 0 : {
526 : size_t field;
527 : struct line const *line;
528 :
529 0 : if (o->file == 0)
530 : {
531 0 : if (line1 == &uni_blank)
532 : {
533 0 : line = line2;
534 0 : field = join_field_2;
535 : }
536 : else
537 : {
538 0 : line = line1;
539 0 : field = join_field_1;
540 : }
541 : }
542 : else
543 : {
544 0 : line = (o->file == 1 ? line1 : line2);
545 0 : field = o->field;
546 : }
547 0 : prfield (field, line);
548 0 : o = o->next;
549 0 : if (o == NULL)
550 0 : break;
551 0 : putchar (output_separator);
552 : }
553 0 : putchar ('\n');
554 : }
555 : else
556 : {
557 : size_t i;
558 :
559 258 : if (line1 == &uni_blank)
560 : {
561 : struct line const *t;
562 0 : t = line1;
563 0 : line1 = line2;
564 0 : line2 = t;
565 : }
566 258 : prfield (join_field_1, line1);
567 258 : for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
568 : {
569 0 : putchar (output_separator);
570 0 : prfield (i, line1);
571 : }
572 258 : for (i = join_field_1 + 1; i < line1->nfields; ++i)
573 : {
574 0 : putchar (output_separator);
575 0 : prfield (i, line1);
576 : }
577 :
578 258 : for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
579 : {
580 0 : putchar (output_separator);
581 0 : prfield (i, line2);
582 : }
583 258 : for (i = join_field_2 + 1; i < line2->nfields; ++i)
584 : {
585 0 : putchar (output_separator);
586 0 : prfield (i, line2);
587 : }
588 258 : putchar ('\n');
589 : }
590 258 : }
591 :
592 : /* Print the join of the files in FP1 and FP2. Each file is consumed one group of equal-keyed lines at a time; unpairable lines are printed only when the matching print_unpairables_N flag is set. */
593 :
594 : static void
595 41 : join (FILE *fp1, FILE *fp2)
596 : {
597 : struct seq seq1, seq2;
598 : struct line line;
599 : int diff;
600 : bool eof1, eof2, checktail;
601 :
602 : /* Read the first line of each file. */
603 41 : initseq (&seq1);
604 41 : getseq (fp1, &seq1, 1);
605 41 : initseq (&seq2);
606 41 : getseq (fp2, &seq2, 2);
607 :
608 157 : while (seq1.count && seq2.count)
609 : {
610 : size_t i;
611 112 : diff = keycmp (&seq1.lines[0], &seq2.lines[0],
612 : join_field_1, join_field_2);
613 112 : if (diff < 0)
614 : {
/* File 1's line has no partner: report it if requested and move on. */
615 32 : if (print_unpairables_1)
616 0 : prjoin (&seq1.lines[0], &uni_blank);
617 32 : advance_seq (fp1, &seq1, true, 1);
618 30 : seen_unpairable = true;
619 30 : continue;
620 : }
621 80 : if (diff > 0)
622 : {
/* Likewise for an unpairable line of file 2. */
623 48 : if (print_unpairables_2)
624 0 : prjoin (&uni_blank, &seq2.lines[0]);
625 48 : advance_seq (fp2, &seq2, true, 2);
626 48 : seen_unpairable = true;
627 48 : continue;
628 : }
629 :
630 : /* Keep reading lines from file1 as long as they continue to
631 : match the current line from file2. */
632 32 : eof1 = false;
633 : do
634 111 : if (!advance_seq (fp1, &seq1, false, 1))
635 : {
636 8 : eof1 = true;
637 8 : ++seq1.count; /* Count the unfilled slot so that `count - 1' below still covers every line read. */
638 8 : break;
639 : }
640 97 : while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0],
641 97 : join_field_1, join_field_2));
642 :
643 : /* Keep reading lines from file2 as long as they continue to
644 : match the current line from file1. */
645 26 : eof2 = false;
646 : do
647 69 : if (!advance_seq (fp2, &seq2, false, 2))
648 : {
649 8 : eof2 = true;
650 8 : ++seq2.count; /* Same trick as for seq1 above. */
651 8 : break;
652 : }
653 60 : while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1],
654 60 : join_field_1, join_field_2));
655 :
/* The last slot of each sequence holds either the first non-matching line or nothing (at EOF), so the matching lines are 0 .. count - 2. */
656 25 : if (print_pairables)
657 : {
658 91 : for (i = 0; i < seq1.count - 1; ++i)
659 : {
660 : size_t j;
661 324 : for (j = 0; j < seq2.count - 1; ++j)
662 258 : prjoin (&seq1.lines[i], &seq2.lines[j]);
663 : }
664 : }
665 :
/* Discard the lines just joined; the look-ahead line, if any, becomes the head of the next group. */
666 91 : for (i = 0; i < seq1.count - 1; ++i)
667 66 : freeline (&seq1.lines[i]);
668 25 : if (!eof1)
669 : {
670 18 : seq1.lines[0] = seq1.lines[seq1.count - 1];
671 18 : seq1.count = 1;
672 : }
673 : else
674 7 : seq1.count = 0;
675 :
676 93 : for (i = 0; i < seq2.count - 1; ++i)
677 68 : freeline (&seq2.lines[i]);
678 25 : if (!eof2)
679 : {
680 17 : seq2.lines[0] = seq2.lines[seq2.count - 1];
681 17 : seq2.count = 1;
682 : }
683 : else
684 8 : seq2.count = 0;
685 : }
686 :
687 : /* If the user did not specify --nocheck-order, then we read the
688 : tail ends of both inputs to verify that they are in order. We
689 : skip the rest of the tail once we have issued a warning for that
690 : file, unless we actually need to print the unpairable lines. */
691 18 : if (check_input_order != CHECK_ORDER_DISABLED
692 18 : && !(issued_disorder_warning[0] && issued_disorder_warning[1]))
693 16 : checktail = true;
694 : else
695 2 : checktail = false;
696 :
/* Whatever is left in file 1 is unpairable, because file 2 is exhausted. */
697 18 : if ((print_unpairables_1 || checktail) && seq1.count)
698 : {
699 8 : if (print_unpairables_1)
700 0 : prjoin (&seq1.lines[0], &uni_blank);
701 8 : freeline (&seq1.lines[0]);
702 8 : seen_unpairable = true;
703 21 : while (get_line (fp1, &line, 1))
704 : {
705 11 : if (print_unpairables_1)
706 0 : prjoin (&line, &uni_blank);
707 11 : freeline (&line);
708 11 : if (issued_disorder_warning[0] && !print_unpairables_1)
709 6 : break;
710 : }
711 : }
712 :
/* Likewise for whatever is left in file 2. */
713 16 : if ((print_unpairables_2 || checktail) && seq2.count)
714 : {
715 4 : if (print_unpairables_2)
716 0 : prjoin (&uni_blank, &seq2.lines[0]);
717 4 : freeline (&seq2.lines[0]);
718 4 : seen_unpairable = true;
719 12 : while (get_line (fp2, &line, 2))
720 : {
721 6 : if (print_unpairables_2)
722 0 : prjoin (&uni_blank, &line);
723 6 : freeline (&line);
724 6 : if (issued_disorder_warning[1] && !print_unpairables_2)
725 2 : break;
726 : }
727 : }
728 :
729 16 : delseq (&seq1);
730 16 : delseq (&seq2);
731 16 : }
732 :
733 : /* Add a field spec for field FIELD of file FILE to `outlist'. */
734 :
735 : static void
736 2 : add_field (int file, size_t field)
737 : {
738 : struct outlist *o;
739 :
740 2 : assert (file == 0 || file == 1 || file == 2);
741 2 : assert (file != 0 || field == 0);
742 :
743 2 : o = xmalloc (sizeof *o);
744 2 : o->file = file;
745 2 : o->field = field;
746 2 : o->next = NULL;
747 :
748 : /* Add to the end of the list so the fields are in the right order. */
749 2 : outlist_end->next = o;
750 2 : outlist_end = o;
751 2 : }
752 :
753 : /* Convert a string of decimal digits, STR (the 1-based join field number),
754 : to an integral value. Upon successful conversion, return one less
755 : (the zero-based field number). Silently convert too-large values
756 : to SIZE_MAX - 1. Otherwise, if a value cannot be converted, give a
757 : diagnostic and exit. */
758 :
759 : static size_t
760 11 : string_to_join_field (char const *str)
761 : {
762 : size_t result;
763 : unsigned long int val;
764 : verify (SIZE_MAX <= ULONG_MAX);
765 :
766 11 : strtol_error s_err = xstrtoul (str, NULL, 10, &val, "");
767 11 : if (s_err == LONGINT_OVERFLOW || (s_err == LONGINT_OK && SIZE_MAX < val))
768 0 : val = SIZE_MAX;
769 11 : else if (s_err != LONGINT_OK || val == 0)
770 8 : error (EXIT_FAILURE, 0, _("invalid field number: %s"), quote (str));
771 :
772 3 : result = val - 1;
773 :
774 3 : return result;
775 : }
776 :
/* Decode one field specifier S into *FILE_INDEX and *FIELD_INDEX.
   S is either `0' (the join field) or `FILENUM.FIELD' with FILENUM 1
   or 2.  In S the field number is 1-based; *FIELD_INDEX is zero-based.
   If S is not valid, give a diagnostic and exit.  */

static void
decode_field_spec (const char *s, int *file_index, size_t *field_index)
{
  char const lead = s[0];

  if (lead == '0')
    {
      /* `0' must be all alone -- no `.FIELD'.  */
      if (s[1] != '\0')
        error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
      *file_index = 0;
      *field_index = 0;
    }
  else if (lead == '1' || lead == '2')
    {
      if (s[1] != '.')
        error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
      *file_index = lead - '0';
      *field_index = string_to_join_field (s + 2);
    }
  else
    {
      error (EXIT_FAILURE, 0,
             _("invalid file number in field spec: %s"), quote (s));

      /* Tell gcc -W -Wall that we can't get beyond this point.
         This avoids a warning (otherwise legit) that the caller's copies
         of *file_index and *field_index might be used uninitialized.  */
      abort ();
    }
}
817 :
/* Decode each field spec in STR and append it to the output list.
   Specs are separated by a comma, a space or a tab.  STR is modified
   in place: each separator is overwritten with a NUL byte.  */

static void
add_field_list (char *str)
{
  char *cursor = str;

  for (;;)
    {
      int file_index;
      size_t field_index;
      char const *item = cursor;
      char *delim = strpbrk (cursor, ", \t");

      /* Terminate this item and remember where the next one starts.  */
      if (delim != NULL)
        {
          *delim = '\0';
          cursor = delim + 1;
        }
      else
        cursor = NULL;

      decode_field_spec (item, &file_index, &field_index);
      add_field (file_index, field_index);

      if (cursor == NULL)
        break;
    }
}
839 :
/* Store VAL in the join field *VAR.  If *VAR was already given a
   different value (anything other than the "unset" marker SIZE_MAX),
   report the conflict, using 1-based field numbers, and exit.  */

static void
set_join_field (size_t *var, size_t val)
{
  bool already_set = (*var != SIZE_MAX);

  if (already_set && *var != val)
    error (EXIT_FAILURE, 0, _("incompatible join fields %lu, %lu"),
           (unsigned long int) (*var + 1), (unsigned long int) (val + 1));

  *var = val;
}
855 :
856 : /* How a non-option command-line argument may be interpreted, given the option (if any) that immediately preceded it. */
857 :
858 : enum operand_status
859 : {
860 : /* This argument must be an operand, i.e., one of the files to be
861 : joined. */
862 : MUST_BE_OPERAND,
863 :
864 : /* This might be the argument of the preceding -j1 or -j2 option,
865 : or it might be an operand. MIGHT_BE_J2_ARG must directly follow
866 : MIGHT_BE_J1_ARG, because main computes MIGHT_BE_J1_ARG + is_j2. */
867 : MIGHT_BE_J1_ARG,
868 : MIGHT_BE_J2_ARG,
869 : /* This might be the argument of the preceding -o option, or it might be
870 : an operand. */
871 : MIGHT_BE_O_ARG
872 : };
873 :
874 : /* Add NAME to the array of input file NAMES with operand statuses
875 : OPERAND_STATUS; currently there are NFILES names in the list. */
876 :
877 : static void
878 132 : add_file_name (char *name, char *names[2],
879 : int operand_status[2], int joption_count[2], int *nfiles,
880 : int *prev_optc_status, int *optc_status)
881 : {
882 132 : int n = *nfiles;
883 :
884 132 : if (n == 2)
885 : {
886 1 : bool op0 = (operand_status[0] == MUST_BE_OPERAND);
887 1 : char *arg = names[op0];
888 1 : switch (operand_status[op0])
889 : {
890 1 : case MUST_BE_OPERAND:
891 1 : error (0, 0, _("extra operand %s"), quote (name));
892 1 : usage (EXIT_FAILURE);
893 :
894 0 : case MIGHT_BE_J1_ARG:
895 0 : joption_count[0]--;
896 0 : set_join_field (&join_field_1, string_to_join_field (arg));
897 0 : break;
898 :
899 0 : case MIGHT_BE_J2_ARG:
900 0 : joption_count[1]--;
901 0 : set_join_field (&join_field_2, string_to_join_field (arg));
902 0 : break;
903 :
904 0 : case MIGHT_BE_O_ARG:
905 0 : add_field_list (arg);
906 0 : break;
907 : }
908 0 : if (!op0)
909 : {
910 0 : operand_status[0] = operand_status[1];
911 0 : names[0] = names[1];
912 : }
913 0 : n = 1;
914 : }
915 :
916 131 : operand_status[n] = *prev_optc_status;
917 131 : names[n] = name;
918 131 : *nfiles = n + 1;
919 131 : if (*prev_optc_status == MIGHT_BE_O_ARG)
920 0 : *optc_status = MIGHT_BE_O_ARG;
921 131 : }
922 :
923 : int
924 132 : main (int argc, char **argv)
925 : {
926 : int optc_status;
927 132 : int prev_optc_status = MUST_BE_OPERAND;
928 : int operand_status[2];
929 132 : int joption_count[2] = { 0, 0 };
930 : char *names[2];
931 : FILE *fp1, *fp2;
932 : int optc;
933 132 : int nfiles = 0;
934 : int i;
935 :
936 : initialize_main (&argc, &argv);
937 132 : program_name = argv[0];
938 132 : setlocale (LC_ALL, "");
939 : bindtextdomain (PACKAGE, LOCALEDIR);
940 : textdomain (PACKAGE);
941 132 : hard_LC_COLLATE = hard_locale (LC_COLLATE);
942 :
943 132 : atexit (close_stdout);
944 132 : atexit (free_prevline);
945 :
946 132 : print_pairables = true;
947 132 : seen_unpairable = false;
948 132 : issued_disorder_warning[0] = issued_disorder_warning[1] = false;
949 132 : check_input_order = CHECK_ORDER_DEFAULT;
950 :
951 381 : while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
952 : longopts, NULL))
953 : != -1)
954 : {
955 159 : optc_status = MUST_BE_OPERAND;
956 :
957 159 : switch (optc)
958 : {
959 1 : case 'v':
960 1 : print_pairables = false;
961 : /* Fall through. */
962 :
963 15 : case 'a':
964 : {
965 : unsigned long int val;
966 15 : if (xstrtoul (optarg, NULL, 10, &val, "") != LONGINT_OK
967 5 : || (val != 1 && val != 2))
968 12 : error (EXIT_FAILURE, 0,
969 : _("invalid field number: %s"), quote (optarg));
970 3 : if (val == 1)
971 2 : print_unpairables_1 = true;
972 : else
973 1 : print_unpairables_2 = true;
974 : }
975 3 : break;
976 :
977 6 : case 'e':
978 6 : if (empty_filler && ! STREQ (empty_filler, optarg))
979 1 : error (EXIT_FAILURE, 0,
980 : _("conflicting empty-field replacement strings"));
981 5 : empty_filler = optarg;
982 5 : break;
983 :
984 8 : case 'i':
985 8 : ignore_case = true;
986 8 : break;
987 :
988 3 : case '1':
989 3 : set_join_field (&join_field_1, string_to_join_field (optarg));
990 0 : break;
991 :
992 3 : case '2':
993 3 : set_join_field (&join_field_2, string_to_join_field (optarg));
994 0 : break;
995 :
996 6 : case 'j':
997 6 : if ((optarg[0] == '1' || optarg[0] == '2') && !optarg[1]
998 4 : && optarg == argv[optind - 1] + 2)
999 2 : {
1000 : /* The argument was either "-j1" or "-j2". */
1001 2 : bool is_j2 = (optarg[0] == '2');
1002 2 : joption_count[is_j2]++;
1003 2 : optc_status = MIGHT_BE_J1_ARG + is_j2;
1004 : }
1005 : else
1006 : {
1007 4 : set_join_field (&join_field_1, string_to_join_field (optarg));
1008 3 : set_join_field (&join_field_2, join_field_1);
1009 : }
1010 5 : break;
1011 :
1012 9 : case 'o':
1013 9 : add_field_list (optarg);
1014 1 : optc_status = MIGHT_BE_O_ARG;
1015 1 : break;
1016 :
1017 4 : case 't':
1018 : {
1019 4 : unsigned char newtab = optarg[0];
1020 4 : if (! newtab)
1021 1 : error (EXIT_FAILURE, 0, _("empty tab"));
1022 3 : if (optarg[1])
1023 : {
1024 2 : if (STREQ (optarg, "\\0"))
1025 1 : newtab = '\0';
1026 : else
1027 1 : error (EXIT_FAILURE, 0, _("multi-character tab %s"),
1028 : quote (optarg));
1029 : }
1030 2 : if (0 <= tab && tab != newtab)
1031 0 : error (EXIT_FAILURE, 0, _("incompatible tabs"));
1032 2 : tab = newtab;
1033 : }
1034 2 : break;
1035 :
1036 3 : case NOCHECK_ORDER_OPTION:
1037 3 : check_input_order = CHECK_ORDER_DISABLED;
1038 3 : break;
1039 :
1040 2 : case CHECK_ORDER_OPTION:
1041 2 : check_input_order = CHECK_ORDER_ENABLED;
1042 2 : break;
1043 :
1044 89 : case 1: /* Non-option argument. */
1045 89 : add_file_name (optarg, names, operand_status, joption_count,
1046 : &nfiles, &prev_optc_status, &optc_status);
1047 88 : break;
1048 :
1049 2 : case_GETOPT_HELP_CHAR;
1050 :
1051 1 : case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1052 :
1053 8 : default:
1054 8 : usage (EXIT_FAILURE);
1055 : }
1056 :
1057 117 : prev_optc_status = optc_status;
1058 : }
1059 :
1060 : /* Process any operands after "--". */
1061 90 : prev_optc_status = MUST_BE_OPERAND;
1062 223 : while (optind < argc)
1063 43 : add_file_name (argv[optind++], names, operand_status, joption_count,
1064 : &nfiles, &prev_optc_status, &optc_status);
1065 :
1066 90 : if (nfiles != 2)
1067 : {
1068 38 : if (nfiles == 0)
1069 16 : error (0, 0, _("missing operand"));
1070 : else
1071 22 : error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
1072 38 : usage (EXIT_FAILURE);
1073 : }
1074 :
1075 : /* If "-j1" was specified and it turns out not to have had an argument,
1076 : treat it as "-j 1". Likewise for -j2. */
1077 156 : for (i = 0; i < 2; i++)
1078 104 : if (joption_count[i] != 0)
1079 : {
1080 1 : set_join_field (&join_field_1, i);
1081 1 : set_join_field (&join_field_2, i);
1082 : }
1083 :
1084 52 : if (join_field_1 == SIZE_MAX)
1085 51 : join_field_1 = 0;
1086 52 : if (join_field_2 == SIZE_MAX)
1087 51 : join_field_2 = 0;
1088 :
1089 52 : fp1 = STREQ (names[0], "-") ? stdin : fopen (names[0], "r");
1090 52 : if (!fp1)
1091 4 : error (EXIT_FAILURE, errno, "%s", names[0]);
1092 48 : fp2 = STREQ (names[1], "-") ? stdin : fopen (names[1], "r");
1093 48 : if (!fp2)
1094 6 : error (EXIT_FAILURE, errno, "%s", names[1]);
1095 42 : if (fp1 == fp2)
1096 1 : error (EXIT_FAILURE, errno, _("both files cannot be standard input"));
1097 41 : join (fp1, fp2);
1098 :
1099 16 : if (fclose (fp1) != 0)
1100 0 : error (EXIT_FAILURE, errno, "%s", names[0]);
1101 16 : if (fclose (fp2) != 0)
1102 0 : error (EXIT_FAILURE, errno, "%s", names[1]);
1103 :
1104 16 : if (issued_disorder_warning[0] || issued_disorder_warning[1])
1105 10 : exit (EXIT_FAILURE);
1106 : else
1107 6 : exit (EXIT_SUCCESS);
1108 : }
|