Line data Source code
1 : /* Permuted index for GNU, with keywords in their context.
2 : Copyright (C) 1990, 1991, 1993, 1998-2008 Free Software Foundation, Inc.
3 : François Pinard <pinard@iro.umontreal.ca>, 1988.
4 :
5 : This program is free software: you can redistribute it and/or modify
6 : it under the terms of the GNU General Public License as published by
7 : the Free Software Foundation, either version 3 of the License, or
8 : (at your option) any later version.
9 :
10 : This program is distributed in the hope that it will be useful,
11 : but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : GNU General Public License for more details.
14 :
15 : You should have received a copy of the GNU General Public License
16 : along with this program. If not, see <http://www.gnu.org/licenses/>.
17 :
18 : François Pinard <pinard@iro.umontreal.ca> */
19 :
20 : #include <config.h>
21 :
22 : #include <stdio.h>
23 : #include <getopt.h>
24 : #include <sys/types.h>
25 : #include "system.h"
26 : #include "argmatch.h"
27 : #include "diacrit.h"
28 : #include "error.h"
29 : #include "quote.h"
30 : #include "quotearg.h"
31 : #include "regex.h"
32 : #include "xstrtol.h"
33 :
34 : /* The official name of this program (e.g., no `g' prefix). */
35 : #define PROGRAM_NAME "ptx"
36 :
37 : /* Note to translator: Please translate "F. Pinard" to "François
38 : Pinard" if "ç" (c-with-cedilla) is available in the
39 : translation's character set and encoding. */
40 : #define AUTHORS _("F. Pinard")
41 :
42 : /* Number of possible characters in a byte. */
43 : #define CHAR_SET_SIZE 256
44 :
45 : #define ISODIGIT(C) ((C) >= '0' && (C) <= '7')
46 : #define HEXTOBIN(C) ((C) >= 'a' && (C) <= 'f' ? (C)-'a'+10 \
47 : : (C) >= 'A' && (C) <= 'F' ? (C)-'A'+10 : (C)-'0')
48 : #define OCTTOBIN(C) ((C) - '0')
49 :
50 : /* Debugging the memory allocator. */
51 :
52 : #if WITH_DMALLOC
53 : # define MALLOC_FUNC_CHECK 1
54 : # include <dmalloc.h>
55 : #endif
56 :
57 : /* Global definitions. */
58 :
59 : /* FIXME: There are many unchecked integer overflows in this file,
60 : that will cause this command to misbehave given large inputs or
61 : options. Many of the "int" values below should be "size_t" or
62 : something else like that. */
63 :
64 : /* Reallocation step when swallowing non regular files. The value is not
65 : the actual reallocation step, but its base two logarithm. */
66 : #define SWALLOW_REALLOC_LOG 12
67 :
68 : /* Imported from "regex.c". */
69 : #define Sword 1
70 :
71 : /* The name this program was run with. */
72 : char *program_name;
73 :
74 : /* Program options. */
75 :
76 : enum Format
77 : {
78 : UNKNOWN_FORMAT, /* output format still unknown */
79 : DUMB_FORMAT, /* output for a dumb terminal */
80 : ROFF_FORMAT, /* output for `troff' or `nroff' */
81 : TEX_FORMAT /* output for `TeX' or `LaTeX' */
82 : };
83 :
84 : static bool gnu_extensions = true; /* trigger all GNU extensions */
85 : static bool auto_reference = false; /* refs are `file_name:line_number:' */
86 : static bool input_reference = false; /* refs at beginning of input lines */
87 : static bool right_reference = false; /* output refs after right context */
88 : static int line_width = 72; /* output line width in characters */
89 : static int gap_size = 3; /* number of spaces between output fields */
90 : static const char *truncation_string = "/";
91 : /* string used to mark line truncations */
92 : static const char *macro_name = "xx"; /* macro name for roff or TeX output */
93 : static enum Format output_format = UNKNOWN_FORMAT;
94 : /* output format */
95 :
96 : static bool ignore_case = false; /* fold lower to upper for sorting */
97 : static const char *break_file = NULL; /* name of the `Break characters' file */
98 : static const char *only_file = NULL; /* name of the `Only words' file */
99 : static const char *ignore_file = NULL; /* name of the `Ignore words' file */
100 :
101 : /* Options that use regular expressions. */
102 : struct regex_data
103 : {
104 : /* The original regular expression, as a string. */
105 : char const *string;
106 :
107 : /* The compiled regular expression, and its fastmap. */
108 : struct re_pattern_buffer pattern;
109 : char fastmap[UCHAR_MAX + 1];
110 : };
111 :
112 : static struct regex_data context_regex; /* end of context */
113 : static struct regex_data word_regex; /* keyword */
114 :
115 : /* A BLOCK delimits a region in memory of arbitrary size, like the copy of a
116 : whole file. A WORD is something smaller; its length should fit in a
117 : short integer. A WORD_TABLE may contain several WORDs. */
118 :
119 : typedef struct
120 : {
121 : char *start; /* pointer to beginning of region */
122 : char *end; /* pointer to end + 1 of region */
123 : }
124 : BLOCK;
125 :
126 : typedef struct
127 : {
128 : char *start; /* pointer to beginning of region */
129 : short int size; /* length of the region */
130 : }
131 : WORD;
132 :
133 : typedef struct
134 : {
135 : WORD *start; /* array of WORDs */
136 : size_t alloc; /* allocated length */
137 : size_t length; /* number of used entries */
138 : }
139 : WORD_TABLE;
140 :
141 : /* Pattern description tables. */
142 :
143 : /* For each character, provide its folded equivalent. */
144 : static unsigned char folded_chars[CHAR_SET_SIZE];
145 :
146 : /* End of context pattern register indices. */
147 : static struct re_registers context_regs;
148 :
149 : /* Keyword pattern register indices. */
150 : static struct re_registers word_regs;
151 :
152 : /* A word characters fastmap is used only when no word regexp has been
153 : provided. A word is then made up of a sequence of one or more characters
154 : allowed by the fastmap. Contains !0 if character allowed in word. Not
155 : only is this faster in most cases, but it also simplifies the
156 : implementation of the Break files. */
157 : static char word_fastmap[CHAR_SET_SIZE];
158 :
159 : /* Maximum length of any word read. */
160 : static int maximum_word_length;
161 :
162 : /* Maximum width of any reference used. */
163 : static int reference_max_width;
164 :
165 : /* Ignore and Only word tables. */
166 :
167 : static WORD_TABLE ignore_table; /* table of words to ignore */
168 : static WORD_TABLE only_table; /* table of words to select */
169 :
170 : /* Source text table, and scanning macros. */
171 :
172 : static int number_input_files; /* number of text input files */
173 : static int total_line_count; /* total number of lines seen so far */
174 : static const char **input_file_name; /* array of text input file names */
175 : static int *file_line_count; /* array of `total_line_count' values at end */
176 :
177 : static BLOCK text_buffer; /* file to study */
178 :
179 : /* SKIP_NON_WHITE used only for getting or skipping the reference. */
180 :
181 : #define SKIP_NON_WHITE(cursor, limit) \
182 : while (cursor < limit && ! isspace (to_uchar (*cursor))) \
183 : cursor++
184 :
185 : #define SKIP_WHITE(cursor, limit) \
186 : while (cursor < limit && isspace (to_uchar (*cursor))) \
187 : cursor++
188 :
189 : #define SKIP_WHITE_BACKWARDS(cursor, start) \
190 : while (cursor > start && isspace (to_uchar (cursor[-1]))) \
191 : cursor--
192 :
193 : #define SKIP_SOMETHING(cursor, limit) \
194 : if (word_regex.string) \
195 : { \
196 : regoff_t count; \
197 : count = re_match (&word_regex.pattern, cursor, limit - cursor, 0, NULL); \
198 : if (count == -2) \
199 : matcher_error (); \
200 : cursor += count == -1 ? 1 : count; \
201 : } \
202 : else if (word_fastmap[to_uchar (*cursor)]) \
203 : while (cursor < limit && word_fastmap[to_uchar (*cursor)]) \
204 : cursor++; \
205 : else \
206 : cursor++
207 :
208 : /* Occurrences table.
209 :
210 : The `keyword' pointer provides the central word, which is surrounded
211 : by a left context and a right context. The `keyword' and `length'
212 : field allow full 8-bit characters keys, even including NULs. At other
213 : places in this program, the name `keyafter' refers to the keyword
214 : followed by its right context.
215 :
216 : The left context does not extend, towards the beginning of the file,
217 : further than a distance given by the `left' value. This value is
218 : relative to the keyword beginning, it is usually negative. This
219 : insures that, except for white space, we will never have to backward
220 : scan the source text, when it is time to generate the final output
221 : lines.
222 :
223 : The right context, indirectly attainable through the keyword end, does
224 : not extend, towards the end of the file, further than a distance given
225 : by the `right' value. This value is relative to the keyword
226 : beginning, it is usually positive.
227 :
228 : When automatic references are used, the `reference' value is the
229 : overall line number in all input files read so far, in this case, it
230 : is of type (int). When input references are used, the `reference'
231 : value indicates the distance between the keyword beginning and the
232 : start of the reference field, it is of type (DELTA) and usually
233 : negative. */
234 :
235 : typedef short int DELTA; /* to hold displacement within one context */
236 :
237 : typedef struct
238 : {
239 : WORD key; /* description of the keyword */
240 : DELTA left; /* distance to left context start */
241 : DELTA right; /* distance to right context end */
242 : int reference; /* reference descriptor */
243 : }
244 : OCCURS;
245 :
246 : /* The various OCCURS tables are indexed by the language. For the time
247 : being, there is no such multiple language support. */
248 :
249 : static OCCURS *occurs_table[1]; /* all words retained from the read text */
250 : static size_t occurs_alloc[1]; /* allocated size of occurs_table */
251 : static size_t number_of_occurs[1]; /* number of used slots in occurs_table */
252 :
253 :
254 : /* Communication among output routines. */
255 :
256 : /* Indicate if special output processing is requested for each character. */
257 : static char edited_flag[CHAR_SET_SIZE];
258 :
259 : static int half_line_width; /* half of line width, reference excluded */
260 : static int before_max_width; /* maximum width of before field */
261 : static int keyafter_max_width; /* maximum width of keyword-and-after field */
262 : static int truncation_string_length;/* length of string used to flag truncation */
263 :
264 : /* When context is limited by lines, wraparound may happen on final output:
265 : the `head' pointer gives access to some supplementary left context which
266 : will be seen at the end of the output line, the `tail' pointer gives
267 : access to some supplementary right context which will be seen at the
268 : beginning of the output line. */
269 :
270 : static BLOCK tail; /* tail field */
271 : static int tail_truncation; /* flag truncation after the tail field */
272 :
273 : static BLOCK before; /* before field */
274 : static int before_truncation; /* flag truncation before the before field */
275 :
276 : static BLOCK keyafter; /* keyword-and-after field */
277 : static int keyafter_truncation; /* flag truncation after the keyafter field */
278 :
279 : static BLOCK head; /* head field */
280 : static int head_truncation; /* flag truncation before the head field */
281 :
282 : static BLOCK reference; /* reference field for input reference mode */
283 :
284 : /* Miscellaneous routines. */
285 :
286 : /* Diagnose an error in the regular expression matcher. Then exit. */
287 :
288 : static void ATTRIBUTE_NORETURN
289 0 : matcher_error (void)
290 : {
291 0 : error (0, errno, _("error in regular expression matcher"));
292 0 : exit (EXIT_FAILURE);
293 : }
294 :
/*-------------------------------------------------------------------.
| Return a newly allocated copy of STRING in which \-escapes have    |
| been replaced by the characters they stand for.                    |
`-------------------------------------------------------------------*/

/* Loosely adapted from GNU sh-utils printf.c code.  */

/* Recognized escapes:
     \xh, \xhh, \xhhh   character with the given hexadecimal value; a
                        `\x' followed by no hex digit is copied verbatim
     \0, \0o ... \0ooo  character with the given octal value
     \a \b \f \n \r \t \v
                        the usual C control characters
     \c                 discard the remainder of STRING
   A backslash ending STRING is dropped.  Any other escape is copied
   through unchanged, backslash included.  No escape expands to more
   bytes than it occupies in STRING, so a buffer of strlen (STRING) + 1
   bytes always suffices.  */

static char *
copy_unescaped_string (const char *string)
{
  const char *in = string;		/* read position in STRING */
  char *result = xmalloc (strlen (string) + 1);
  char *out = result;			/* write position in RESULT */

  while (*in != '\0')
    {
      int digits;			/* digits consumed by a numeric escape */
      int code;				/* value of a numeric escape */
      char byte;			/* character produced by a simple escape */

      /* Ordinary characters are copied as is.  */
      if (*in != '\\')
	{
	  *out++ = *in++;
	  continue;
	}

      in++;				/* step over the backslash */

      /* Lone backslash at end of string: ignore it.  */
      if (*in == '\0')
	continue;

      /* \xhhh escape, 3 chars maximum.  */
      if (*in == 'x')
	{
	  code = 0;
	  digits = 0;
	  in++;
	  while (digits < 3 && isxdigit (to_uchar (*in)))
	    {
	      code = code * 16 + HEXTOBIN (*in);
	      digits++;
	      in++;
	    }

	  if (digits != 0)
	    *out++ = code;
	  else
	    {
	      *out++ = '\\';
	      *out++ = 'x';
	    }
	  continue;
	}

      /* \0ooo escape, 3 chars maximum.  */
      if (*in == '0')
	{
	  code = 0;
	  digits = 0;
	  in++;
	  while (digits < 3 && ISODIGIT (*in))
	    {
	      code = code * 8 + OCTTOBIN (*in);
	      digits++;
	      in++;
	    }
	  *out++ = code;
	  continue;
	}

      /* \c cancels the rest of the output.  */
      if (*in == 'c')
	{
	  in += strlen (in);
	  continue;
	}

      /* Single character escapes.  An unknown escape keeps its
	 backslash and is followed by the escaped character itself.  */
      switch (*in)
	{
	case 'a':			/* alert */
#if __STDC__
	  byte = '\a';
#else
	  byte = 7;
#endif
	  break;

	case 'b':			/* backspace */
	  byte = '\b';
	  break;

	case 'f':			/* form feed */
	  byte = '\f';
	  break;

	case 'n':			/* new line */
	  byte = '\n';
	  break;

	case 'r':			/* carriage return */
	  byte = '\r';
	  break;

	case 't':			/* horizontal tab */
	  byte = '\t';
	  break;

	case 'v':			/* vertical tab */
#if __STDC__
	  byte = '\v';
#else
	  byte = 11;
#endif
	  break;

	default:
	  *out++ = '\\';
	  byte = *in;
	  break;
	}

      *out++ = byte;
      in++;
    }

  *out = '\0';
  return result;
}
408 :
409 : /*--------------------------------------------------------------------------.
410 : | Compile the regex represented by REGEX, diagnose and abort if any error. |
411 : `--------------------------------------------------------------------------*/
412 :
413 : static void
414 73 : compile_regex (struct regex_data *regex)
415 : {
416 73 : struct re_pattern_buffer *pattern = ®ex->pattern;
417 73 : char const *string = regex->string;
418 : char const *message;
419 :
420 73 : pattern->buffer = NULL;
421 73 : pattern->allocated = 0;
422 73 : pattern->fastmap = regex->fastmap;
423 73 : pattern->translate = ignore_case ? folded_chars : NULL;
424 :
425 73 : message = re_compile_pattern (string, strlen (string), pattern);
426 73 : if (message)
427 0 : error (EXIT_FAILURE, 0, _("%s (for regexp %s)"), message, quote (string));
428 :
429 : /* The fastmap should be compiled before `re_match'. The following
430 : call is not mandatory, because `re_search' is always called sooner,
431 : and it compiles the fastmap if this has not been done yet. */
432 :
433 73 : re_compile_fastmap (pattern);
434 73 : }
435 :
436 : /*------------------------------------------------------------------------.
437 : | This will initialize various tables for pattern match and compiles some |
438 : | regexps. |
439 : `------------------------------------------------------------------------*/
440 :
441 : static void
442 73 : initialize_regex (void)
443 : {
444 : int character; /* character value */
445 :
446 : /* Initialize the case folding table. */
447 :
448 73 : if (ignore_case)
449 0 : for (character = 0; character < CHAR_SET_SIZE; character++)
450 0 : folded_chars[character] = toupper (character);
451 :
452 : /* Unless the user already provided a description of the end of line or
453 : end of sentence sequence, select an end of line sequence to compile.
454 : If the user provided an empty definition, thus disabling end of line
455 : or sentence feature, make it NULL to speed up tests. If GNU
456 : extensions are enabled, use end of sentence like in GNU emacs. If
457 : disabled, use end of lines. */
458 :
459 73 : if (context_regex.string)
460 : {
461 0 : if (!*context_regex.string)
462 0 : context_regex.string = NULL;
463 : }
464 73 : else if (gnu_extensions & !input_reference)
465 70 : context_regex.string = "[.?!][]\"')}]*\\($\\|\t\\| \\)[ \t\n]*";
466 : else
467 3 : context_regex.string = "\n";
468 :
469 73 : if (context_regex.string)
470 73 : compile_regex (&context_regex);
471 :
472 : /* If the user has already provided a non-empty regexp to describe
473 : words, compile it. Else, unless this has already been done through
474 : a user provided Break character file, construct a fastmap of
475 : characters that may appear in a word. If GNU extensions enabled,
476 : include only letters of the underlying character set. If disabled,
477 : include almost everything, even punctuations; stop only on white
478 : space. */
479 :
480 73 : if (word_regex.string)
481 0 : compile_regex (&word_regex);
482 73 : else if (!break_file)
483 : {
484 71 : if (gnu_extensions)
485 : {
486 :
487 : /* Simulate \w+. */
488 :
489 17476 : for (character = 0; character < CHAR_SET_SIZE; character++)
490 17408 : word_fastmap[character] = !! isalpha (character);
491 : }
492 : else
493 : {
494 :
495 : /* Simulate [^ \t\n]+. */
496 :
497 3 : memset (word_fastmap, 1, CHAR_SET_SIZE);
498 3 : word_fastmap[' '] = 0;
499 3 : word_fastmap['\t'] = 0;
500 3 : word_fastmap['\n'] = 0;
501 : }
502 : }
503 73 : }
504 :
505 : /*------------------------------------------------------------------------.
506 : | This routine will attempt to swallow a whole file name FILE_NAME into a |
507 : | contiguous region of memory and return a description of it into BLOCK. |
508 : | Standard input is assumed whenever FILE_NAME is NULL, empty or "-". |
509 : | |
510 : | Previously, in some cases, white space compression was attempted while |
511 : | inputting text. This was defeating some regexps like default end of |
512 : | sentence, which checks for two consecutive spaces. If white space |
513 : | compression is ever reinstated, it should be in output routines. |
514 : `------------------------------------------------------------------------*/
515 :
516 : static void
517 86 : swallow_file_in_memory (const char *file_name, BLOCK *block)
518 : {
519 : int file_handle; /* file descriptor number */
520 : struct stat stat_block; /* stat block for file */
521 : size_t allocated_length; /* allocated length of memory buffer */
522 : size_t used_length; /* used length in memory buffer */
523 : int read_length; /* number of character gotten on last read */
524 :
525 : /* As special cases, a file name which is NULL or "-" indicates standard
526 : input, which is already opened. In all other cases, open the file from
527 : its name. */
528 86 : bool using_stdin = !file_name || !*file_name || STREQ (file_name, "-");
529 86 : if (using_stdin)
530 82 : file_handle = STDIN_FILENO;
531 : else
532 4 : if ((file_handle = open (file_name, O_RDONLY)) < 0)
533 0 : error (EXIT_FAILURE, errno, "%s", file_name);
534 :
535 : /* If the file is a plain, regular file, allocate the memory buffer all at
536 : once and swallow the file in one blow. In other cases, read the file
537 : repeatedly in smaller chunks until we have it all, reallocating memory
538 : once in a while, as we go. */
539 :
540 86 : if (fstat (file_handle, &stat_block) < 0)
541 0 : error (EXIT_FAILURE, errno, "%s", file_name);
542 :
543 86 : if (S_ISREG (stat_block.st_mode))
544 : {
545 : size_t in_memory_size;
546 :
547 61 : block->start = xmalloc ((size_t) stat_block.st_size);
548 :
549 183 : if ((in_memory_size = read (file_handle,
550 122 : block->start, (size_t) stat_block.st_size))
551 61 : != stat_block.st_size)
552 : {
553 : #if MSDOS
554 : /* On MSDOS, in memory size may be smaller than the file
555 : size, because of end of line conversions. But it can
556 : never be smaller than half the file size, because the
557 : minimum is when all lines are empty and terminated by
558 : CR+LF. */
559 : if (in_memory_size != (size_t)-1
560 : && in_memory_size >= stat_block.st_size / 2)
561 : block->start = xrealloc (block->start, in_memory_size);
562 : else
563 : #endif /* not MSDOS */
564 :
565 11 : error (EXIT_FAILURE, errno, "%s", file_name);
566 : }
567 50 : block->end = block->start + in_memory_size;
568 : }
569 : else
570 : {
571 25 : block->start = xmalloc ((size_t) 1 << SWALLOW_REALLOC_LOG);
572 25 : used_length = 0;
573 25 : allocated_length = (1 << SWALLOW_REALLOC_LOG);
574 :
575 115 : while (read_length = read (file_handle,
576 45 : block->start + used_length,
577 : allocated_length - used_length),
578 : read_length > 0)
579 : {
580 20 : used_length += read_length;
581 20 : if (used_length == allocated_length)
582 : {
583 0 : allocated_length += (1 << SWALLOW_REALLOC_LOG);
584 : block->start
585 0 : = xrealloc (block->start, allocated_length);
586 : }
587 : }
588 :
589 25 : if (read_length < 0)
590 3 : error (EXIT_FAILURE, errno, "%s", file_name);
591 :
592 22 : block->end = block->start + used_length;
593 : }
594 :
595 : /* Close the file, but only if it was not the standard input. */
596 :
597 72 : if (! using_stdin && close (file_handle) != 0)
598 0 : error (EXIT_FAILURE, errno, "%s", file_name);
599 72 : }
600 :
601 : /* Sort and search routines. */
602 :
603 : /*--------------------------------------------------------------------------.
604 : | Compare two words, FIRST and SECOND, and return 0 if they are identical. |
605 : | Return less than 0 if the first word goes before the second; return |
606 : | greater than 0 if the first word goes after the second. |
607 : | |
608 : | If a word is indeed a prefix of the other, the shorter should go first. |
609 : `--------------------------------------------------------------------------*/
610 :
611 : static int
612 15 : compare_words (const void *void_first, const void *void_second)
613 : {
614 : #define first ((const WORD *) void_first)
615 : #define second ((const WORD *) void_second)
616 : int length; /* minimum of two lengths */
617 : int counter; /* cursor in words */
618 : int value; /* value of comparison */
619 :
620 15 : length = first->size < second->size ? first->size : second->size;
621 :
622 15 : if (ignore_case)
623 : {
624 0 : for (counter = 0; counter < length; counter++)
625 : {
626 0 : value = (folded_chars [to_uchar (first->start[counter])]
627 0 : - folded_chars [to_uchar (second->start[counter])]);
628 0 : if (value != 0)
629 0 : return value;
630 : }
631 : }
632 : else
633 : {
634 26 : for (counter = 0; counter < length; counter++)
635 : {
636 32 : value = (to_uchar (first->start[counter])
637 16 : - to_uchar (second->start[counter]));
638 16 : if (value != 0)
639 5 : return value;
640 : }
641 : }
642 :
643 10 : return first->size - second->size;
644 : #undef first
645 : #undef second
646 : }
647 :
648 : /*-----------------------------------------------------------------------.
649 : | Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
650 : | go first. In case of a tie, preserve the original order through a |
651 : | pointer comparison. |
652 : `-----------------------------------------------------------------------*/
653 :
654 : static int
655 15 : compare_occurs (const void *void_first, const void *void_second)
656 : {
657 : #define first ((const OCCURS *) void_first)
658 : #define second ((const OCCURS *) void_second)
659 : int value;
660 :
661 15 : value = compare_words (&first->key, &second->key);
662 15 : return value == 0 ? first->key.start - second->key.start : value;
663 : #undef first
664 : #undef second
665 : }
666 :
667 : /*------------------------------------------------------------.
668 : | Return !0 if WORD appears in TABLE. Uses a binary search. |
669 : `------------------------------------------------------------*/
670 :
671 : static int
672 0 : search_table (WORD *word, WORD_TABLE *table)
673 : {
674 : int lowest; /* current lowest possible index */
675 : int highest; /* current highest possible index */
676 : int middle; /* current middle index */
677 : int value; /* value from last comparison */
678 :
679 0 : lowest = 0;
680 0 : highest = table->length - 1;
681 0 : while (lowest <= highest)
682 : {
683 0 : middle = (lowest + highest) / 2;
684 0 : value = compare_words (word, table->start + middle);
685 0 : if (value < 0)
686 0 : highest = middle - 1;
687 0 : else if (value > 0)
688 0 : lowest = middle + 1;
689 : else
690 0 : return 1;
691 : }
692 0 : return 0;
693 : }
694 :
695 : /*---------------------------------------------------------------------.
696 : | Sort the whole occurs table in memory. Presumably, `qsort' does not |
697 : | take intermediate copies or table elements, so the sort will be |
698 : | stabilized throughout the comparison routine. |
699 : `---------------------------------------------------------------------*/
700 :
701 : static void
702 59 : sort_found_occurs (void)
703 : {
704 :
705 : /* Only one language for the time being. */
706 :
707 59 : qsort (occurs_table[0], number_of_occurs[0], sizeof **occurs_table,
708 : compare_occurs);
709 59 : }
710 :
711 : /* Parameter files reading routines. */
712 :
713 : /*----------------------------------------------------------------------.
714 : | Read a file named FILE_NAME, containing a set of break characters. |
715 : | Build a content to the array word_fastmap in which all characters are |
716 : | allowed except those found in the file. Characters may be repeated. |
717 : `----------------------------------------------------------------------*/
718 :
719 : static void
720 2 : digest_break_file (const char *file_name)
721 : {
722 : BLOCK file_contents; /* to receive a copy of the file */
723 : char *cursor; /* cursor in file copy */
724 :
725 2 : swallow_file_in_memory (file_name, &file_contents);
726 :
727 : /* Make the fastmap and record the file contents in it. */
728 :
729 2 : memset (word_fastmap, 1, CHAR_SET_SIZE);
730 18 : for (cursor = file_contents.start; cursor < file_contents.end; cursor++)
731 16 : word_fastmap[to_uchar (*cursor)] = 0;
732 :
733 2 : if (!gnu_extensions)
734 : {
735 :
736 : /* If GNU extensions are enabled, the only way to avoid newline as
737 : a break character is to write all the break characters in the
738 : file with no newline at all, not even at the end of the file.
739 : If disabled, spaces, tabs and newlines are always considered as
740 : break characters even if not included in the break file. */
741 :
742 0 : word_fastmap[' '] = 0;
743 0 : word_fastmap['\t'] = 0;
744 0 : word_fastmap['\n'] = 0;
745 : }
746 :
747 : /* Return the space of the file, which is no more required. */
748 :
749 2 : free (file_contents.start);
750 2 : }
751 :
752 : /*-----------------------------------------------------------------------.
753 : | Read a file named FILE_NAME, containing one word per line, then |
754 : | construct in TABLE a table of WORD descriptors for them. The routine |
755 : | swallows the whole file in memory; this is at the expense of space |
756 : | needed for newlines, which are useless; however, the reading is fast. |
757 : `-----------------------------------------------------------------------*/
758 :
759 : static void
760 0 : digest_word_file (const char *file_name, WORD_TABLE *table)
761 : {
762 : BLOCK file_contents; /* to receive a copy of the file */
763 : char *cursor; /* cursor in file copy */
764 : char *word_start; /* start of the current word */
765 :
766 0 : swallow_file_in_memory (file_name, &file_contents);
767 :
768 0 : table->start = NULL;
769 0 : table->alloc = 0;
770 0 : table->length = 0;
771 :
772 : /* Read the whole file. */
773 :
774 0 : cursor = file_contents.start;
775 0 : while (cursor < file_contents.end)
776 : {
777 :
778 : /* Read one line, and save the word in contains. */
779 :
780 0 : word_start = cursor;
781 0 : while (cursor < file_contents.end && *cursor != '\n')
782 0 : cursor++;
783 :
784 : /* Record the word in table if it is not empty. */
785 :
786 0 : if (cursor > word_start)
787 : {
788 0 : if (table->length == table->alloc)
789 : {
790 0 : if ((SIZE_MAX / sizeof *table->start - 1) / 2 < table->alloc)
791 0 : xalloc_die ();
792 0 : table->alloc = table->alloc * 2 + 1;
793 0 : table->start = xrealloc (table->start,
794 0 : table->alloc * sizeof *table->start);
795 : }
796 :
797 0 : table->start[table->length].start = word_start;
798 0 : table->start[table->length].size = cursor - word_start;
799 0 : table->length++;
800 : }
801 :
802 : /* This test allows for an incomplete line at end of file. */
803 :
804 0 : if (cursor < file_contents.end)
805 0 : cursor++;
806 : }
807 :
808 : /* Finally, sort all the words read. */
809 :
810 0 : qsort (table->start, table->length, sizeof table->start[0], compare_words);
811 0 : }
812 :
813 : /* Keyword recognition and selection. */
814 :
815 : /*----------------------------------------------------------------------.
816 : | For each keyword in the source text, constructs an OCCURS structure. |
817 : `----------------------------------------------------------------------*/
818 :
819 : static void
820 70 : find_occurs_in_text (void)
821 : {
822 : char *cursor; /* for scanning the source text */
823 : char *scan; /* for scanning the source text also */
824 : char *line_start; /* start of the current input line */
825 : char *line_scan; /* newlines scanned until this point */
826 : int reference_length; /* length of reference in input mode */
827 : WORD possible_key; /* possible key, to ease searches */
828 : OCCURS *occurs_cursor; /* current OCCURS under construction */
829 :
830 : char *context_start; /* start of left context */
831 : char *context_end; /* end of right context */
832 : char *word_start; /* start of word */
833 : char *word_end; /* end of word */
834 : char *next_context_start; /* next start of left context */
835 :
836 : /* reference_length is always used within `if (input_reference)'.
837 : However, GNU C diagnoses that it may be used uninitialized. The
838 : following assignment is merely to shut it up. */
839 :
840 70 : reference_length = 0;
841 :
842 : /* Tracking where lines start is helpful for reference processing. In
843 : auto reference mode, this allows counting lines. In input reference
844 : mode, this permits finding the beginning of the references.
845 :
846 : The first line begins with the file, skip immediately this very first
847 : reference in input reference mode, to help further rejection any word
848 : found inside it. Also, unconditionally assigning these variable has
849 : the happy effect of shutting up lint. */
850 :
851 70 : line_start = text_buffer.start;
852 70 : line_scan = line_start;
853 70 : if (input_reference)
854 : {
855 0 : SKIP_NON_WHITE (line_scan, text_buffer.end);
856 0 : reference_length = line_scan - line_start;
857 0 : SKIP_WHITE (line_scan, text_buffer.end);
858 : }
859 :
860 : /* Process the whole buffer, one line or one sentence at a time. */
861 :
862 213 : for (cursor = text_buffer.start;
863 143 : cursor < text_buffer.end;
864 73 : cursor = next_context_start)
865 : {
866 :
867 : /* `context_start' gets initialized before the processing of each
868 : line, or once for the whole buffer if no end of line or sentence
869 : sequence separator. */
870 :
871 73 : context_start = cursor;
872 :
873 : /* If a end of line or end of sentence sequence is defined and
874 : non-empty, `next_context_start' will be recomputed to be the end of
875 : each line or sentence, before each one is processed. If no such
876 : sequence, then `next_context_start' is set at the end of the whole
877 : buffer, which is then considered to be a single line or sentence.
878 : This test also accounts for the case of an incomplete line or
879 : sentence at the end of the buffer. */
880 :
881 73 : next_context_start = text_buffer.end;
882 73 : if (context_regex.string)
883 146 : switch (re_search (&context_regex.pattern, cursor,
884 73 : text_buffer.end - cursor,
885 73 : 0, text_buffer.end - cursor, &context_regs))
886 : {
887 0 : case -2:
888 0 : matcher_error ();
889 :
890 67 : case -1:
891 67 : break;
892 :
893 6 : default:
894 6 : next_context_start = cursor + context_regs.end[0];
895 6 : break;
896 : }
897 :
898 : /* Include the separator into the right context, but not any suffix
899 : white space in this separator; this insures it will be seen in
900 : output and will not take more space than necessary. */
901 :
902 73 : context_end = next_context_start;
903 73 : SKIP_WHITE_BACKWARDS (context_end, context_start);
904 :
905 : /* Read and process a single input line or sentence, one word at a
906 : time. */
907 :
908 : while (1)
909 : {
910 149 : if (word_regex.string)
911 :
912 : /* If a word regexp has been compiled, use it to skip at the
913 : beginning of the next word. If there is no such word, exit
914 : the loop. */
915 :
916 : {
917 0 : regoff_t r = re_search (&word_regex.pattern, cursor,
918 0 : context_end - cursor,
919 : 0, context_end - cursor, &word_regs);
920 0 : if (r == -2)
921 0 : matcher_error ();
922 0 : if (r == -1)
923 0 : break;
924 0 : word_start = cursor + word_regs.start[0];
925 0 : word_end = cursor + word_regs.end[0];
926 : }
927 : else
928 :
929 : /* Avoid re_search and use the fastmap to skip to the
930 : beginning of the next word. If there is no more word in
931 : the buffer, exit the loop. */
932 :
933 : {
934 111 : scan = cursor;
935 688 : while (scan < context_end
936 504 : && !word_fastmap[to_uchar (*scan)])
937 466 : scan++;
938 :
939 111 : if (scan == context_end)
940 73 : break;
941 :
942 38 : word_start = scan;
943 :
944 139 : while (scan < context_end
945 92 : && word_fastmap[to_uchar (*scan)])
946 63 : scan++;
947 :
948 38 : word_end = scan;
949 : }
950 :
951 : /* Skip right to the beginning of the found word. */
952 :
953 38 : cursor = word_start;
954 :
955 : /* Skip any zero length word. Just advance a single position,
956 : then go fetch the next word. */
957 :
958 38 : if (word_end == word_start)
959 : {
960 0 : cursor++;
961 0 : continue;
962 : }
963 :
964 : /* This is a genuine, non empty word, so save it as a possible
965 : key. Then skip over it. Also, maintain the maximum length of
966 : all words read so far. It is mandatory to take the maximum
967 : length of all words in the file, without considering if they
968 : are actually kept or rejected, because backward jumps at output
969 : generation time may fall in *any* word. */
970 :
971 38 : possible_key.start = cursor;
972 38 : possible_key.size = word_end - word_start;
973 38 : cursor += possible_key.size;
974 :
975 38 : if (possible_key.size > maximum_word_length)
976 27 : maximum_word_length = possible_key.size;
977 :
978 : /* In input reference mode, update `line_start' from its previous
979 : value. Count the lines just in case auto reference mode is
980 : also selected. If it happens that the word just matched is
981 : indeed part of a reference; just ignore it. */
982 :
983 38 : if (input_reference)
984 : {
985 0 : while (line_scan < possible_key.start)
986 0 : if (*line_scan == '\n')
987 : {
988 0 : total_line_count++;
989 0 : line_scan++;
990 0 : line_start = line_scan;
991 0 : SKIP_NON_WHITE (line_scan, text_buffer.end);
992 0 : reference_length = line_scan - line_start;
993 : }
994 : else
995 0 : line_scan++;
996 0 : if (line_scan > possible_key.start)
997 0 : continue;
998 : }
999 :
1000 : /* Ignore the word if an `Ignore words' table exists and if it is
1001 : part of it. Also ignore the word if an `Only words' table and
1002 : if it is *not* part of it.
1003 :
1004 : It is allowed that both tables be used at once, even if this
1005 : may look strange for now. Just ignore a word that would appear
1006 : in both. If regexps are eventually implemented for these
1007 : tables, the Ignore table could then reject words that would
1008 : have been previously accepted by the Only table. */
1009 :
1010 38 : if (ignore_file && search_table (&possible_key, &ignore_table))
1011 0 : continue;
1012 38 : if (only_file && !search_table (&possible_key, &only_table))
1013 0 : continue;
1014 :
1015 : /* A non-empty word has been found. First of all, insure
1016 : proper allocation of the next OCCURS, and make a pointer to
1017 : where it will be constructed. */
1018 :
1019 38 : if (number_of_occurs[0] == occurs_alloc[0])
1020 : {
1021 34 : if ((SIZE_MAX / sizeof *occurs_table[0] - 1) / 2
1022 34 : < occurs_alloc[0])
1023 0 : xalloc_die ();
1024 34 : occurs_alloc[0] = occurs_alloc[0] * 2 + 1;
1025 34 : occurs_table[0] = xrealloc (occurs_table[0],
1026 34 : occurs_alloc[0] * sizeof *occurs_table[0]);
1027 : }
1028 :
1029 38 : occurs_cursor = occurs_table[0] + number_of_occurs[0];
1030 :
1031 : /* Define the reference field, if any. */
1032 :
1033 38 : if (auto_reference)
1034 : {
1035 :
1036 : /* While auto referencing, update `line_start' from its
1037 : previous value, counting lines as we go. If input
1038 : referencing at the same time, `line_start' has been
1039 : advanced earlier, and the following loop is never really
1040 : executed. */
1041 :
1042 4 : while (line_scan < possible_key.start)
1043 0 : if (*line_scan == '\n')
1044 : {
1045 0 : total_line_count++;
1046 0 : line_scan++;
1047 0 : line_start = line_scan;
1048 0 : SKIP_NON_WHITE (line_scan, text_buffer.end);
1049 : }
1050 : else
1051 0 : line_scan++;
1052 :
1053 2 : occurs_cursor->reference = total_line_count;
1054 : }
1055 36 : else if (input_reference)
1056 : {
1057 :
1058 : /* If only input referencing, `line_start' has been computed
1059 : earlier to detect the case the word matched would be part
1060 : of the reference. The reference position is simply the
1061 : value of `line_start'. */
1062 :
1063 : occurs_cursor->reference
1064 0 : = (DELTA) (line_start - possible_key.start);
1065 0 : if (reference_length > reference_max_width)
1066 0 : reference_max_width = reference_length;
1067 : }
1068 :
1069 : /* Exclude the reference from the context in simple cases. */
1070 :
1071 38 : if (input_reference && line_start == context_start)
1072 : {
1073 0 : SKIP_NON_WHITE (context_start, context_end);
1074 0 : SKIP_WHITE (context_start, context_end);
1075 : }
1076 :
1077 : /* Completes the OCCURS structure. */
1078 :
1079 38 : occurs_cursor->key = possible_key;
1080 38 : occurs_cursor->left = context_start - possible_key.start;
1081 38 : occurs_cursor->right = context_end - possible_key.start;
1082 :
1083 38 : number_of_occurs[0]++;
1084 : }
1085 : }
1086 70 : }
1087 :
1088 : /* Formatting and actual output - service routines. */
1089 :
/*-------------------------------------------------------.
| Write NUMBER blanks to stdout (none when NUMBER <= 0). |
`-------------------------------------------------------*/

static void
print_spaces (int number)
{
  int remaining = number;       /* blanks still to be written */

  while (remaining > 0)
    {
      putchar (' ');
      remaining--;
    }
}
1102 :
1103 : /*-------------------------------------.
1104 : | Prints the field provided by FIELD. |
1105 : `-------------------------------------*/
1106 :
1107 : static void
1108 119 : print_field (BLOCK field)
1109 : {
1110 : char *cursor; /* Cursor in field to print */
1111 : int base; /* Base character, without diacritic */
1112 : int diacritic; /* Diacritic code for the character */
1113 :
1114 : /* Whitespace is not really compressed. Instead, each white space
1115 : character (tab, vt, ht etc.) is printed as one single space. */
1116 :
1117 406 : for (cursor = field.start; cursor < field.end; cursor++)
1118 : {
1119 287 : unsigned char character = *cursor;
1120 287 : if (edited_flag[character])
1121 : {
1122 :
1123 : /* First check if this is a diacriticized character.
1124 :
1125 : This works only for TeX. I do not know how diacriticized
1126 : letters work with `roff'. Please someone explain it to me! */
1127 :
1128 2 : diacritic = todiac (character);
1129 2 : if (diacritic != 0 && output_format == TEX_FORMAT)
1130 : {
1131 0 : base = tobase (character);
1132 0 : switch (diacritic)
1133 : {
1134 :
1135 0 : case 1: /* Latin diphthongs */
1136 0 : switch (base)
1137 : {
1138 0 : case 'o':
1139 0 : fputs ("\\oe{}", stdout);
1140 0 : break;
1141 :
1142 0 : case 'O':
1143 0 : fputs ("\\OE{}", stdout);
1144 0 : break;
1145 :
1146 0 : case 'a':
1147 0 : fputs ("\\ae{}", stdout);
1148 0 : break;
1149 :
1150 0 : case 'A':
1151 0 : fputs ("\\AE{}", stdout);
1152 0 : break;
1153 :
1154 0 : default:
1155 0 : putchar (' ');
1156 : }
1157 0 : break;
1158 :
1159 0 : case 2: /* Acute accent */
1160 0 : printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
1161 0 : break;
1162 :
1163 0 : case 3: /* Grave accent */
1164 0 : printf ("\\`%s%c", (base == 'i' ? "\\" : ""), base);
1165 0 : break;
1166 :
1167 0 : case 4: /* Circumflex accent */
1168 0 : printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base);
1169 0 : break;
1170 :
1171 0 : case 5: /* Diaeresis */
1172 0 : printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base);
1173 0 : break;
1174 :
1175 0 : case 6: /* Tilde accent */
1176 0 : printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base);
1177 0 : break;
1178 :
1179 0 : case 7: /* Cedilla */
1180 0 : printf ("\\c{%c}", base);
1181 0 : break;
1182 :
1183 0 : case 8: /* Small circle beneath */
1184 0 : switch (base)
1185 : {
1186 0 : case 'a':
1187 0 : fputs ("\\aa{}", stdout);
1188 0 : break;
1189 :
1190 0 : case 'A':
1191 0 : fputs ("\\AA{}", stdout);
1192 0 : break;
1193 :
1194 0 : default:
1195 0 : putchar (' ');
1196 : }
1197 0 : break;
1198 :
1199 0 : case 9: /* Strike through */
1200 0 : switch (base)
1201 : {
1202 0 : case 'o':
1203 0 : fputs ("\\o{}", stdout);
1204 0 : break;
1205 :
1206 0 : case 'O':
1207 0 : fputs ("\\O{}", stdout);
1208 0 : break;
1209 :
1210 0 : default:
1211 0 : putchar (' ');
1212 : }
1213 0 : break;
1214 : }
1215 0 : }
1216 : else
1217 :
1218 : /* This is not a diacritic character, so handle cases which are
1219 : really specific to `roff' or TeX. All white space processing
1220 : is done as the default case of this switch. */
1221 :
1222 2 : switch (character)
1223 : {
1224 0 : case '"':
1225 : /* In roff output format, double any quote. */
1226 0 : putchar ('"');
1227 0 : putchar ('"');
1228 0 : break;
1229 :
1230 0 : case '$':
1231 : case '%':
1232 : case '&':
1233 : case '#':
1234 : case '_':
1235 : /* In TeX output format, precede these with a backslash. */
1236 0 : putchar ('\\');
1237 0 : putchar (character);
1238 0 : break;
1239 :
1240 0 : case '{':
1241 : case '}':
1242 : /* In TeX output format, precede these with a backslash and
1243 : force mathematical mode. */
1244 0 : printf ("$\\%c$", character);
1245 0 : break;
1246 :
1247 0 : case '\\':
1248 : /* In TeX output mode, request production of a backslash. */
1249 0 : fputs ("\\backslash{}", stdout);
1250 0 : break;
1251 :
1252 2 : default:
1253 : /* Any other flagged character produces a single space. */
1254 2 : putchar (' ');
1255 : }
1256 : }
1257 : else
1258 285 : putchar (*cursor);
1259 : }
1260 119 : }
1261 :
1262 : /* Formatting and actual output - planning routines. */
1263 :
1264 : /*--------------------------------------------------------------------.
1265 : | From information collected from command line options and input file |
1266 : | readings, compute and fix some output parameter values. |
1267 : `--------------------------------------------------------------------*/
1268 :
1269 : static void
1270 59 : fix_output_parameters (void)
1271 : {
1272 : int file_index; /* index in text input file arrays */
1273 : int line_ordinal; /* line ordinal value for reference */
1274 : char ordinal_string[12]; /* edited line ordinal for reference */
1275 : int reference_width; /* width for the whole reference */
1276 : int character; /* character ordinal */
1277 : const char *cursor; /* cursor in some constant strings */
1278 :
1279 : /* In auto reference mode, the maximum width of this field is
1280 : precomputed and subtracted from the overall line width. Add one for
1281 : the column which separate the file name from the line number. */
1282 :
1283 59 : if (auto_reference)
1284 : {
1285 5 : reference_max_width = 0;
1286 10 : for (file_index = 0; file_index < number_input_files; file_index++)
1287 : {
1288 5 : line_ordinal = file_line_count[file_index] + 1;
1289 5 : if (file_index > 0)
1290 0 : line_ordinal -= file_line_count[file_index - 1];
1291 5 : sprintf (ordinal_string, "%d", line_ordinal);
1292 5 : reference_width = strlen (ordinal_string);
1293 5 : if (input_file_name[file_index])
1294 0 : reference_width += strlen (input_file_name[file_index]);
1295 5 : if (reference_width > reference_max_width)
1296 5 : reference_max_width = reference_width;
1297 : }
1298 5 : reference_max_width++;
1299 5 : reference.start = xmalloc ((size_t) reference_max_width + 1);
1300 : }
1301 :
1302 : /* If the reference appears to the left of the output line, reserve some
1303 : space for it right away, including one gap size. */
1304 :
1305 59 : if ((auto_reference | input_reference) & !right_reference)
1306 5 : line_width -= reference_max_width + gap_size;
1307 :
1308 : /* The output lines, minimally, will contain from left to right a left
1309 : context, a gap, and a keyword followed by the right context with no
1310 : special intervening gap. Half of the line width is dedicated to the
1311 : left context and the gap, the other half is dedicated to the keyword
1312 : and the right context; these values are computed once and for all here.
1313 : There also are tail and head wrap around fields, used when the keyword
1314 : is near the beginning or the end of the line, or when some long word
1315 : cannot fit in, but leave place from wrapped around shorter words. The
1316 : maximum width of these fields are recomputed separately for each line,
1317 : on a case by case basis. It is worth noting that it cannot happen that
1318 : both the tail and head fields are used at once. */
1319 :
1320 59 : half_line_width = line_width / 2;
1321 59 : before_max_width = half_line_width - gap_size;
1322 59 : keyafter_max_width = half_line_width;
1323 :
1324 : /* If truncation_string is the empty string, make it NULL to speed up
1325 : tests. In this case, truncation_string_length will never get used, so
1326 : there is no need to set it. */
1327 :
1328 59 : if (truncation_string && *truncation_string)
1329 53 : truncation_string_length = strlen (truncation_string);
1330 : else
1331 6 : truncation_string = NULL;
1332 :
1333 59 : if (gnu_extensions)
1334 : {
1335 :
1336 : /* When flagging truncation at the left of the keyword, the
1337 : truncation mark goes at the beginning of the before field,
1338 : unless there is a head field, in which case the mark goes at the
1339 : left of the head field. When flagging truncation at the right
1340 : of the keyword, the mark goes at the end of the keyafter field,
1341 : unless there is a tail field, in which case the mark goes at the
1342 : end of the tail field. Only eight combination cases could arise
1343 : for truncation marks:
1344 :
1345 : . None.
1346 : . One beginning the before field.
1347 : . One beginning the head field.
1348 : . One ending the keyafter field.
1349 : . One ending the tail field.
1350 : . One beginning the before field, another ending the keyafter field.
1351 : . One ending the tail field, another beginning the before field.
1352 : . One ending the keyafter field, another beginning the head field.
1353 :
1354 : So, there is at most two truncation marks, which could appear both
1355 : on the left side of the center of the output line, both on the
1356 : right side, or one on either side. */
1357 :
1358 56 : before_max_width -= 2 * truncation_string_length;
1359 56 : keyafter_max_width -= 2 * truncation_string_length;
1360 : }
1361 : else
1362 : {
1363 :
1364 : /* I never figured out exactly how UNIX' ptx plans the output width
1365 : of its various fields. If GNU extensions are disabled, do not
1366 : try computing the field widths correctly; instead, use the
1367 : following formula, which does not completely imitate UNIX' ptx,
1368 : but almost. */
1369 :
1370 3 : keyafter_max_width -= 2 * truncation_string_length + 1;
1371 : }
1372 :
1373 : /* Compute which characters need special output processing. Initialize
1374 : by flagging any white space character. Some systems do not consider
1375 : form feed as a space character, but we do. */
1376 :
1377 15163 : for (character = 0; character < CHAR_SET_SIZE; character++)
1378 15104 : edited_flag[character] = !! isspace (character);
1379 59 : edited_flag['\f'] = 1;
1380 :
1381 : /* Complete the special character flagging according to selected output
1382 : format. */
1383 :
1384 59 : switch (output_format)
1385 : {
1386 56 : case UNKNOWN_FORMAT:
1387 : /* Should never happen. */
1388 :
1389 : case DUMB_FORMAT:
1390 56 : break;
1391 :
1392 3 : case ROFF_FORMAT:
1393 :
1394 : /* `Quote' characters should be doubled. */
1395 :
1396 3 : edited_flag['"'] = 1;
1397 3 : break;
1398 :
1399 0 : case TEX_FORMAT:
1400 :
1401 : /* Various characters need special processing. */
1402 :
1403 0 : for (cursor = "$%&#_{}\\"; *cursor; cursor++)
1404 0 : edited_flag[to_uchar (*cursor)] = 1;
1405 :
1406 : /* Any character with 8th bit set will print to a single space, unless
1407 : it is diacriticized. */
1408 :
1409 0 : for (character = 0200; character < CHAR_SET_SIZE; character++)
1410 0 : edited_flag[character] = todiac (character) != 0;
1411 0 : break;
1412 : }
1413 59 : }
1414 :
1415 : /*------------------------------------------------------------------.
1416 : | Compute the position and length of all the output fields, given a |
1417 : | pointer to some OCCURS. |
1418 : `------------------------------------------------------------------*/
1419 :
1420 : static void
1421 38 : define_all_fields (OCCURS *occurs)
1422 : {
1423 : int tail_max_width; /* allowable width of tail field */
1424 : int head_max_width; /* allowable width of head field */
1425 : char *cursor; /* running cursor in source text */
1426 : char *left_context_start; /* start of left context */
1427 : char *right_context_end; /* end of right context */
1428 : char *left_field_start; /* conservative start for `head'/`before' */
1429 : int file_index; /* index in text input file arrays */
1430 : const char *file_name; /* file name for reference */
1431 : int line_ordinal; /* line ordinal for reference */
1432 :
1433 : /* Define `keyafter', start of left context and end of right context.
1434 : `keyafter' starts at the saved position for keyword and extend to the
1435 : right from the end of the keyword, eating separators or full words, but
1436 : not beyond maximum allowed width for `keyafter' field or limit for the
1437 : right context. Suffix spaces will be removed afterwards. */
1438 :
1439 38 : keyafter.start = occurs->key.start;
1440 38 : keyafter.end = keyafter.start + occurs->key.size;
1441 38 : left_context_start = keyafter.start + occurs->left;
1442 38 : right_context_end = keyafter.start + occurs->right;
1443 :
1444 38 : cursor = keyafter.end;
1445 185 : while (cursor < right_context_end
1446 109 : && cursor <= keyafter.start + keyafter_max_width)
1447 : {
1448 109 : keyafter.end = cursor;
1449 109 : SKIP_SOMETHING (cursor, right_context_end);
1450 : }
1451 38 : if (cursor <= keyafter.start + keyafter_max_width)
1452 38 : keyafter.end = cursor;
1453 :
1454 38 : keyafter_truncation = truncation_string && keyafter.end < right_context_end;
1455 :
1456 38 : SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start);
1457 :
1458 : /* When the left context is wide, it might take some time to catch up from
1459 : the left context boundary to the beginning of the `head' or `before'
1460 : fields. So, in this case, to speed the catchup, we jump back from the
1461 : keyword, using some secure distance, possibly falling in the middle of
1462 : a word. A secure backward jump would be at least half the maximum
1463 : width of a line, plus the size of the longest word met in the whole
1464 : input. We conclude this backward jump by a skip forward of at least
1465 : one word. In this manner, we should not inadvertently accept only part
1466 : of a word. From the reached point, when it will be time to fix the
1467 : beginning of `head' or `before' fields, we will skip forward words or
1468 : delimiters until we get sufficiently near. */
1469 :
1470 38 : if (-occurs->left > half_line_width + maximum_word_length)
1471 : {
1472 : left_field_start
1473 0 : = keyafter.start - (half_line_width + maximum_word_length);
1474 0 : SKIP_SOMETHING (left_field_start, keyafter.start);
1475 : }
1476 : else
1477 38 : left_field_start = keyafter.start + occurs->left;
1478 :
1479 : /* `before' certainly ends at the keyword, but not including separating
1480 : spaces. It starts after than the saved value for the left context, by
1481 : advancing it until it falls inside the maximum allowed width for the
1482 : before field. There will be no prefix spaces either. `before' only
1483 : advances by skipping single separators or whole words. */
1484 :
1485 38 : before.start = left_field_start;
1486 38 : before.end = keyafter.start;
1487 38 : SKIP_WHITE_BACKWARDS (before.end, before.start);
1488 :
1489 76 : while (before.start + before_max_width < before.end)
1490 0 : SKIP_SOMETHING (before.start, before.end);
1491 :
1492 38 : if (truncation_string)
1493 : {
1494 38 : cursor = before.start;
1495 38 : SKIP_WHITE_BACKWARDS (cursor, text_buffer.start);
1496 38 : before_truncation = cursor > left_context_start;
1497 : }
1498 : else
1499 0 : before_truncation = 0;
1500 :
1501 38 : SKIP_WHITE (before.start, text_buffer.end);
1502 :
1503 : /* The tail could not take more columns than what has been left in the
1504 : left context field, and a gap is mandatory. It starts after the
1505 : right context, and does not contain prefixed spaces. It ends at
1506 : the end of line, the end of buffer or when the tail field is full,
1507 : whichever comes first. It cannot contain only part of a word, and
1508 : has no suffixed spaces. */
1509 :
1510 : tail_max_width
1511 38 : = before_max_width - (before.end - before.start) - gap_size;
1512 :
1513 38 : if (tail_max_width > 0)
1514 : {
1515 38 : tail.start = keyafter.end;
1516 38 : SKIP_WHITE (tail.start, text_buffer.end);
1517 :
1518 38 : tail.end = tail.start;
1519 38 : cursor = tail.end;
1520 76 : while (cursor < right_context_end
1521 0 : && cursor < tail.start + tail_max_width)
1522 : {
1523 0 : tail.end = cursor;
1524 0 : SKIP_SOMETHING (cursor, right_context_end);
1525 : }
1526 :
1527 38 : if (cursor < tail.start + tail_max_width)
1528 38 : tail.end = cursor;
1529 :
1530 38 : if (tail.end > tail.start)
1531 : {
1532 0 : keyafter_truncation = 0;
1533 0 : tail_truncation = truncation_string && tail.end < right_context_end;
1534 : }
1535 : else
1536 38 : tail_truncation = 0;
1537 :
1538 38 : SKIP_WHITE_BACKWARDS (tail.end, tail.start);
1539 : }
1540 : else
1541 : {
1542 :
1543 : /* No place left for a tail field. */
1544 :
1545 0 : tail.start = NULL;
1546 0 : tail.end = NULL;
1547 0 : tail_truncation = 0;
1548 : }
1549 :
1550 : /* `head' could not take more columns than what has been left in the right
1551 : context field, and a gap is mandatory. It ends before the left
1552 : context, and does not contain suffixed spaces. Its pointer is advanced
1553 : until the head field has shrunk to its allowed width. It cannot
1554 : contain only part of a word, and has no suffixed spaces. */
1555 :
1556 : head_max_width
1557 38 : = keyafter_max_width - (keyafter.end - keyafter.start) - gap_size;
1558 :
1559 38 : if (head_max_width > 0)
1560 : {
1561 38 : head.end = before.start;
1562 38 : SKIP_WHITE_BACKWARDS (head.end, text_buffer.start);
1563 :
1564 38 : head.start = left_field_start;
1565 76 : while (head.start + head_max_width < head.end)
1566 0 : SKIP_SOMETHING (head.start, head.end);
1567 :
1568 38 : if (head.end > head.start)
1569 : {
1570 0 : before_truncation = 0;
1571 0 : head_truncation = (truncation_string
1572 0 : && head.start > left_context_start);
1573 : }
1574 : else
1575 38 : head_truncation = 0;
1576 :
1577 38 : SKIP_WHITE (head.start, head.end);
1578 : }
1579 : else
1580 : {
1581 :
1582 : /* No place left for a head field. */
1583 :
1584 0 : head.start = NULL;
1585 0 : head.end = NULL;
1586 0 : head_truncation = 0;
1587 : }
1588 :
1589 38 : if (auto_reference)
1590 : {
1591 :
1592 : /* Construct the reference text in preallocated space from the file
1593 : name and the line number. Find out in which file the reference
1594 : occurred. Standard input yields an empty file name. Insure line
1595 : numbers are one based, even if they are computed zero based. */
1596 :
1597 2 : file_index = 0;
1598 4 : while (file_line_count[file_index] < occurs->reference)
1599 0 : file_index++;
1600 :
1601 2 : file_name = input_file_name[file_index];
1602 2 : if (!file_name)
1603 2 : file_name = "";
1604 :
1605 2 : line_ordinal = occurs->reference + 1;
1606 2 : if (file_index > 0)
1607 0 : line_ordinal -= file_line_count[file_index - 1];
1608 :
1609 2 : sprintf (reference.start, "%s:%d", file_name, line_ordinal);
1610 2 : reference.end = reference.start + strlen (reference.start);
1611 : }
1612 36 : else if (input_reference)
1613 : {
1614 :
1615 : /* Reference starts at saved position for reference and extends right
1616 : until some white space is met. */
1617 :
1618 0 : reference.start = keyafter.start + (DELTA) occurs->reference;
1619 0 : reference.end = reference.start;
1620 0 : SKIP_NON_WHITE (reference.end, right_context_end);
1621 : }
1622 38 : }
1623 :
1624 : /* Formatting and actual output - control routines. */
1625 :
1626 : /*----------------------------------------------------------------------.
1627 : | Output the current output fields as one line for `troff' or `nroff'. |
1628 : `----------------------------------------------------------------------*/
1629 :
1630 : static void
1631 3 : output_one_roff_line (void)
1632 : {
1633 : /* Output the `tail' field. */
1634 :
1635 3 : printf (".%s \"", macro_name);
1636 3 : print_field (tail);
1637 3 : if (tail_truncation)
1638 0 : fputs (truncation_string, stdout);
1639 3 : putchar ('"');
1640 :
1641 : /* Output the `before' field. */
1642 :
1643 3 : fputs (" \"", stdout);
1644 3 : if (before_truncation)
1645 0 : fputs (truncation_string, stdout);
1646 3 : print_field (before);
1647 3 : putchar ('"');
1648 :
1649 : /* Output the `keyafter' field. */
1650 :
1651 3 : fputs (" \"", stdout);
1652 3 : print_field (keyafter);
1653 3 : if (keyafter_truncation)
1654 0 : fputs (truncation_string, stdout);
1655 3 : putchar ('"');
1656 :
1657 : /* Output the `head' field. */
1658 :
1659 3 : fputs (" \"", stdout);
1660 3 : if (head_truncation)
1661 0 : fputs (truncation_string, stdout);
1662 3 : print_field (head);
1663 3 : putchar ('"');
1664 :
1665 : /* Conditionally output the `reference' field. */
1666 :
1667 3 : if (auto_reference | input_reference)
1668 : {
1669 2 : fputs (" \"", stdout);
1670 2 : print_field (reference);
1671 2 : putchar ('"');
1672 : }
1673 :
1674 3 : putchar ('\n');
1675 3 : }
1676 :
1677 : /*---------------------------------------------------------.
1678 : | Output the current output fields as one line for `TeX'. |
1679 : `---------------------------------------------------------*/
1680 :
1681 : static void
1682 0 : output_one_tex_line (void)
1683 : {
1684 : BLOCK key; /* key field, isolated */
1685 : BLOCK after; /* after field, isolated */
1686 : char *cursor; /* running cursor in source text */
1687 :
1688 0 : printf ("\\%s ", macro_name);
1689 0 : putchar ('{');
1690 0 : print_field (tail);
1691 0 : fputs ("}{", stdout);
1692 0 : print_field (before);
1693 0 : fputs ("}{", stdout);
1694 0 : key.start = keyafter.start;
1695 0 : after.end = keyafter.end;
1696 0 : cursor = keyafter.start;
1697 0 : SKIP_SOMETHING (cursor, keyafter.end);
1698 0 : key.end = cursor;
1699 0 : after.start = cursor;
1700 0 : print_field (key);
1701 0 : fputs ("}{", stdout);
1702 0 : print_field (after);
1703 0 : fputs ("}{", stdout);
1704 0 : print_field (head);
1705 0 : putchar ('}');
1706 0 : if (auto_reference | input_reference)
1707 : {
1708 0 : putchar ('{');
1709 0 : print_field (reference);
1710 0 : putchar ('}');
1711 : }
1712 0 : putchar ('\n');
1713 0 : }
1714 :
1715 : /*-------------------------------------------------------------------.
1716 : | Output the current output fields as one line for a dumb terminal. |
1717 : `-------------------------------------------------------------------*/
1718 :
1719 : static void
1720 35 : output_one_dumb_line (void)
1721 : {
1722 35 : if (!right_reference)
1723 : {
1724 35 : if (auto_reference)
1725 : {
1726 :
1727 : /* Output the `reference' field, in such a way that GNU emacs
1728 : next-error will handle it. The ending colon is taken from the
1729 : gap which follows. */
1730 :
1731 0 : print_field (reference);
1732 0 : putchar (':');
1733 0 : print_spaces (reference_max_width
1734 0 : + gap_size
1735 0 : - (reference.end - reference.start)
1736 0 : - 1);
1737 : }
1738 : else
1739 : {
1740 :
1741 : /* Output the `reference' field and its following gap. */
1742 :
1743 35 : print_field (reference);
1744 35 : print_spaces (reference_max_width
1745 35 : + gap_size
1746 35 : - (reference.end - reference.start));
1747 : }
1748 : }
1749 :
1750 35 : if (tail.start < tail.end)
1751 : {
1752 : /* Output the `tail' field. */
1753 :
1754 0 : print_field (tail);
1755 0 : if (tail_truncation)
1756 0 : fputs (truncation_string, stdout);
1757 :
1758 0 : print_spaces (half_line_width - gap_size
1759 0 : - (before.end - before.start)
1760 0 : - (before_truncation ? truncation_string_length : 0)
1761 0 : - (tail.end - tail.start)
1762 0 : - (tail_truncation ? truncation_string_length : 0));
1763 : }
1764 : else
1765 70 : print_spaces (half_line_width - gap_size
1766 35 : - (before.end - before.start)
1767 35 : - (before_truncation ? truncation_string_length : 0));
1768 :
1769 : /* Output the `before' field. */
1770 :
1771 35 : if (before_truncation)
1772 0 : fputs (truncation_string, stdout);
1773 35 : print_field (before);
1774 :
1775 35 : print_spaces (gap_size);
1776 :
1777 : /* Output the `keyafter' field. */
1778 :
1779 35 : print_field (keyafter);
1780 35 : if (keyafter_truncation)
1781 0 : fputs (truncation_string, stdout);
1782 :
1783 35 : if (head.start < head.end)
1784 : {
1785 : /* Output the `head' field. */
1786 :
1787 0 : print_spaces (half_line_width
1788 0 : - (keyafter.end - keyafter.start)
1789 0 : - (keyafter_truncation ? truncation_string_length : 0)
1790 0 : - (head.end - head.start)
1791 0 : - (head_truncation ? truncation_string_length : 0));
1792 0 : if (head_truncation)
1793 0 : fputs (truncation_string, stdout);
1794 0 : print_field (head);
1795 : }
1796 : else
1797 :
1798 35 : if ((auto_reference | input_reference) & right_reference)
1799 0 : print_spaces (half_line_width
1800 0 : - (keyafter.end - keyafter.start)
1801 0 : - (keyafter_truncation ? truncation_string_length : 0));
1802 :
1803 35 : if ((auto_reference | input_reference) & right_reference)
1804 : {
1805 : /* Output the `reference' field. */
1806 :
1807 0 : print_spaces (gap_size);
1808 0 : print_field (reference);
1809 : }
1810 :
1811 35 : putchar ('\n');
1812 35 : }
1813 :
1814 : /*------------------------------------------------------------------------.
1815 : | Scan the whole occurs table and, for each entry, output one line in the |
1816 : | appropriate format. |
1817 : `------------------------------------------------------------------------*/
1818 :
1819 : static void
1820 59 : generate_all_output (void)
1821 : {
1822 : size_t occurs_index; /* index of keyword entry being processed */
1823 : OCCURS *occurs_cursor; /* current keyword entry being processed */
1824 :
1825 : /* The following assignments are useful to provide default values in case
1826 : line contexts or references are not used, in which case these variables
1827 : would never be computed. */
1828 :
1829 59 : tail.start = NULL;
1830 59 : tail.end = NULL;
1831 59 : tail_truncation = 0;
1832 :
1833 59 : head.start = NULL;
1834 59 : head.end = NULL;
1835 59 : head_truncation = 0;
1836 :
1837 : /* Loop over all keyword occurrences. */
1838 :
1839 59 : occurs_cursor = occurs_table[0];
1840 :
1841 97 : for (occurs_index = 0; occurs_index < number_of_occurs[0]; occurs_index++)
1842 : {
1843 : /* Compute the exact size of every field and whenever truncation flags
1844 : are present or not. */
1845 :
1846 38 : define_all_fields (occurs_cursor);
1847 :
1848 : /* Produce one output line according to selected format. */
1849 :
1850 38 : switch (output_format)
1851 : {
1852 35 : case UNKNOWN_FORMAT:
1853 : /* Should never happen. */
1854 :
1855 : case DUMB_FORMAT:
1856 35 : output_one_dumb_line ();
1857 35 : break;
1858 :
1859 3 : case ROFF_FORMAT:
1860 3 : output_one_roff_line ();
1861 3 : break;
1862 :
1863 0 : case TEX_FORMAT:
1864 0 : output_one_tex_line ();
1865 0 : break;
1866 : }
1867 :
1868 : /* Advance the cursor into the occurs table. */
1869 :
1870 38 : occurs_cursor++;
1871 : }
1872 59 : }
1873 :
1874 : /* Option decoding and main program. */
1875 :
1876 : /*------------------------------------------------------.
1877 : | Print program identification and options, then exit. |
1878 : `------------------------------------------------------*/
1879 :
1880 : void
1881 7 : usage (int status)
1882 : {
1883 7 : if (status != EXIT_SUCCESS)
1884 7 : fprintf (stderr, _("Try `%s --help' for more information.\n"),
1885 : program_name);
1886 : else
1887 : {
1888 0 : printf (_("\
1889 : Usage: %s [OPTION]... [INPUT]... (without -G)\n\
1890 : or: %s -G [OPTION]... [INPUT [OUTPUT]]\n"),
1891 : program_name, program_name);
1892 0 : fputs (_("\
1893 : Output a permuted index, including context, of the words in the input files.\n\
1894 : \n\
1895 : "), stdout);
1896 0 : fputs (_("\
1897 : Mandatory arguments to long options are mandatory for short options too.\n\
1898 : "), stdout);
1899 0 : fputs (_("\
1900 : -A, --auto-reference output automatically generated references\n\
1901 : -G, --traditional behave more like System V `ptx'\n\
1902 : -F, --flag-truncation=STRING use STRING for flagging line truncations\n\
1903 : "), stdout);
1904 0 : fputs (_("\
1905 : -M, --macro-name=STRING macro name to use instead of `xx'\n\
1906 : -O, --format=roff generate output as roff directives\n\
1907 : -R, --right-side-refs put references at right, not counted in -w\n\
1908 : -S, --sentence-regexp=REGEXP for end of lines or end of sentences\n\
1909 : -T, --format=tex generate output as TeX directives\n\
1910 : "), stdout);
1911 0 : fputs (_("\
1912 : -W, --word-regexp=REGEXP use REGEXP to match each keyword\n\
1913 : -b, --break-file=FILE word break characters in this FILE\n\
1914 : -f, --ignore-case fold lower case to upper case for sorting\n\
1915 : -g, --gap-size=NUMBER gap size in columns between output fields\n\
1916 : -i, --ignore-file=FILE read ignore word list from FILE\n\
1917 : -o, --only-file=FILE read only word list from this FILE\n\
1918 : "), stdout);
1919 0 : fputs (_("\
1920 : -r, --references first field of each line is a reference\n\
1921 : -t, --typeset-mode - not implemented -\n\
1922 : -w, --width=NUMBER output width in columns, reference excluded\n\
1923 : "), stdout);
1924 0 : fputs (HELP_OPTION_DESCRIPTION, stdout);
1925 0 : fputs (VERSION_OPTION_DESCRIPTION, stdout);
1926 0 : fputs (_("\
1927 : \n\
1928 : With no FILE or if FILE is -, read Standard Input. `-F /' by default.\n\
1929 : "), stdout);
1930 0 : emit_bug_reporting_address ();
1931 : }
1932 7 : exit (status);
1933 : }
1934 :
1935 : /*----------------------------------------------------------------------.
1936 : | Main program. Decode ARGC arguments passed through the ARGV array of |
1937 : | strings, then launch execution. |
1938 : `----------------------------------------------------------------------*/
1939 :
/* Long options equivalences.  This table is handed to getopt_long by
   main.  The fourth member of each entry is the code main's option switch
   sees for that long name; for most entries it is the equivalent short
   option letter.  "format" has no short letter of its own and uses the
   code 10, which main handles as `case 10'.  The all-zero entry ends the
   table.  */
static const struct option long_options[] =
{
  {"auto-reference", no_argument, NULL, 'A'},
  {"break-file", required_argument, NULL, 'b'},
  {"flag-truncation", required_argument, NULL, 'F'},
  {"ignore-case", no_argument, NULL, 'f'},
  {"gap-size", required_argument, NULL, 'g'},
  {"ignore-file", required_argument, NULL, 'i'},
  {"macro-name", required_argument, NULL, 'M'},
  {"only-file", required_argument, NULL, 'o'},
  {"references", no_argument, NULL, 'r'},
  {"right-side-refs", no_argument, NULL, 'R'},
  {"format", required_argument, NULL, 10},
  {"sentence-regexp", required_argument, NULL, 'S'},
  {"traditional", no_argument, NULL, 'G'},
  {"typeset-mode", no_argument, NULL, 't'},
  {"width", required_argument, NULL, 'w'},
  {"word-regexp", required_argument, NULL, 'W'},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {NULL, 0, NULL, 0},
};
1963 :
/* Spellings accepted as the argument of --format, NULL-terminated.  main
   passes this array to XARGMATCH together with format_vals, so the two
   arrays must list their entries in the same order.  */
static char const* const format_args[] =
{
  "roff", "tex", NULL
};
1968 :
/* Output formats selectable through --format, in the same order as the
   strings in format_args.  main assigns the result of XARGMATCH over these
   two arrays to output_format (presumably the element whose index matches
   the recognized string -- confirm against argmatch.h).  */
static enum Format const format_vals[] =
{
  ROFF_FORMAT, TEX_FORMAT
};
1973 :
1974 : int
1975 83 : main (int argc, char **argv)
1976 : {
1977 : int optchar; /* argument character */
1978 : int file_index; /* index in text input file arrays */
1979 :
1980 : /* Decode program options. */
1981 :
1982 : initialize_main (&argc, &argv);
1983 83 : program_name = argv[0];
1984 83 : setlocale (LC_ALL, "");
1985 : bindtextdomain (PACKAGE, LOCALEDIR);
1986 : textdomain (PACKAGE);
1987 :
1988 83 : atexit (close_stdout);
1989 :
1990 : #if HAVE_SETCHRCLASS
1991 : setchrclass (NULL);
1992 : #endif
1993 :
1994 83 : while (optchar = getopt_long (argc, argv, "AF:GM:ORS:TW:b:i:fg:o:trw:",
1995 : long_options, NULL),
1996 : optchar != EOF)
1997 : {
1998 44 : switch (optchar)
1999 : {
2000 7 : default:
2001 7 : usage (EXIT_FAILURE);
2002 :
2003 3 : case 'G':
2004 3 : gnu_extensions = false;
2005 3 : break;
2006 :
2007 2 : case 'b':
2008 2 : break_file = optarg;
2009 2 : break;
2010 :
2011 0 : case 'f':
2012 0 : ignore_case = true;
2013 0 : break;
2014 :
2015 3 : case 'g':
2016 : {
2017 : unsigned long int tmp_ulong;
2018 3 : if (xstrtoul (optarg, NULL, 0, &tmp_ulong, NULL) != LONGINT_OK
2019 0 : || ! (0 < tmp_ulong && tmp_ulong <= INT_MAX))
2020 3 : error (EXIT_FAILURE, 0, _("invalid gap width: %s"),
2021 : quotearg (optarg));
2022 0 : gap_size = tmp_ulong;
2023 0 : break;
2024 : }
2025 :
2026 0 : case 'i':
2027 0 : ignore_file = optarg;
2028 0 : break;
2029 :
2030 0 : case 'o':
2031 0 : only_file = optarg;
2032 0 : break;
2033 :
2034 0 : case 'r':
2035 0 : input_reference = true;
2036 0 : break;
2037 :
2038 0 : case 't':
2039 : /* Yet to understand... */
2040 0 : break;
2041 :
2042 0 : case 'w':
2043 : {
2044 : unsigned long int tmp_ulong;
2045 0 : if (xstrtoul (optarg, NULL, 0, &tmp_ulong, NULL) != LONGINT_OK
2046 0 : || ! (0 < tmp_ulong && tmp_ulong <= INT_MAX))
2047 0 : error (EXIT_FAILURE, 0, _("invalid line width: %s"),
2048 : quotearg (optarg));
2049 0 : line_width = tmp_ulong;
2050 0 : break;
2051 : }
2052 :
2053 11 : case 'A':
2054 11 : auto_reference = true;
2055 11 : break;
2056 :
2057 18 : case 'F':
2058 18 : truncation_string = copy_unescaped_string (optarg);
2059 18 : break;
2060 :
2061 0 : case 'M':
2062 0 : macro_name = optarg;
2063 0 : break;
2064 :
2065 0 : case 'O':
2066 0 : output_format = ROFF_FORMAT;
2067 0 : break;
2068 :
2069 0 : case 'R':
2070 0 : right_reference = true;
2071 0 : break;
2072 :
2073 0 : case 'S':
2074 0 : context_regex.string = copy_unescaped_string (optarg);
2075 0 : break;
2076 :
2077 0 : case 'T':
2078 0 : output_format = TEX_FORMAT;
2079 0 : break;
2080 :
2081 0 : case 'W':
2082 0 : word_regex.string = copy_unescaped_string (optarg);
2083 0 : if (!*word_regex.string)
2084 0 : word_regex.string = NULL;
2085 0 : break;
2086 :
2087 0 : case 10:
2088 0 : output_format = XARGMATCH ("--format", optarg,
2089 : format_args, format_vals);
2090 0 : case_GETOPT_HELP_CHAR;
2091 :
2092 0 : case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2093 : }
2094 : }
2095 :
2096 : /* Process remaining arguments. If GNU extensions are enabled, process
2097 : all arguments as input parameters. If disabled, accept at most two
2098 : arguments, the second of which is an output parameter. */
2099 :
2100 73 : if (optind == argc)
2101 : {
2102 :
2103 : /* No more argument simply means: read standard input. */
2104 :
2105 39 : input_file_name = xmalloc (sizeof *input_file_name);
2106 39 : file_line_count = xmalloc (sizeof *file_line_count);
2107 39 : number_input_files = 1;
2108 39 : input_file_name[0] = NULL;
2109 : }
2110 34 : else if (gnu_extensions)
2111 : {
2112 32 : number_input_files = argc - optind;
2113 32 : input_file_name = xmalloc (number_input_files * sizeof *input_file_name);
2114 32 : file_line_count = xmalloc (number_input_files * sizeof *file_line_count);
2115 :
2116 78 : for (file_index = 0; file_index < number_input_files; file_index++)
2117 : {
2118 46 : input_file_name[file_index] = argv[optind];
2119 46 : if (!*argv[optind] || STREQ (argv[optind], "-"))
2120 42 : input_file_name[0] = NULL;
2121 : else
2122 4 : input_file_name[0] = argv[optind];
2123 46 : optind++;
2124 : }
2125 : }
2126 : else
2127 : {
2128 :
2129 : /* There is one necessary input file. */
2130 :
2131 2 : number_input_files = 1;
2132 2 : input_file_name = xmalloc (sizeof *input_file_name);
2133 2 : file_line_count = xmalloc (sizeof *file_line_count);
2134 2 : if (!*argv[optind] || STREQ (argv[optind], "-"))
2135 2 : input_file_name[0] = NULL;
2136 : else
2137 0 : input_file_name[0] = argv[optind];
2138 2 : optind++;
2139 :
2140 : /* Redirect standard output, only if requested. */
2141 :
2142 2 : if (optind < argc)
2143 : {
2144 0 : if (! freopen (argv[optind], "w", stdout))
2145 0 : error (EXIT_FAILURE, errno, "%s", argv[optind]);
2146 0 : optind++;
2147 : }
2148 :
2149 : /* Diagnose any other argument as an error. */
2150 :
2151 2 : if (optind < argc)
2152 : {
2153 0 : error (0, 0, _("extra operand %s"), quote (argv[optind]));
2154 0 : usage (EXIT_FAILURE);
2155 : }
2156 : }
2157 :
2158 : /* If the output format has not been explicitly selected, choose dumb
2159 : terminal format if GNU extensions are enabled, else `roff' format. */
2160 :
2161 73 : if (output_format == UNKNOWN_FORMAT)
2162 73 : output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT;
2163 :
2164 : /* Initialize the main tables. */
2165 :
2166 73 : initialize_regex ();
2167 :
2168 : /* Read `Break character' file, if any. */
2169 :
2170 73 : if (break_file)
2171 2 : digest_break_file (break_file);
2172 :
2173 : /* Read `Ignore words' file and `Only words' files, if any. If any of
2174 : these files is empty, reset the name of the file to NULL, to avoid
2175 : unnecessary calls to search_table. */
2176 :
2177 73 : if (ignore_file)
2178 : {
2179 0 : digest_word_file (ignore_file, &ignore_table);
2180 0 : if (ignore_table.length == 0)
2181 0 : ignore_file = NULL;
2182 : }
2183 :
2184 73 : if (only_file)
2185 : {
2186 0 : digest_word_file (only_file, &only_table);
2187 0 : if (only_table.length == 0)
2188 0 : only_file = NULL;
2189 : }
2190 :
2191 : /* Prepare to study all the input files. */
2192 :
2193 73 : number_of_occurs[0] = 0;
2194 73 : total_line_count = 0;
2195 73 : maximum_word_length = 0;
2196 73 : reference_max_width = 0;
2197 :
2198 143 : for (file_index = 0; file_index < number_input_files; file_index++)
2199 : {
2200 :
2201 : /* Read the file in core, than study it. */
2202 :
2203 84 : swallow_file_in_memory (input_file_name[file_index], &text_buffer);
2204 70 : find_occurs_in_text ();
2205 :
2206 : /* Maintain for each file how many lines has been read so far when its
2207 : end is reached. Incrementing the count first is a simple kludge to
2208 : handle a possible incomplete line at end of file. */
2209 :
2210 70 : total_line_count++;
2211 70 : file_line_count[file_index] = total_line_count;
2212 : }
2213 :
2214 : /* Do the output process phase. */
2215 :
2216 59 : sort_found_occurs ();
2217 59 : fix_output_parameters ();
2218 59 : generate_all_output ();
2219 :
2220 : /* All done. */
2221 :
2222 59 : exit (EXIT_SUCCESS);
2223 : }
|