Line data Source code
1 : /* split.c -- split a file into pieces.
2 : Copyright (C) 1988, 1991, 1995-2008 Free Software Foundation, Inc.
3 :
4 : This program is free software: you can redistribute it and/or modify
5 : it under the terms of the GNU General Public License as published by
6 : the Free Software Foundation, either version 3 of the License, or
7 : (at your option) any later version.
8 :
9 : This program is distributed in the hope that it will be useful,
10 : but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 : GNU General Public License for more details.
13 :
14 : You should have received a copy of the GNU General Public License
15 : along with this program. If not, see <http://www.gnu.org/licenses/>. */
16 :
17 : /* By tege@sics.se, with rms.
18 :
19 : To do:
20 : * Implement -t CHAR or -t REGEX to specify break characters other
21 : than newline. */
22 :
23 : #include <config.h>
24 :
25 : #include <stdio.h>
26 : #include <getopt.h>
27 : #include <sys/types.h>
28 :
29 : #include "system.h"
30 : #include "error.h"
31 : #include "fd-reopen.h"
32 : #include "fcntl--.h"
33 : #include "full-read.h"
34 : #include "full-write.h"
35 : #include "inttostr.h"
36 : #include "quote.h"
37 : #include "safe-read.h"
38 : #include "xstrtol.h"
39 :
40 : /* The official name of this program (e.g., no `g' prefix). */
41 : #define PROGRAM_NAME "split"
42 :
43 : #define AUTHORS "Torbjorn Granlund", "Richard M. Stallman"
44 :
45 : #define DEFAULT_SUFFIX_LENGTH 2
46 :
47 : /* The name this program was run with. */
48 : char *program_name;
49 :
50 : /* Base name of output files. */
51 : static char const *outbase;
52 :
53 : /* Name of output files. */
54 : static char *outfile;
55 :
56 : /* Pointer to the end of the prefix in OUTFILE.
57 : Suffixes are inserted here. */
58 : static char *outfile_mid;
59 :
60 : /* Length of OUTFILE's suffix. */
61 : static size_t suffix_length = DEFAULT_SUFFIX_LENGTH;
62 :
63 : /* Alphabet of characters to use in suffix. */
64 : static char const *suffix_alphabet = "abcdefghijklmnopqrstuvwxyz";
65 :
66 : /* Name of input file. May be "-". */
67 : static char *infile;
68 :
69 : /* Descriptor on which output file is open. */
70 : static int output_desc;
71 :
72 : /* If true, print a diagnostic on standard error just before each
73 : output file is opened. */
74 : static bool verbose;
75 :
76 : /* For long options that have no equivalent short option, use a
77 : non-character as a pseudo short option, starting with CHAR_MAX + 1. */
78 : enum
79 : {
80 : VERBOSE_OPTION = CHAR_MAX + 1
81 : };
82 :
83 : static struct option const longopts[] =
84 : {
85 : {"bytes", required_argument, NULL, 'b'},
86 : {"lines", required_argument, NULL, 'l'},
87 : {"line-bytes", required_argument, NULL, 'C'},
88 : {"suffix-length", required_argument, NULL, 'a'},
89 : {"numeric-suffixes", no_argument, NULL, 'd'},
90 : {"verbose", no_argument, NULL, VERBOSE_OPTION},
91 : {GETOPT_HELP_OPTION_DECL},
92 : {GETOPT_VERSION_OPTION_DECL},
93 : {NULL, 0, NULL, 0}
94 : };
95 :
96 : void
97 47 : usage (int status)
98 : {
99 47 : if (status != EXIT_SUCCESS)
100 46 : fprintf (stderr, _("Try `%s --help' for more information.\n"),
101 : program_name);
102 : else
103 : {
104 1 : printf (_("\
105 : Usage: %s [OPTION] [INPUT [PREFIX]]\n\
106 : "),
107 : program_name);
108 1 : fputs (_("\
109 : Output fixed-size pieces of INPUT to PREFIXaa, PREFIXab, ...; default\n\
110 : size is 1000 lines, and default PREFIX is `x'. With no INPUT, or when INPUT\n\
111 : is -, read standard input.\n\
112 : \n\
113 : "), stdout);
114 1 : fputs (_("\
115 : Mandatory arguments to long options are mandatory for short options too.\n\
116 : "), stdout);
117 1 : fprintf (stdout, _("\
118 : -a, --suffix-length=N use suffixes of length N (default %d)\n\
119 : -b, --bytes=SIZE put SIZE bytes per output file\n\
120 : -C, --line-bytes=SIZE put at most SIZE bytes of lines per output file\n\
121 : -d, --numeric-suffixes use numeric suffixes instead of alphabetic\n\
122 : -l, --lines=NUMBER put NUMBER lines per output file\n\
123 : "), DEFAULT_SUFFIX_LENGTH);
124 1 : fputs (_("\
125 : --verbose print a diagnostic just before each\n\
126 : output file is opened\n\
127 : "), stdout);
128 1 : fputs (HELP_OPTION_DESCRIPTION, stdout);
129 1 : fputs (VERSION_OPTION_DESCRIPTION, stdout);
130 1 : fputs (_("\
131 : \n\
132 : SIZE may have a multiplier suffix:\n\
133 : b 512, kB 1000, K 1024, MB 1000*1000, M 1024*1024,\n\
134 : GB 1000*1000*1000, G 1024*1024*1024, and so on for T, P, E, Z, Y.\n\
135 : "), stdout);
136 1 : emit_bug_reporting_address ();
137 : }
138 47 : exit (status);
139 : }
140 :
141 : /* Compute the next sequential output file name and store it into the
142 : string `outfile'. */
143 :
144 : static void
145 41 : next_file_name (void)
146 : {
147 : /* Index in suffix_alphabet of each character in the suffix. */
148 : static size_t *sufindex;
149 :
150 41 : if (! outfile)
151 : {
152 : /* Allocate and initialize the first file name. */
153 :
154 26 : size_t outbase_length = strlen (outbase);
155 26 : size_t outfile_length = outbase_length + suffix_length;
156 26 : if (outfile_length + 1 < outbase_length)
157 0 : xalloc_die ();
158 26 : outfile = xmalloc (outfile_length + 1);
159 26 : outfile_mid = outfile + outbase_length;
160 26 : memcpy (outfile, outbase, outbase_length);
161 26 : memset (outfile_mid, suffix_alphabet[0], suffix_length);
162 26 : outfile[outfile_length] = 0;
163 26 : sufindex = xcalloc (suffix_length, sizeof *sufindex);
164 :
165 : #if ! _POSIX_NO_TRUNC && HAVE_PATHCONF && defined _PC_NAME_MAX
166 : /* POSIX requires that if the output file name is too long for
167 : its directory, `split' must fail without creating any files.
168 : This must be checked for explicitly on operating systems that
169 : silently truncate file names. */
170 : {
171 : char *dir = dir_name (outfile);
172 : long name_max = pathconf (dir, _PC_NAME_MAX);
173 : if (0 <= name_max && name_max < base_len (last_component (outfile)))
174 : error (EXIT_FAILURE, ENAMETOOLONG, "%s", outfile);
175 : free (dir);
176 : }
177 : #endif
178 : }
179 : else
180 : {
181 : /* Increment the suffix in place, if possible. */
182 :
183 15 : size_t i = suffix_length;
184 30 : while (i-- != 0)
185 : {
186 14 : sufindex[i]++;
187 14 : outfile_mid[i] = suffix_alphabet[sufindex[i]];
188 14 : if (outfile_mid[i])
189 14 : return;
190 0 : sufindex[i] = 0;
191 0 : outfile_mid[i] = suffix_alphabet[sufindex[i]];
192 : }
193 1 : error (EXIT_FAILURE, 0, _("Output file suffixes exhausted"));
194 : }
195 : }
196 :
197 : /* Write BYTES bytes at BP to an output file.
198 : If NEW_FILE_FLAG is true, open the next output file.
199 : Otherwise add to the same output file already in use. */
200 :
201 : static void
202 41 : cwrite (bool new_file_flag, const char *bp, size_t bytes)
203 : {
204 41 : if (new_file_flag)
205 : {
206 41 : if (output_desc >= 0 && close (output_desc) < 0)
207 0 : error (EXIT_FAILURE, errno, "%s", outfile);
208 :
209 41 : next_file_name ();
210 40 : if (verbose)
211 1 : fprintf (stdout, _("creating file %s\n"), quote (outfile));
212 40 : output_desc = open (outfile,
213 : O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
214 : (S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP
215 : | S_IROTH | S_IWOTH));
216 40 : if (output_desc < 0)
217 1 : error (EXIT_FAILURE, errno, "%s", outfile);
218 : }
219 39 : if (full_write (output_desc, bp, bytes) != bytes)
220 0 : error (EXIT_FAILURE, errno, "%s", outfile);
221 39 : }
222 :
223 : /* Split into pieces of exactly N_BYTES bytes.
224 : Use buffer BUF, whose size is BUFSIZE. */
225 :
226 : static void
227 10 : bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize)
228 : {
229 : size_t n_read;
230 10 : bool new_file_flag = true;
231 : size_t to_read;
232 10 : uintmax_t to_write = n_bytes;
233 : char *bp_out;
234 :
235 : do
236 : {
237 10 : n_read = full_read (STDIN_FILENO, buf, bufsize);
238 10 : if (n_read == SAFE_READ_ERROR)
239 0 : error (EXIT_FAILURE, errno, "%s", infile);
240 10 : bp_out = buf;
241 10 : to_read = n_read;
242 : for (;;)
243 : {
244 26 : if (to_read < to_write)
245 : {
246 10 : if (to_read) /* do not write 0 bytes! */
247 : {
248 9 : cwrite (new_file_flag, bp_out, to_read);
249 9 : to_write -= to_read;
250 9 : new_file_flag = false;
251 : }
252 10 : break;
253 : }
254 : else
255 : {
256 8 : size_t w = to_write;
257 8 : cwrite (new_file_flag, bp_out, w);
258 8 : bp_out += w;
259 8 : to_read -= w;
260 8 : new_file_flag = true;
261 8 : to_write = n_bytes;
262 : }
263 : }
264 : }
265 10 : while (n_read == bufsize);
266 10 : }
267 :
268 : /* Split into pieces of exactly N_LINES lines.
269 : Use buffer BUF, whose size is BUFSIZE. */
270 :
271 : static void
272 18 : lines_split (uintmax_t n_lines, char *buf, size_t bufsize)
273 : {
274 : size_t n_read;
275 : char *bp, *bp_out, *eob;
276 18 : bool new_file_flag = true;
277 18 : uintmax_t n = 0;
278 :
279 : do
280 : {
281 18 : n_read = full_read (STDIN_FILENO, buf, bufsize);
282 18 : if (n_read == SAFE_READ_ERROR)
283 0 : error (EXIT_FAILURE, errno, "%s", infile);
284 18 : bp = bp_out = buf;
285 18 : eob = bp + n_read;
286 18 : *eob = '\n';
287 : for (;;)
288 : {
289 240 : bp = memchr (bp, '\n', eob - bp + 1);
290 129 : if (bp == eob)
291 : {
292 17 : if (eob != bp_out) /* do not write 0 bytes! */
293 : {
294 14 : size_t len = eob - bp_out;
295 14 : cwrite (new_file_flag, bp_out, len);
296 13 : new_file_flag = false;
297 : }
298 16 : break;
299 : }
300 :
301 112 : ++bp;
302 112 : if (++n >= n_lines)
303 : {
304 10 : cwrite (new_file_flag, bp_out, bp - bp_out);
305 9 : bp_out = bp;
306 9 : new_file_flag = true;
307 9 : n = 0;
308 : }
309 : }
310 : }
311 16 : while (n_read == bufsize);
312 16 : }
313 :
314 : /* Split into pieces that are as large as possible while still not more
315 : than N_BYTES bytes, and are split on line boundaries except
316 : where lines longer than N_BYTES bytes occur.
317 : FIXME: Allow N_BYTES to be any uintmax_t value, and don't require a
318 : buffer of size N_BYTES, in case N_BYTES is very large. */
319 :
320 : static void
321 0 : line_bytes_split (size_t n_bytes)
322 : {
323 : size_t n_read;
324 : char *bp;
325 0 : bool eof = false;
326 0 : size_t n_buffered = 0;
327 0 : char *buf = xmalloc (n_bytes);
328 :
329 : do
330 : {
331 : /* Fill up the full buffer size from the input file. */
332 :
333 0 : n_read = full_read (STDIN_FILENO, buf + n_buffered, n_bytes - n_buffered);
334 0 : if (n_read == SAFE_READ_ERROR)
335 0 : error (EXIT_FAILURE, errno, "%s", infile);
336 :
337 0 : n_buffered += n_read;
338 0 : if (n_buffered != n_bytes)
339 : {
340 0 : if (n_buffered == 0)
341 0 : break;
342 0 : eof = true;
343 : }
344 :
345 : /* Find where to end this chunk. */
346 0 : bp = buf + n_buffered;
347 0 : if (n_buffered == n_bytes)
348 : {
349 0 : while (bp > buf && bp[-1] != '\n')
350 0 : bp--;
351 : }
352 :
353 : /* If chunk has no newlines, use all the chunk. */
354 0 : if (bp == buf)
355 0 : bp = buf + n_buffered;
356 :
357 : /* Output the chars as one output file. */
358 0 : cwrite (true, buf, bp - buf);
359 :
360 : /* Discard the chars we just output; move rest of chunk
361 : down to be the start of the next chunk. Source and
362 : destination probably overlap. */
363 0 : n_buffered -= bp - buf;
364 0 : if (n_buffered > 0)
365 0 : memmove (buf, bp, n_buffered);
366 : }
367 0 : while (!eof);
368 0 : free (buf);
369 0 : }
370 :
371 : #define FAIL_ONLY_ONE_WAY() \
372 : do \
373 : { \
374 : error (0, 0, _("cannot split in more than one way")); \
375 : usage (EXIT_FAILURE); \
376 : } \
377 : while (0)
378 :
379 : int
380 79 : main (int argc, char **argv)
381 : {
382 : struct stat stat_buf;
383 : enum
384 : {
385 : type_undef, type_bytes, type_byteslines, type_lines, type_digits
386 79 : } split_type = type_undef;
387 : size_t in_blk_size; /* optimal block size of input file device */
388 : char *buf; /* file i/o buffer */
389 79 : size_t page_size = getpagesize ();
390 : uintmax_t n_units;
391 : static char const multipliers[] = "bEGKkMmPTYZ0";
392 : int c;
393 79 : int digits_optind = 0;
394 :
395 : initialize_main (&argc, &argv);
396 79 : program_name = argv[0];
397 79 : setlocale (LC_ALL, "");
398 : bindtextdomain (PACKAGE, LOCALEDIR);
399 : textdomain (PACKAGE);
400 :
401 79 : atexit (close_stdout);
402 :
403 : /* Parse command line options. */
404 :
405 79 : infile = "-";
406 79 : outbase = "x";
407 :
408 : while (1)
409 26 : {
410 : /* This is the argv-index of the option we will read next. */
411 105 : int this_optind = optind ? optind : 1;
412 :
413 105 : c = getopt_long (argc, argv, "0123456789C:a:b:dl:", longopts, NULL);
414 105 : if (c == -1)
415 50 : break;
416 :
417 55 : switch (c)
418 : {
419 7 : case 'a':
420 : {
421 : unsigned long tmp;
422 7 : if (xstrtoul (optarg, NULL, 10, &tmp, "") != LONGINT_OK
423 2 : || SIZE_MAX / sizeof (size_t) < tmp)
424 : {
425 5 : error (0, 0, _("%s: invalid suffix length"), optarg);
426 5 : usage (EXIT_FAILURE);
427 : }
428 2 : suffix_length = tmp;
429 : }
430 2 : break;
431 :
432 21 : case 'b':
433 21 : if (split_type != type_undef)
434 0 : FAIL_ONLY_ONE_WAY ();
435 21 : split_type = type_bytes;
436 21 : if (xstrtoumax (optarg, NULL, 10, &n_units, multipliers) != LONGINT_OK
437 12 : || n_units == 0)
438 : {
439 10 : error (0, 0, _("%s: invalid number of bytes"), optarg);
440 10 : usage (EXIT_FAILURE);
441 : }
442 11 : break;
443 :
444 1 : case 'l':
445 1 : if (split_type != type_undef)
446 0 : FAIL_ONLY_ONE_WAY ();
447 1 : split_type = type_lines;
448 1 : if (xstrtoumax (optarg, NULL, 10, &n_units, "") != LONGINT_OK
449 0 : || n_units == 0)
450 : {
451 1 : error (0, 0, _("%s: invalid number of lines"), optarg);
452 1 : usage (EXIT_FAILURE);
453 : }
454 0 : break;
455 :
456 2 : case 'C':
457 2 : if (split_type != type_undef)
458 1 : FAIL_ONLY_ONE_WAY ();
459 1 : split_type = type_byteslines;
460 1 : if (xstrtoumax (optarg, NULL, 10, &n_units, multipliers) != LONGINT_OK
461 0 : || n_units == 0 || SIZE_MAX < n_units)
462 : {
463 1 : error (0, 0, _("%s: invalid number of bytes"), optarg);
464 1 : usage (EXIT_FAILURE);
465 : }
466 0 : break;
467 :
468 12 : case '0':
469 : case '1':
470 : case '2':
471 : case '3':
472 : case '4':
473 : case '5':
474 : case '6':
475 : case '7':
476 : case '8':
477 : case '9':
478 12 : if (split_type == type_undef)
479 : {
480 8 : split_type = type_digits;
481 8 : n_units = 0;
482 : }
483 12 : if (split_type != type_undef && split_type != type_digits)
484 1 : FAIL_ONLY_ONE_WAY ();
485 11 : if (digits_optind != 0 && digits_optind != this_optind)
486 1 : n_units = 0; /* More than one number given; ignore other. */
487 11 : digits_optind = this_optind;
488 11 : if (!DECIMAL_DIGIT_ACCUMULATE (n_units, c - '0', uintmax_t))
489 : {
490 : char buffer[INT_BUFSIZE_BOUND (uintmax_t)];
491 0 : error (EXIT_FAILURE, 0,
492 : _("line count option -%s%c... is too large"),
493 : umaxtostr (n_units, buffer), c);
494 : }
495 11 : break;
496 :
497 1 : case 'd':
498 1 : suffix_alphabet = "0123456789";
499 1 : break;
500 :
501 1 : case VERBOSE_OPTION:
502 1 : verbose = true;
503 1 : break;
504 :
505 1 : case_GETOPT_HELP_CHAR;
506 :
507 1 : case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
508 :
509 8 : default:
510 8 : usage (EXIT_FAILURE);
511 : }
512 : }
513 :
514 : /* Handle default case. */
515 50 : if (split_type == type_undef)
516 : {
517 33 : split_type = type_lines;
518 33 : n_units = 1000;
519 : }
520 :
521 50 : if (n_units == 0)
522 : {
523 5 : error (0, 0, _("invalid number of lines: 0"));
524 5 : usage (EXIT_FAILURE);
525 : }
526 :
527 : /* Get out the filename arguments. */
528 :
529 45 : if (optind < argc)
530 30 : infile = argv[optind++];
531 :
532 45 : if (optind < argc)
533 26 : outbase = argv[optind++];
534 :
535 45 : if (optind < argc)
536 : {
537 14 : error (0, 0, _("extra operand %s"), quote (argv[optind]));
538 14 : usage (EXIT_FAILURE);
539 : }
540 :
541 : /* Open the input file. */
542 31 : if (! STREQ (infile, "-")
543 7 : && fd_reopen (STDIN_FILENO, infile, O_RDONLY, 0) < 0)
544 3 : error (EXIT_FAILURE, errno, _("cannot open %s for reading"),
545 : quote (infile));
546 :
547 : /* Binary I/O is safer when bytecounts are used. */
548 : if (O_BINARY && ! isatty (STDIN_FILENO))
549 : freopen (NULL, "rb", stdin);
550 :
551 : /* No output file is open now. */
552 28 : output_desc = -1;
553 :
554 : /* Get the optimal block size of input device and make a buffer. */
555 :
556 28 : if (fstat (STDIN_FILENO, &stat_buf) != 0)
557 0 : error (EXIT_FAILURE, errno, "%s", infile);
558 28 : in_blk_size = ST_BLKSIZE (stat_buf);
559 :
560 28 : buf = ptr_align (xmalloc (in_blk_size + 1 + page_size - 1), page_size);
561 :
562 28 : switch (split_type)
563 : {
564 18 : case type_digits:
565 : case type_lines:
566 18 : lines_split (n_units, buf, in_blk_size);
567 16 : break;
568 :
569 10 : case type_bytes:
570 10 : bytes_split (n_units, buf, in_blk_size);
571 10 : break;
572 :
573 0 : case type_byteslines:
574 0 : line_bytes_split (n_units);
575 0 : break;
576 :
577 0 : default:
578 0 : abort ();
579 : }
580 :
581 26 : if (close (STDIN_FILENO) != 0)
582 0 : error (EXIT_FAILURE, errno, "%s", infile);
583 26 : if (output_desc >= 0 && close (output_desc) < 0)
584 0 : error (EXIT_FAILURE, errno, "%s", outfile);
585 :
586 26 : exit (EXIT_SUCCESS);
587 : }
|