| 1 | /* cut - remove parts of lines of files
|
|---|
| 2 | Copyright (C) 1997-2005 Free Software Foundation, Inc.
|
|---|
| 3 | Copyright (C) 1984 David M. Ihnat
|
|---|
| 4 |
|
|---|
| 5 | This program is free software; you can redistribute it and/or modify
|
|---|
| 6 | it under the terms of the GNU General Public License as published by
|
|---|
| 7 | the Free Software Foundation; either version 2, or (at your option)
|
|---|
| 8 | any later version.
|
|---|
| 9 |
|
|---|
| 10 | This program is distributed in the hope that it will be useful,
|
|---|
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|---|
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|---|
| 13 | GNU General Public License for more details.
|
|---|
| 14 |
|
|---|
| 15 | You should have received a copy of the GNU General Public License
|
|---|
| 16 | along with this program; if not, write to the Free Software Foundation,
|
|---|
| 17 | Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
|
|---|
| 18 |
|
|---|
| 19 | /* Written by David Ihnat. */
|
|---|
| 20 |
|
|---|
| 21 | /* POSIX changes, bug fixes, long-named options, and cleanup
|
|---|
| 22 | by David MacKenzie <[email protected]>.
|
|---|
| 23 |
|
|---|
| 24 | Rewrite cut_fields and cut_bytes -- Jim Meyering. */
|
|---|
| 25 |
|
|---|
| 26 | #include <config.h>
|
|---|
| 27 |
|
|---|
| 28 | #include <stdio.h>
|
|---|
| 29 | #include <assert.h>
|
|---|
| 30 | #include <getopt.h>
|
|---|
| 31 | #include <sys/types.h>
|
|---|
| 32 | #include "system.h"
|
|---|
| 33 |
|
|---|
| 34 | #include "error.h"
|
|---|
| 35 | #include "getndelim2.h"
|
|---|
| 36 | #include "hash.h"
|
|---|
| 37 | #include "quote.h"
|
|---|
| 38 | #include "xstrndup.h"
|
|---|
| 39 |
|
|---|
| 40 | /* The official name of this program (e.g., no `g' prefix). */
|
|---|
| 41 | #define PROGRAM_NAME "cut"
|
|---|
| 42 |
|
|---|
| 43 | #define AUTHORS "David Ihnat", "David MacKenzie", "Jim Meyering"
|
|---|
| 44 |
|
|---|
/* Report MESSAGE as a diagnostic, then call usage (EXIT_FAILURE), which
   prints the `--help' hint and exits.  MESSAGE is passed as a "%s"
   argument rather than as the format itself, so a `%' in a (possibly
   translated) message can never be interpreted as a conversion.  */
#define FATAL_ERROR(Message) \
  do \
    { \
      error (0, 0, "%s", (Message)); \
      usage (EXIT_FAILURE); \
    } \
  while (0)
|
|---|
| 52 |
|
|---|
/* Append the pair LOW, HIGH to the array RP of range pairs, growing the
   array with X2NREALLOC when it is full.  This macro relies on two
   variables of the invoking function: N_RP, the number of pairs stored
   so far (incremented here), and N_RP_ALLOCATED, the current capacity
   (updated when the array grows).  RP is evaluated more than once, so
   it must be a side-effect-free lvalue.  */

#define ADD_RANGE_PAIR(rp, low, high) \
  do \
    { \
      if (n_rp >= n_rp_allocated) \
        { \
          (rp) = X2NREALLOC (rp, &n_rp_allocated); \
        } \
      (rp)[n_rp].lo = (low); \
      (rp)[n_rp].hi = (high); \
      ++n_rp; \
    } \
  while (0)
|
|---|
| 69 |
|
|---|
/* One closed range LO..HI of byte or field indices, as collected from
   the LIST argument by set_fields.  A single index N is stored as the
   degenerate pair N..N.  */
struct range_pair
  {
    size_t lo;  /* First index of the range.  */
    size_t hi;  /* Last index of the range (inclusive).  */
  };
|
|---|
| 75 |
|
|---|
/* This buffer is used to support the semantics of the -s option
   (or lack of same) when the specified field list includes (does
   not include) the first field.  In both of those cases, the entire
   first field must be read into this buffer to determine whether it
   is followed by a delimiter or a newline before any of it may be
   output.  Otherwise, cut_fields can do the job without using this
   buffer.  It is filled by getndelim2, which may reallocate it.  */
static char *field_1_buffer;

/* The number of bytes allocated for FIELD_1_BUFFER.  */
static size_t field_1_bufsize;

/* The largest field or byte index used as an endpoint of a closed
   or degenerate range specification; this doesn't include the starting
   index of right-open-ended ranges.  For example, with either range spec
   `2-5,9-', `2-3,5,9-' this variable would be set to 5.  */
static size_t max_range_endpoint;

/* If nonzero, this is the index of the first field or byte in a range
   that goes to end of line.  Zero means there is no such range.  */
static size_t eol_range_start;

/* This is a bit vector.
   In byte mode, which bytes to output.
   In field mode, which DELIM-separated fields to output.
   Both bytes and fields are numbered starting with 1,
   so the zeroth bit of this array is unused.
   The vector is allocated by set_fields and holds bits only for
   indices up to MAX_RANGE_ENDPOINT.
   A field or byte K has been selected if
   (K <= MAX_RANGE_ENDPOINT and is_printable_field(K))
    || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START).  */
static unsigned char *printable_field;
|
|---|
| 107 |
|
|---|
/* What the LIST argument selects; chosen in main from -b/-c versus -f.  */
enum operating_mode
  {
    /* No -b, -c or -f option has been seen yet.  */
    undefined_mode,

    /* Output characters that are in the given bytes. */
    byte_mode,

    /* Output the given delimiter-separated fields. */
    field_mode
  };
|
|---|
| 118 |
|
|---|
/* The name this program was run with. */
char *program_name;

/* The mode selected on the command line (bytes or fields).  */
static enum operating_mode operating_mode;

/* If true, do not output lines containing no delimiter characters.
   Otherwise, all such lines are printed.  This option is valid only
   with field mode.  */
static bool suppress_non_delimited;

/* If true, print all bytes, characters, or fields _except_
   those that were specified.  */
static bool complement;

/* The delimiter character for field mode. */
static unsigned char delim;

/* True if the --output-delimiter=STRING option was specified.  */
static bool output_delimiter_specified;

/* The length of output_delimiter_string.  It is kept separately because
   an empty --output-delimiter is treated as a single NUL byte.  */
static size_t output_delimiter_length;

/* The output field separator string.  Defaults to the 1-character
   string consisting of the input delimiter.  */
static char *output_delimiter_string;

/* True if we have ever read standard input. */
static bool have_read_stdin;

/* Initial capacity hint passed to hash_initialize for RANGE_START_HT.  */
#define HT_RANGE_START_INDEX_INITIAL_CAPACITY 31

/* The set of range-start indices.  For example, given a range-spec list like
   `-b1,3-5,4-9,15-', the following indices will be recorded here: 1, 3, 15.
   Note that although `4' looks like a range-start index, it is in the middle
   of the `3-5' range, so it doesn't count.
   This table is created/used IFF output_delimiter_specified is set.  */
static Hash_table *range_start_ht;
|
|---|
| 157 |
|
|---|
/* For long options that have no equivalent short option, use a
   non-character as a pseudo short option, starting with CHAR_MAX + 1.
   These values are what getopt_long returns for the corresponding
   entries of LONGOPTS.  */
enum
{
  OUTPUT_DELIMITER_OPTION = CHAR_MAX + 1,  /* --output-delimiter=STRING */
  COMPLEMENT_OPTION                        /* --complement */
};
|
|---|
| 165 |
|
|---|
/* Long-option table handed to getopt_long in main.  Each entry maps a
   long name to the value getopt_long returns for it: the equivalent
   short option character, or one of the pseudo option codes above.
   The all-zero entry terminates the table.  */
static struct option const longopts[] =
{
  {"bytes", required_argument, NULL, 'b'},
  {"characters", required_argument, NULL, 'c'},
  {"fields", required_argument, NULL, 'f'},
  {"delimiter", required_argument, NULL, 'd'},
  {"only-delimited", no_argument, NULL, 's'},
  {"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION},
  {"complement", no_argument, NULL, COMPLEMENT_OPTION},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {NULL, 0, NULL, 0}
};
|
|---|
| 179 |
|
|---|
/* Print usage information and exit with STATUS.
   When STATUS is not EXIT_SUCCESS, only a one-line hint pointing at
   --help is written to stderr; otherwise the full help text is written
   to stdout.  This function does not return.  */
void
usage (int status)
{
  if (status != EXIT_SUCCESS)
    fprintf (stderr, _("Try `%s --help' for more information.\n"),
             program_name);
  else
    {
      /* The help text is emitted in several pieces so that each piece
         is a separate translatable string.  */
      printf (_("\
Usage: %s [OPTION]... [FILE]...\n\
"),
              program_name);
      fputs (_("\
Print selected parts of lines from each FILE to standard output.\n\
\n\
"), stdout);
      fputs (_("\
Mandatory arguments to long options are mandatory for short options too.\n\
"), stdout);
      fputs (_("\
-b, --bytes=LIST select only these bytes\n\
-c, --characters=LIST select only these characters\n\
-d, --delimiter=DELIM use DELIM instead of TAB for field delimiter\n\
"), stdout);
      fputs (_("\
-f, --fields=LIST select only these fields; also print any line\n\
that contains no delimiter character, unless\n\
the -s option is specified\n\
-n (ignored)\n\
"), stdout);
      fputs (_("\
--complement complement the set of selected bytes, characters\n\
or fields.\n\
"), stdout);
      fputs (_("\
-s, --only-delimited do not print lines not containing delimiters\n\
--output-delimiter=STRING use STRING as the output delimiter\n\
the default is to use the input delimiter\n\
"), stdout);
      fputs (HELP_OPTION_DESCRIPTION, stdout);
      fputs (VERSION_OPTION_DESCRIPTION, stdout);
      fputs (_("\
\n\
Use one, and only one of -b, -c or -f. Each LIST is made up of one\n\
range, or many ranges separated by commas. Selected input is written\n\
in the same order that it is read, and is written exactly once.\n\
Each range is one of:\n\
\n\
N N'th byte, character or field, counted from 1\n\
N- from N'th byte, character or field, to end of line\n\
N-M from N'th to M'th (included) byte, character or field\n\
-M from first to M'th (included) byte, character or field\n\
\n\
With no FILE, or when FILE is -, read standard input.\n\
"), stdout);
      printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
    }
  exit (status);
}
|
|---|
| 239 |
|
|---|
| 240 | static inline void
|
|---|
| 241 | mark_range_start (size_t i)
|
|---|
| 242 | {
|
|---|
| 243 | /* Record the fact that `i' is a range-start index. */
|
|---|
| 244 | void *ent_from_table = hash_insert (range_start_ht, (void*) i);
|
|---|
| 245 | if (ent_from_table == NULL)
|
|---|
| 246 | {
|
|---|
| 247 | /* Insertion failed due to lack of memory. */
|
|---|
| 248 | xalloc_die ();
|
|---|
| 249 | }
|
|---|
| 250 | assert ((size_t) ent_from_table == i);
|
|---|
| 251 | }
|
|---|
| 252 |
|
|---|
| 253 | static inline void
|
|---|
| 254 | mark_printable_field (size_t i)
|
|---|
| 255 | {
|
|---|
| 256 | size_t n = i / CHAR_BIT;
|
|---|
| 257 | printable_field[n] |= (1 << (i % CHAR_BIT));
|
|---|
| 258 | }
|
|---|
| 259 |
|
|---|
| 260 | static inline bool
|
|---|
| 261 | is_printable_field (size_t i)
|
|---|
| 262 | {
|
|---|
| 263 | size_t n = i / CHAR_BIT;
|
|---|
| 264 | return (printable_field[n] >> (i % CHAR_BIT)) & 1;
|
|---|
| 265 | }
|
|---|
| 266 |
|
|---|
/* Hash function for RANGE_START_HT: the entry X is an integer stored
   in a pointer, and its bucket is that integer modulo TABLESIZE.  */
static size_t
hash_int (const void *x, size_t tablesize)
{
#ifdef UINTPTR_MAX
  return (uintptr_t) x % tablesize;
#else
  return (size_t) x % tablesize;
#endif
}
|
|---|
| 277 |
|
|---|
/* Comparison function for RANGE_START_HT: two entries are equal
   exactly when the pointer values X and Y are the same.  */
static bool
hash_compare_ints (void const *x, void const *y)
{
  return x == y;
}
|
|---|
| 283 |
|
|---|
| 284 | static bool
|
|---|
| 285 | is_range_start_index (size_t i)
|
|---|
| 286 | {
|
|---|
| 287 | return hash_lookup (range_start_ht, (void *) i) ? true : false;
|
|---|
| 288 | }
|
|---|
| 289 |
|
|---|
| 290 | /* Return nonzero if the K'th field or byte is printable.
|
|---|
| 291 | When returning nonzero, if RANGE_START is non-NULL,
|
|---|
| 292 | set *RANGE_START to true if K is the beginning of a range, and to
|
|---|
| 293 | false otherwise. */
|
|---|
| 294 |
|
|---|
| 295 | static bool
|
|---|
| 296 | print_kth (size_t k, bool *range_start)
|
|---|
| 297 | {
|
|---|
| 298 | bool k_selected
|
|---|
| 299 | = ((0 < eol_range_start && eol_range_start <= k)
|
|---|
| 300 | || (k <= max_range_endpoint && is_printable_field (k)));
|
|---|
| 301 |
|
|---|
| 302 | bool is_selected = k_selected ^ complement;
|
|---|
| 303 | if (range_start && is_selected)
|
|---|
| 304 | *range_start = is_range_start_index (k);
|
|---|
| 305 |
|
|---|
| 306 | return is_selected;
|
|---|
| 307 | }
|
|---|
| 308 |
|
|---|
| 309 | /* Comparison function for qsort to order the list of
|
|---|
| 310 | struct range_pairs. */
|
|---|
| 311 | static int
|
|---|
| 312 | compare_ranges (const void *a, const void *b)
|
|---|
| 313 | {
|
|---|
| 314 | int a_start = ((const struct range_pair *) a)->lo;
|
|---|
| 315 | int b_start = ((const struct range_pair *) b)->lo;
|
|---|
| 316 | return a_start < b_start ? -1 : a_start > b_start;
|
|---|
| 317 | }
|
|---|
| 318 |
|
|---|
| 319 | /* Given the list of field or byte range specifications FIELDSTR, set
|
|---|
| 320 | MAX_RANGE_ENDPOINT and allocate and initialize the PRINTABLE_FIELD
|
|---|
| 321 | array. If there is a right-open-ended range, set EOL_RANGE_START
|
|---|
| 322 | to its starting index. FIELDSTR should be composed of one or more
|
|---|
| 323 | numbers or ranges of numbers, separated by blanks or commas.
|
|---|
| 324 | Incomplete ranges may be given: `-m' means `1-m'; `n-' means `n'
|
|---|
| 325 | through end of line. Return true if FIELDSTR contains at least
|
|---|
| 326 | one field specification, false otherwise. */
|
|---|
| 327 |
|
|---|
| 328 | /* FIXME-someday: What if the user wants to cut out the 1,000,000-th
|
|---|
| 329 | field of some huge input file? This function shouldn't have to
|
|---|
| 330 | allocate a table of a million bits just so we can test every
|
|---|
| 331 | field < 10^6 with an array dereference. Instead, consider using
|
|---|
| 332 | an adaptive approach: if the range of selected fields is too large,
|
|---|
| 333 | but only a few fields/byte-offsets are actually selected, use a
|
|---|
| 334 | hash table. If the range of selected fields is too large, and
|
|---|
| 335 | too many are selected, then resort to using the range-pairs (the
|
|---|
| 336 | `rp' array) directly. */
|
|---|
| 337 |
|
|---|
| 338 | static bool
|
|---|
| 339 | set_fields (const char *fieldstr)
|
|---|
| 340 | {
|
|---|
| 341 | size_t initial = 1; /* Value of first number in a range. */
|
|---|
| 342 | size_t value = 0; /* If nonzero, a number being accumulated. */
|
|---|
| 343 | bool dash_found = false; /* True if a '-' is found in this field. */
|
|---|
| 344 | bool field_found = false; /* True if at least one field spec
|
|---|
| 345 | has been processed. */
|
|---|
| 346 |
|
|---|
| 347 | struct range_pair *rp = NULL;
|
|---|
| 348 | size_t n_rp = 0;
|
|---|
| 349 | size_t n_rp_allocated = 0;
|
|---|
| 350 | size_t i;
|
|---|
| 351 | bool in_digits = false;
|
|---|
| 352 |
|
|---|
| 353 | /* Collect and store in RP the range end points.
|
|---|
| 354 | It also sets EOL_RANGE_START if appropriate. */
|
|---|
| 355 |
|
|---|
| 356 | for (;;)
|
|---|
| 357 | {
|
|---|
| 358 | if (*fieldstr == '-')
|
|---|
| 359 | {
|
|---|
| 360 | in_digits = false;
|
|---|
| 361 | /* Starting a range. */
|
|---|
| 362 | if (dash_found)
|
|---|
| 363 | FATAL_ERROR (_("invalid byte or field list"));
|
|---|
| 364 | dash_found = true;
|
|---|
| 365 | fieldstr++;
|
|---|
| 366 |
|
|---|
| 367 | if (value)
|
|---|
| 368 | {
|
|---|
| 369 | initial = value;
|
|---|
| 370 | value = 0;
|
|---|
| 371 | }
|
|---|
| 372 | else
|
|---|
| 373 | initial = 1;
|
|---|
| 374 | }
|
|---|
| 375 | else if (*fieldstr == ',' || ISBLANK (*fieldstr) || *fieldstr == '\0')
|
|---|
| 376 | {
|
|---|
| 377 | in_digits = false;
|
|---|
| 378 | /* Ending the string, or this field/byte sublist. */
|
|---|
| 379 | if (dash_found)
|
|---|
| 380 | {
|
|---|
| 381 | dash_found = false;
|
|---|
| 382 |
|
|---|
| 383 | /* A range. Possibilites: -n, m-n, n-.
|
|---|
| 384 | In any case, `initial' contains the start of the range. */
|
|---|
| 385 | if (value == 0)
|
|---|
| 386 | {
|
|---|
| 387 | /* `n-'. From `initial' to end of line. */
|
|---|
| 388 | eol_range_start = initial;
|
|---|
| 389 | field_found = true;
|
|---|
| 390 | }
|
|---|
| 391 | else
|
|---|
| 392 | {
|
|---|
| 393 | /* `m-n' or `-n' (1-n). */
|
|---|
| 394 | if (value < initial)
|
|---|
| 395 | FATAL_ERROR (_("invalid byte or field list"));
|
|---|
| 396 |
|
|---|
| 397 | /* Is there already a range going to end of line? */
|
|---|
| 398 | if (eol_range_start != 0)
|
|---|
| 399 | {
|
|---|
| 400 | /* Yes. Is the new sequence already contained
|
|---|
| 401 | in the old one? If so, no processing is
|
|---|
| 402 | necessary. */
|
|---|
| 403 | if (initial < eol_range_start)
|
|---|
| 404 | {
|
|---|
| 405 | /* No, the new sequence starts before the
|
|---|
| 406 | old. Does the old range going to end of line
|
|---|
| 407 | extend into the new range? */
|
|---|
| 408 | if (eol_range_start <= value)
|
|---|
| 409 | {
|
|---|
| 410 | /* Yes. Simply move the end of line marker. */
|
|---|
| 411 | eol_range_start = initial;
|
|---|
| 412 | }
|
|---|
| 413 | else
|
|---|
| 414 | {
|
|---|
| 415 | /* No. A simple range, before and disjoint from
|
|---|
| 416 | the range going to end of line. Fill it. */
|
|---|
| 417 | ADD_RANGE_PAIR (rp, initial, value);
|
|---|
| 418 | }
|
|---|
| 419 |
|
|---|
| 420 | /* In any case, some fields were selected. */
|
|---|
| 421 | field_found = true;
|
|---|
| 422 | }
|
|---|
| 423 | }
|
|---|
| 424 | else
|
|---|
| 425 | {
|
|---|
| 426 | /* There is no range going to end of line. */
|
|---|
| 427 | ADD_RANGE_PAIR (rp, initial, value);
|
|---|
| 428 | field_found = true;
|
|---|
| 429 | }
|
|---|
| 430 | value = 0;
|
|---|
| 431 | }
|
|---|
| 432 | }
|
|---|
| 433 | else if (value != 0)
|
|---|
| 434 | {
|
|---|
| 435 | /* A simple field number, not a range. */
|
|---|
| 436 | ADD_RANGE_PAIR (rp, value, value);
|
|---|
| 437 | value = 0;
|
|---|
| 438 | field_found = true;
|
|---|
| 439 | }
|
|---|
| 440 |
|
|---|
| 441 | if (*fieldstr == '\0')
|
|---|
| 442 | {
|
|---|
| 443 | break;
|
|---|
| 444 | }
|
|---|
| 445 |
|
|---|
| 446 | fieldstr++;
|
|---|
| 447 | }
|
|---|
| 448 | else if (ISDIGIT (*fieldstr))
|
|---|
| 449 | {
|
|---|
| 450 | /* Record beginning of digit string, in case we have to
|
|---|
| 451 | complain about it. */
|
|---|
| 452 | static char const *num_start;
|
|---|
| 453 | if (!in_digits || !num_start)
|
|---|
| 454 | num_start = fieldstr;
|
|---|
| 455 | in_digits = true;
|
|---|
| 456 |
|
|---|
| 457 | /* Detect overflow. */
|
|---|
| 458 | if (!DECIMAL_DIGIT_ACCUMULATE (value, *fieldstr - '0', size_t))
|
|---|
| 459 | {
|
|---|
| 460 | /* In case the user specified -c4294967296,22,
|
|---|
| 461 | complain only about the first number. */
|
|---|
| 462 | /* Determine the length of the offending number. */
|
|---|
| 463 | size_t len = strspn (num_start, "0123456789");
|
|---|
| 464 | char *bad_num = xstrndup (num_start, len);
|
|---|
| 465 | if (operating_mode == byte_mode)
|
|---|
| 466 | error (0, 0,
|
|---|
| 467 | _("byte offset %s is too large"), quote (bad_num));
|
|---|
| 468 | else
|
|---|
| 469 | error (0, 0,
|
|---|
| 470 | _("field number %s is too large"), quote (bad_num));
|
|---|
| 471 | free (bad_num);
|
|---|
| 472 | exit (EXIT_FAILURE);
|
|---|
| 473 | }
|
|---|
| 474 |
|
|---|
| 475 | fieldstr++;
|
|---|
| 476 | }
|
|---|
| 477 | else
|
|---|
| 478 | FATAL_ERROR (_("invalid byte or field list"));
|
|---|
| 479 | }
|
|---|
| 480 |
|
|---|
| 481 | max_range_endpoint = 0;
|
|---|
| 482 | for (i = 0; i < n_rp; i++)
|
|---|
| 483 | {
|
|---|
| 484 | if (rp[i].hi > max_range_endpoint)
|
|---|
| 485 | max_range_endpoint = rp[i].hi;
|
|---|
| 486 | }
|
|---|
| 487 |
|
|---|
| 488 | /* Allocate an array large enough so that it may be indexed by
|
|---|
| 489 | the field numbers corresponding to all finite ranges
|
|---|
| 490 | (i.e. `2-6' or `-4', but not `5-') in FIELDSTR. */
|
|---|
| 491 |
|
|---|
| 492 | printable_field = xzalloc (max_range_endpoint / CHAR_BIT + 1);
|
|---|
| 493 |
|
|---|
| 494 | qsort (rp, n_rp, sizeof (rp[0]), compare_ranges);
|
|---|
| 495 |
|
|---|
| 496 | /* Set the array entries corresponding to integers in the ranges of RP. */
|
|---|
| 497 | for (i = 0; i < n_rp; i++)
|
|---|
| 498 | {
|
|---|
| 499 | size_t j;
|
|---|
| 500 | size_t rsi_candidate;
|
|---|
| 501 |
|
|---|
| 502 | /* Record the range-start indices, i.e., record each start
|
|---|
| 503 | index that is not part of any other (lo..hi] range. */
|
|---|
| 504 | rsi_candidate = complement ? rp[i].hi + 1 : rp[i].lo;
|
|---|
| 505 | if (output_delimiter_specified
|
|---|
| 506 | && !is_printable_field (rsi_candidate))
|
|---|
| 507 | mark_range_start (rsi_candidate);
|
|---|
| 508 |
|
|---|
| 509 | for (j = rp[i].lo; j <= rp[i].hi; j++)
|
|---|
| 510 | mark_printable_field (j);
|
|---|
| 511 | }
|
|---|
| 512 |
|
|---|
| 513 | if (output_delimiter_specified
|
|---|
| 514 | && !complement
|
|---|
| 515 | && eol_range_start && !is_printable_field (eol_range_start))
|
|---|
| 516 | mark_range_start (eol_range_start);
|
|---|
| 517 |
|
|---|
| 518 | free (rp);
|
|---|
| 519 |
|
|---|
| 520 | return field_found;
|
|---|
| 521 | }
|
|---|
| 522 |
|
|---|
| 523 | /* Read from stream STREAM, printing to standard output any selected bytes. */
|
|---|
| 524 |
|
|---|
| 525 | static void
|
|---|
| 526 | cut_bytes (FILE *stream)
|
|---|
| 527 | {
|
|---|
| 528 | size_t byte_idx; /* Number of bytes in the line so far. */
|
|---|
| 529 | /* Whether to begin printing delimiters between ranges for the current line.
|
|---|
| 530 | Set after we've begun printing data corresponding to the first range. */
|
|---|
| 531 | bool print_delimiter;
|
|---|
| 532 |
|
|---|
| 533 | byte_idx = 0;
|
|---|
| 534 | print_delimiter = false;
|
|---|
| 535 | while (1)
|
|---|
| 536 | {
|
|---|
| 537 | int c; /* Each character from the file. */
|
|---|
| 538 |
|
|---|
| 539 | c = getc (stream);
|
|---|
| 540 |
|
|---|
| 541 | if (c == '\n')
|
|---|
| 542 | {
|
|---|
| 543 | putchar ('\n');
|
|---|
| 544 | byte_idx = 0;
|
|---|
| 545 | print_delimiter = false;
|
|---|
| 546 | }
|
|---|
| 547 | else if (c == EOF)
|
|---|
| 548 | {
|
|---|
| 549 | if (byte_idx > 0)
|
|---|
| 550 | putchar ('\n');
|
|---|
| 551 | break;
|
|---|
| 552 | }
|
|---|
| 553 | else
|
|---|
| 554 | {
|
|---|
| 555 | bool range_start;
|
|---|
| 556 | bool *rs = output_delimiter_specified ? &range_start : NULL;
|
|---|
| 557 | if (print_kth (++byte_idx, rs))
|
|---|
| 558 | {
|
|---|
| 559 | if (rs && *rs && print_delimiter)
|
|---|
| 560 | {
|
|---|
| 561 | fwrite (output_delimiter_string, sizeof (char),
|
|---|
| 562 | output_delimiter_length, stdout);
|
|---|
| 563 | }
|
|---|
| 564 | print_delimiter = true;
|
|---|
| 565 | putchar (c);
|
|---|
| 566 | }
|
|---|
| 567 | }
|
|---|
| 568 | }
|
|---|
| 569 | }
|
|---|
| 570 |
|
|---|
| 571 | /* Read from stream STREAM, printing to standard output any selected fields. */
|
|---|
| 572 |
|
|---|
| 573 | static void
|
|---|
| 574 | cut_fields (FILE *stream)
|
|---|
| 575 | {
|
|---|
| 576 | int c;
|
|---|
| 577 | size_t field_idx = 1;
|
|---|
| 578 | bool found_any_selected_field = false;
|
|---|
| 579 | bool buffer_first_field;
|
|---|
| 580 |
|
|---|
| 581 | c = getc (stream);
|
|---|
| 582 | if (c == EOF)
|
|---|
| 583 | return;
|
|---|
| 584 |
|
|---|
| 585 | ungetc (c, stream);
|
|---|
| 586 |
|
|---|
| 587 | /* To support the semantics of the -s flag, we may have to buffer
|
|---|
| 588 | all of the first field to determine whether it is `delimited.'
|
|---|
| 589 | But that is unnecessary if all non-delimited lines must be printed
|
|---|
| 590 | and the first field has been selected, or if non-delimited lines
|
|---|
| 591 | must be suppressed and the first field has *not* been selected.
|
|---|
| 592 | That is because a non-delimited line has exactly one field. */
|
|---|
| 593 | buffer_first_field = (suppress_non_delimited ^ !print_kth (1, NULL));
|
|---|
| 594 |
|
|---|
| 595 | while (1)
|
|---|
| 596 | {
|
|---|
| 597 | if (field_idx == 1 && buffer_first_field)
|
|---|
| 598 | {
|
|---|
| 599 | ssize_t len;
|
|---|
| 600 | size_t n_bytes;
|
|---|
| 601 |
|
|---|
| 602 | len = getndelim2 (&field_1_buffer, &field_1_bufsize, 0,
|
|---|
| 603 | GETNLINE_NO_LIMIT, delim, '\n', stream);
|
|---|
| 604 | if (len < 0)
|
|---|
| 605 | {
|
|---|
| 606 | free (field_1_buffer);
|
|---|
| 607 | if (ferror (stream) || feof (stream))
|
|---|
| 608 | break;
|
|---|
| 609 | xalloc_die ();
|
|---|
| 610 | }
|
|---|
| 611 |
|
|---|
| 612 | n_bytes = len;
|
|---|
| 613 | assert (n_bytes != 0);
|
|---|
| 614 |
|
|---|
| 615 | /* If the first field extends to the end of line (it is not
|
|---|
| 616 | delimited) and we are printing all non-delimited lines,
|
|---|
| 617 | print this one. */
|
|---|
| 618 | if (to_uchar (field_1_buffer[n_bytes - 1]) != delim)
|
|---|
| 619 | {
|
|---|
| 620 | if (suppress_non_delimited)
|
|---|
| 621 | {
|
|---|
| 622 | /* Empty. */
|
|---|
| 623 | }
|
|---|
| 624 | else
|
|---|
| 625 | {
|
|---|
| 626 | fwrite (field_1_buffer, sizeof (char), n_bytes, stdout);
|
|---|
| 627 | /* Make sure the output line is newline terminated. */
|
|---|
| 628 | if (field_1_buffer[n_bytes - 1] != '\n')
|
|---|
| 629 | putchar ('\n');
|
|---|
| 630 | }
|
|---|
| 631 | continue;
|
|---|
| 632 | }
|
|---|
| 633 | if (print_kth (1, NULL))
|
|---|
| 634 | {
|
|---|
| 635 | /* Print the field, but not the trailing delimiter. */
|
|---|
| 636 | fwrite (field_1_buffer, sizeof (char), n_bytes - 1, stdout);
|
|---|
| 637 | found_any_selected_field = true;
|
|---|
| 638 | }
|
|---|
| 639 | ++field_idx;
|
|---|
| 640 | }
|
|---|
| 641 |
|
|---|
| 642 | if (c != EOF)
|
|---|
| 643 | {
|
|---|
| 644 | if (print_kth (field_idx, NULL))
|
|---|
| 645 | {
|
|---|
| 646 | if (found_any_selected_field)
|
|---|
| 647 | {
|
|---|
| 648 | fwrite (output_delimiter_string, sizeof (char),
|
|---|
| 649 | output_delimiter_length, stdout);
|
|---|
| 650 | }
|
|---|
| 651 | found_any_selected_field = true;
|
|---|
| 652 |
|
|---|
| 653 | while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
|
|---|
| 654 | {
|
|---|
| 655 | putchar (c);
|
|---|
| 656 | }
|
|---|
| 657 | }
|
|---|
| 658 | else
|
|---|
| 659 | {
|
|---|
| 660 | while ((c = getc (stream)) != delim && c != '\n' && c != EOF)
|
|---|
| 661 | {
|
|---|
| 662 | /* Empty. */
|
|---|
| 663 | }
|
|---|
| 664 | }
|
|---|
| 665 | }
|
|---|
| 666 |
|
|---|
| 667 | if (c == '\n')
|
|---|
| 668 | {
|
|---|
| 669 | c = getc (stream);
|
|---|
| 670 | if (c != EOF)
|
|---|
| 671 | {
|
|---|
| 672 | ungetc (c, stream);
|
|---|
| 673 | c = '\n';
|
|---|
| 674 | }
|
|---|
| 675 | }
|
|---|
| 676 |
|
|---|
| 677 | if (c == delim)
|
|---|
| 678 | ++field_idx;
|
|---|
| 679 | else if (c == '\n' || c == EOF)
|
|---|
| 680 | {
|
|---|
| 681 | if (found_any_selected_field
|
|---|
| 682 | || !(suppress_non_delimited && field_idx == 1))
|
|---|
| 683 | putchar ('\n');
|
|---|
| 684 | if (c == EOF)
|
|---|
| 685 | break;
|
|---|
| 686 | field_idx = 1;
|
|---|
| 687 | found_any_selected_field = false;
|
|---|
| 688 | }
|
|---|
| 689 | }
|
|---|
| 690 | }
|
|---|
| 691 |
|
|---|
| 692 | static void
|
|---|
| 693 | cut_stream (FILE *stream)
|
|---|
| 694 | {
|
|---|
| 695 | if (operating_mode == byte_mode)
|
|---|
| 696 | cut_bytes (stream);
|
|---|
| 697 | else
|
|---|
| 698 | cut_fields (stream);
|
|---|
| 699 | }
|
|---|
| 700 |
|
|---|
| 701 | /* Process file FILE to standard output.
|
|---|
| 702 | Return true if successful. */
|
|---|
| 703 |
|
|---|
| 704 | static bool
|
|---|
| 705 | cut_file (char *file)
|
|---|
| 706 | {
|
|---|
| 707 | FILE *stream;
|
|---|
| 708 |
|
|---|
| 709 | if (STREQ (file, "-"))
|
|---|
| 710 | {
|
|---|
| 711 | have_read_stdin = true;
|
|---|
| 712 | stream = stdin;
|
|---|
| 713 | }
|
|---|
| 714 | else
|
|---|
| 715 | {
|
|---|
| 716 | stream = fopen (file, "r");
|
|---|
| 717 | if (stream == NULL)
|
|---|
| 718 | {
|
|---|
| 719 | error (0, errno, "%s", file);
|
|---|
| 720 | return false;
|
|---|
| 721 | }
|
|---|
| 722 | }
|
|---|
| 723 |
|
|---|
| 724 | cut_stream (stream);
|
|---|
| 725 |
|
|---|
| 726 | if (ferror (stream))
|
|---|
| 727 | {
|
|---|
| 728 | error (0, errno, "%s", file);
|
|---|
| 729 | return false;
|
|---|
| 730 | }
|
|---|
| 731 | if (STREQ (file, "-"))
|
|---|
| 732 | clearerr (stream); /* Also clear EOF. */
|
|---|
| 733 | else if (fclose (stream) == EOF)
|
|---|
| 734 | {
|
|---|
| 735 | error (0, errno, "%s", file);
|
|---|
| 736 | return false;
|
|---|
| 737 | }
|
|---|
| 738 | return true;
|
|---|
| 739 | }
|
|---|
| 740 |
|
|---|
| 741 | int
|
|---|
| 742 | main (int argc, char **argv)
|
|---|
| 743 | {
|
|---|
| 744 | int optc;
|
|---|
| 745 | bool ok;
|
|---|
| 746 | bool delim_specified = false;
|
|---|
| 747 | char *spec_list_string IF_LINT(= NULL);
|
|---|
| 748 |
|
|---|
| 749 | #ifdef __EMX__
|
|---|
| 750 | /* a undocumented hack */
|
|---|
| 751 | if (getenv("CUT_BINARY_MODE"))
|
|---|
| 752 | {
|
|---|
| 753 | extern int _fmode_bin;
|
|---|
| 754 | _fmode_bin = 1;
|
|---|
| 755 | if (!isatty(fileno(stdout)))
|
|---|
| 756 | freopen(NULL, "wb", stdout);
|
|---|
| 757 | if (!isatty(fileno(stdin)))
|
|---|
| 758 | freopen(NULL, "rb", stdin);
|
|---|
| 759 | }
|
|---|
| 760 | #endif
|
|---|
| 761 |
|
|---|
| 762 | initialize_main (&argc, &argv);
|
|---|
| 763 | program_name = argv[0];
|
|---|
| 764 | setlocale (LC_ALL, "");
|
|---|
| 765 | bindtextdomain (PACKAGE, LOCALEDIR);
|
|---|
| 766 | textdomain (PACKAGE);
|
|---|
| 767 |
|
|---|
| 768 | atexit (close_stdout);
|
|---|
| 769 |
|
|---|
| 770 | operating_mode = undefined_mode;
|
|---|
| 771 |
|
|---|
| 772 | /* By default, all non-delimited lines are printed. */
|
|---|
| 773 | suppress_non_delimited = false;
|
|---|
| 774 |
|
|---|
| 775 | delim = '\0';
|
|---|
| 776 | have_read_stdin = false;
|
|---|
| 777 |
|
|---|
| 778 | while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, NULL)) != -1)
|
|---|
| 779 | {
|
|---|
| 780 | switch (optc)
|
|---|
| 781 | {
|
|---|
| 782 | case 'b':
|
|---|
| 783 | case 'c':
|
|---|
| 784 | /* Build the byte list. */
|
|---|
| 785 | if (operating_mode != undefined_mode)
|
|---|
| 786 | FATAL_ERROR (_("only one type of list may be specified"));
|
|---|
| 787 | operating_mode = byte_mode;
|
|---|
| 788 | spec_list_string = optarg;
|
|---|
| 789 | break;
|
|---|
| 790 |
|
|---|
| 791 | case 'f':
|
|---|
| 792 | /* Build the field list. */
|
|---|
| 793 | if (operating_mode != undefined_mode)
|
|---|
| 794 | FATAL_ERROR (_("only one type of list may be specified"));
|
|---|
| 795 | operating_mode = field_mode;
|
|---|
| 796 | spec_list_string = optarg;
|
|---|
| 797 | break;
|
|---|
| 798 |
|
|---|
| 799 | case 'd':
|
|---|
| 800 | /* New delimiter. */
|
|---|
| 801 | /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
|
|---|
| 802 | if (optarg[0] != '\0' && optarg[1] != '\0')
|
|---|
| 803 | FATAL_ERROR (_("the delimiter must be a single character"));
|
|---|
| 804 | delim = optarg[0];
|
|---|
| 805 | delim_specified = true;
|
|---|
| 806 | break;
|
|---|
| 807 |
|
|---|
| 808 | case OUTPUT_DELIMITER_OPTION:
|
|---|
| 809 | output_delimiter_specified = true;
|
|---|
| 810 | /* Interpret --output-delimiter='' to mean
|
|---|
| 811 | `use the NUL byte as the delimiter.' */
|
|---|
| 812 | output_delimiter_length = (optarg[0] == '\0'
|
|---|
| 813 | ? 1 : strlen (optarg));
|
|---|
| 814 | output_delimiter_string = xstrdup (optarg);
|
|---|
| 815 | break;
|
|---|
| 816 |
|
|---|
| 817 | case 'n':
|
|---|
| 818 | break;
|
|---|
| 819 |
|
|---|
| 820 | case 's':
|
|---|
| 821 | suppress_non_delimited = true;
|
|---|
| 822 | break;
|
|---|
| 823 |
|
|---|
| 824 | case COMPLEMENT_OPTION:
|
|---|
| 825 | complement = true;
|
|---|
| 826 | break;
|
|---|
| 827 |
|
|---|
| 828 | case_GETOPT_HELP_CHAR;
|
|---|
| 829 |
|
|---|
| 830 | case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
|
|---|
| 831 |
|
|---|
| 832 | default:
|
|---|
| 833 | usage (EXIT_FAILURE);
|
|---|
| 834 | }
|
|---|
| 835 | }
|
|---|
| 836 |
|
|---|
| 837 | if (operating_mode == undefined_mode)
|
|---|
| 838 | FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
|
|---|
| 839 |
|
|---|
| 840 | if (delim != '\0' && operating_mode != field_mode)
|
|---|
| 841 | FATAL_ERROR (_("an input delimiter may be specified only\
|
|---|
| 842 | when operating on fields"));
|
|---|
| 843 |
|
|---|
| 844 | if (suppress_non_delimited && operating_mode != field_mode)
|
|---|
| 845 | FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\
|
|---|
| 846 | \tonly when operating on fields"));
|
|---|
| 847 |
|
|---|
| 848 | if (output_delimiter_specified)
|
|---|
| 849 | {
|
|---|
| 850 | range_start_ht = hash_initialize (HT_RANGE_START_INDEX_INITIAL_CAPACITY,
|
|---|
| 851 | NULL, hash_int,
|
|---|
| 852 | hash_compare_ints, NULL);
|
|---|
| 853 | if (range_start_ht == NULL)
|
|---|
| 854 | xalloc_die ();
|
|---|
| 855 |
|
|---|
| 856 | }
|
|---|
| 857 |
|
|---|
| 858 | if (! set_fields (spec_list_string))
|
|---|
| 859 | {
|
|---|
| 860 | if (operating_mode == field_mode)
|
|---|
| 861 | FATAL_ERROR (_("missing list of fields"));
|
|---|
| 862 | else
|
|---|
| 863 | FATAL_ERROR (_("missing list of positions"));
|
|---|
| 864 | }
|
|---|
| 865 |
|
|---|
| 866 | if (!delim_specified)
|
|---|
| 867 | delim = '\t';
|
|---|
| 868 |
|
|---|
| 869 | if (output_delimiter_string == NULL)
|
|---|
| 870 | {
|
|---|
| 871 | static char dummy[2];
|
|---|
| 872 | dummy[0] = delim;
|
|---|
| 873 | dummy[1] = '\0';
|
|---|
| 874 | output_delimiter_string = dummy;
|
|---|
| 875 | output_delimiter_length = 1;
|
|---|
| 876 | }
|
|---|
| 877 |
|
|---|
| 878 | if (optind == argc)
|
|---|
| 879 | ok = cut_file ("-");
|
|---|
| 880 | else
|
|---|
| 881 | for (ok = true; optind < argc; optind++)
|
|---|
| 882 | ok &= cut_file (argv[optind]);
|
|---|
| 883 |
|
|---|
| 884 | if (range_start_ht)
|
|---|
| 885 | hash_free (range_start_ht);
|
|---|
| 886 |
|
|---|
| 887 | if (have_read_stdin && fclose (stdin) == EOF)
|
|---|
| 888 | {
|
|---|
| 889 | error (0, errno, "-");
|
|---|
| 890 | ok = false;
|
|---|
| 891 | }
|
|---|
| 892 |
|
|---|
| 893 | exit (ok ? EXIT_SUCCESS : EXIT_FAILURE);
|
|---|
| 894 | }
|
|---|