source: trunk/coreutils/src/uniq.c@ 2603

Last change on this file since 2603 was 2554, checked in by bird, 20 years ago

coretuils-5.94

File size: 13.7 KB
Line 
1/* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 86, 91, 1995-2005 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
17
18/* Written by Richard Stallman and David MacKenzie. */
19
20
21#include <config.h>
22
23#include <stdio.h>
24#include <getopt.h>
25#include <sys/types.h>
26
27#include "system.h"
28#include "argmatch.h"
29#include "linebuffer.h"
30#include "error.h"
31#include "hard-locale.h"
32#include "posixver.h"
33#include "quote.h"
34#include "xmemcoll.h"
35#include "xstrtol.h"
36#include "memcasecmp.h"
37
38/* The official name of this program (e.g., no `g' prefix). */
39#define PROGRAM_NAME "uniq"
40
41#define AUTHORS "Richard Stallman", "David MacKenzie"
42
/* Exchange the two `struct linebuffer *' lvalues A and B.
   Each argument is expanded twice, so neither may have side effects.  */
#define SWAP_LINES(A, B) \
  do \
    { \
      struct linebuffer *swap_tmp_ = (A); \
      (A) = (B); \
      (B) = swap_tmp_; \
    } \
  while (0)
52
53/* The name this program was run with. */
54char *program_name;
55
56/* True if the LC_COLLATE locale is hard. */
57static bool hard_LC_COLLATE;
58
59/* Number of fields to skip on each line when doing comparisons. */
60static size_t skip_fields;
61
62/* Number of chars to skip after skipping any fields. */
63static size_t skip_chars;
64
65/* Number of chars to compare. */
66static size_t check_chars;
67
68enum countmode
69{
70 count_occurrences, /* -c Print count before output lines. */
71 count_none /* Default. Do not print counts. */
72};
73
74/* Whether and how to precede the output lines with a count of the number of
75 times they occurred in the input. */
76static enum countmode countmode;
77
78/* Which lines to output: unique lines, the first of a group of
79 repeated lines, and the second and subsequent of a group of
80 repeated lines. */
81static bool output_unique;
82static bool output_first_repeated;
83static bool output_later_repeated;
84
85/* If true, ignore case when comparing. */
86static bool ignore_case;
87
/* Ways of setting groups of duplicate lines apart from one another
   when --all-repeated is in effect.  */
enum delimit_method
{
  DM_NONE = 0,  /* --all-repeated[=none]: no delimiters output.  */
  DM_PREPEND,   /* --all-repeated=prepend: delimiter precedes all groups.  */
  DM_SEPARATE   /* --all-repeated=separate: delimit all groups.  */
};

/* Argument spellings accepted by --all-repeated, NULL-terminated.
   Entry I here corresponds to entry I of delimit_method_map.  */
static const char *const delimit_method_string[] =
{
  "none",
  "prepend",
  "separate",
  NULL
};

/* The delimit_method value for each spelling above, in the same order.  */
static const enum delimit_method delimit_method_map[] =
{
  DM_NONE,
  DM_PREPEND,
  DM_SEPARATE
};

/* Select whether/how to delimit groups of duplicate lines.  */
static enum delimit_method delimit_groups;
112
113static struct option const longopts[] =
114{
115 {"count", no_argument, NULL, 'c'},
116 {"repeated", no_argument, NULL, 'd'},
117 {"all-repeated", optional_argument, NULL, 'D'},
118 {"ignore-case", no_argument, NULL, 'i'},
119 {"unique", no_argument, NULL, 'u'},
120 {"skip-fields", required_argument, NULL, 'f'},
121 {"skip-chars", required_argument, NULL, 's'},
122 {"check-chars", required_argument, NULL, 'w'},
123 {GETOPT_HELP_OPTION_DECL},
124 {GETOPT_VERSION_OPTION_DECL},
125 {NULL, 0, NULL, 0}
126};
127
128void
129usage (int status)
130{
131 if (status != EXIT_SUCCESS)
132 fprintf (stderr, _("Try `%s --help' for more information.\n"),
133 program_name);
134 else
135 {
136 printf (_("\
137Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
138"),
139 program_name);
140 fputs (_("\
141Discard all but one of successive identical lines from INPUT (or\n\
142standard input), writing to OUTPUT (or standard output).\n\
143\n\
144"), stdout);
145 fputs (_("\
146Mandatory arguments to long options are mandatory for short options too.\n\
147"), stdout);
148 fputs (_("\
149 -c, --count prefix lines by the number of occurrences\n\
150 -d, --repeated only print duplicate lines\n\
151"), stdout);
152 fputs (_("\
153 -D, --all-repeated[=delimit-method] print all duplicate lines\n\
154 delimit-method={none(default),prepend,separate}\n\
155 Delimiting is done with blank lines.\n\
156 -f, --skip-fields=N avoid comparing the first N fields\n\
157 -i, --ignore-case ignore differences in case when comparing\n\
158 -s, --skip-chars=N avoid comparing the first N characters\n\
159 -u, --unique only print unique lines\n\
160"), stdout);
161 fputs (_("\
162 -w, --check-chars=N compare no more than N characters in lines\n\
163"), stdout);
164 fputs (HELP_OPTION_DESCRIPTION, stdout);
165 fputs (VERSION_OPTION_DESCRIPTION, stdout);
166 fputs (_("\
167\n\
168A field is a run of whitespace, then non-whitespace characters.\n\
169Fields are skipped before chars.\n\
170"), stdout);
171 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
172 }
173 exit (status);
174}
175
176/* Convert OPT to size_t, reporting an error using MSGID if it does
177 not fit. */
178
179static size_t
180size_opt (char const *opt, char const *msgid)
181{
182 unsigned long int size;
183 if (xstrtoul (opt, NULL, 10, &size, "") != LONGINT_OK
184 || SIZE_MAX < size)
185 error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
186 return size;
187}
188
189/* Given a linebuffer LINE,
190 return a pointer to the beginning of the line's field to be compared. */
191
192static char *
193find_field (const struct linebuffer *line)
194{
195 size_t count;
196 char *lp = line->buffer;
197 size_t size = line->length - 1;
198 size_t i = 0;
199
200 for (count = 0; count < skip_fields && i < size; count++)
201 {
202 while (i < size && ISBLANK (lp[i]))
203 i++;
204 while (i < size && !ISBLANK (lp[i]))
205 i++;
206 }
207
208 for (count = 0; count < skip_chars && i < size; count++)
209 i++;
210
211 return lp + i;
212}
213
214/* Return false if two strings OLD and NEW match, true if not.
215 OLD and NEW point not to the beginnings of the lines
216 but rather to the beginnings of the fields to compare.
217 OLDLEN and NEWLEN are their lengths. */
218
219static bool
220different (char *old, char *new, size_t oldlen, size_t newlen)
221{
222 if (check_chars < oldlen)
223 oldlen = check_chars;
224 if (check_chars < newlen)
225 newlen = check_chars;
226
227 if (ignore_case)
228 {
229 /* FIXME: This should invoke strcoll somehow. */
230 return oldlen != newlen || memcasecmp (old, new, oldlen);
231 }
232 else if (hard_LC_COLLATE)
233 return xmemcoll (old, oldlen, new, newlen) != 0;
234 else
235 return oldlen != newlen || memcmp (old, new, oldlen);
236}
237
238/* Output the line in linebuffer LINE to standard output
239 provided that the switches say it should be output.
240 MATCH is true if the line matches the previous line.
241 If requested, print the number of times it occurred, as well;
242 LINECOUNT + 1 is the number of times that the line occurred. */
243
244static void
245writeline (struct linebuffer const *line,
246 bool match, uintmax_t linecount)
247{
248 if (! (linecount == 0 ? output_unique
249 : !match ? output_first_repeated
250 : output_later_repeated))
251 return;
252
253 if (countmode == count_occurrences)
254 printf ("%7" PRIuMAX " ", linecount + 1);
255
256 fwrite (line->buffer, sizeof (char), line->length, stdout);
257}
258
259/* Process input file INFILE with output to OUTFILE.
260 If either is "-", use the standard I/O stream for it instead. */
261
262static void
263check_file (const char *infile, const char *outfile)
264{
265 struct linebuffer lb1, lb2;
266 struct linebuffer *thisline, *prevline;
267
268 if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
269 error (EXIT_FAILURE, errno, "%s", infile);
270 if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
271 error (EXIT_FAILURE, errno, "%s", outfile);
272
273 thisline = &lb1;
274 prevline = &lb2;
275
276 initbuffer (thisline);
277 initbuffer (prevline);
278
279 /* The duplication in the following `if' and `else' blocks is an
280 optimization to distinguish the common case (in which none of
281 the following options has been specified: --count, -repeated,
282 --all-repeated, --unique) from the others. In the common case,
283 this optimization lets uniq output each different line right away,
284 without waiting to see if the next one is different. */
285
286 if (output_unique && output_first_repeated && countmode == count_none)
287 {
288 char *prevfield IF_LINT (= NULL);
289 size_t prevlen IF_LINT (= 0);
290
291 while (!feof (stdin))
292 {
293 char *thisfield;
294 size_t thislen;
295 if (readlinebuffer (thisline, stdin) == 0)
296 break;
297 thisfield = find_field (thisline);
298 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
299 if (prevline->length == 0
300 || different (thisfield, prevfield, thislen, prevlen))
301 {
302 fwrite (thisline->buffer, sizeof (char),
303 thisline->length, stdout);
304
305 SWAP_LINES (prevline, thisline);
306 prevfield = thisfield;
307 prevlen = thislen;
308 }
309 }
310 }
311 else
312 {
313 char *prevfield;
314 size_t prevlen;
315 uintmax_t match_count = 0;
316 bool first_delimiter = true;
317
318 if (readlinebuffer (prevline, stdin) == 0)
319 goto closefiles;
320 prevfield = find_field (prevline);
321 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
322
323 while (!feof (stdin))
324 {
325 bool match;
326 char *thisfield;
327 size_t thislen;
328 if (readlinebuffer (thisline, stdin) == 0)
329 {
330 if (ferror (stdin))
331 goto closefiles;
332 break;
333 }
334 thisfield = find_field (thisline);
335 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
336 match = !different (thisfield, prevfield, thislen, prevlen);
337 match_count += match;
338
339 if (match_count == UINTMAX_MAX)
340 {
341 if (count_occurrences)
342 error (EXIT_FAILURE, 0, _("too many repeated lines"));
343 match_count--;
344 }
345
346 if (delimit_groups != DM_NONE)
347 {
348 if (!match)
349 {
350 if (match_count) /* a previous match */
351 first_delimiter = false; /* Only used when DM_SEPARATE */
352 }
353 else if (match_count == 1)
354 {
355 if ((delimit_groups == DM_PREPEND)
356 || (delimit_groups == DM_SEPARATE
357 && !first_delimiter))
358 putchar ('\n');
359 }
360 }
361
362 if (!match || output_later_repeated)
363 {
364 writeline (prevline, match, match_count);
365 SWAP_LINES (prevline, thisline);
366 prevfield = thisfield;
367 prevlen = thislen;
368 if (!match)
369 match_count = 0;
370 }
371 }
372
373 writeline (prevline, false, match_count);
374 }
375
376 closefiles:
377 if (ferror (stdin) || fclose (stdin) != 0)
378 error (EXIT_FAILURE, 0, _("error reading %s"), infile);
379
380 /* stdout is handled via the atexit-invoked close_stdout function. */
381
382 free (lb1.buffer);
383 free (lb2.buffer);
384}
385
386enum Skip_field_option_type
387 {
388 SFO_NONE,
389 SFO_OBSOLETE,
390 SFO_NEW
391 };
392
393int
394main (int argc, char **argv)
395{
396 int optc = 0;
397 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
398 enum Skip_field_option_type skip_field_option_type = SFO_NONE;
399 int nfiles = 0;
400 char const *file[2];
401
402 file[0] = file[1] = "-";
403 initialize_main (&argc, &argv);
404 program_name = argv[0];
405 setlocale (LC_ALL, "");
406 bindtextdomain (PACKAGE, LOCALEDIR);
407 textdomain (PACKAGE);
408 hard_LC_COLLATE = hard_locale (LC_COLLATE);
409
410 atexit (close_stdout);
411
412 skip_chars = 0;
413 skip_fields = 0;
414 check_chars = SIZE_MAX;
415 output_unique = output_first_repeated = true;
416 output_later_repeated = false;
417 countmode = count_none;
418 delimit_groups = DM_NONE;
419
420 for (;;)
421 {
422 /* Parse an operand with leading "+" as a file after "--" was
423 seen; or if pedantic and a file was seen; or if not
424 obsolete. */
425
426 if (optc == -1
427 || (posixly_correct && nfiles != 0)
428 || ((optc = getopt_long (argc, argv,
429 "-0123456789Dcdf:is:uw:", longopts, NULL))
430 == -1))
431 {
432 if (argc <= optind)
433 break;
434 if (nfiles == 2)
435 {
436 error (0, 0, _("extra operand %s"), quote (argv[optind]));
437 usage (EXIT_FAILURE);
438 }
439 file[nfiles++] = argv[optind++];
440 }
441 else switch (optc)
442 {
443 case 1:
444 {
445 unsigned long int size;
446 if (optarg[0] == '+'
447 && posix2_version () < 200112
448 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
449 && size <= SIZE_MAX)
450 skip_chars = size;
451 else if (nfiles == 2)
452 {
453 error (0, 0, _("extra operand %s"), quote (optarg));
454 usage (EXIT_FAILURE);
455 }
456 else
457 file[nfiles++] = optarg;
458 }
459 break;
460
461 case '0':
462 case '1':
463 case '2':
464 case '3':
465 case '4':
466 case '5':
467 case '6':
468 case '7':
469 case '8':
470 case '9':
471 {
472 if (skip_field_option_type == SFO_NEW)
473 skip_fields = 0;
474
475 if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
476 error (EXIT_FAILURE, 0, "%s",
477 _("invalid number of fields to skip"));
478 skip_field_option_type = SFO_OBSOLETE;
479 }
480 break;
481
482 case 'c':
483 countmode = count_occurrences;
484 break;
485
486 case 'd':
487 output_unique = false;
488 break;
489
490 case 'D':
491 output_unique = false;
492 output_later_repeated = true;
493 if (optarg == NULL)
494 delimit_groups = DM_NONE;
495 else
496 delimit_groups = XARGMATCH ("--all-repeated", optarg,
497 delimit_method_string,
498 delimit_method_map);
499 break;
500
501 case 'f':
502 skip_field_option_type = SFO_NEW;
503 skip_fields = size_opt (optarg,
504 N_("invalid number of fields to skip"));
505 break;
506
507 case 'i':
508 ignore_case = true;
509 break;
510
511 case 's':
512 skip_chars = size_opt (optarg,
513 N_("invalid number of bytes to skip"));
514 break;
515
516 case 'u':
517 output_first_repeated = false;
518 break;
519
520 case 'w':
521 check_chars = size_opt (optarg,
522 N_("invalid number of bytes to compare"));
523 break;
524
525 case_GETOPT_HELP_CHAR;
526
527 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
528
529 default:
530 usage (EXIT_FAILURE);
531 }
532 }
533
534 if (countmode == count_occurrences && output_later_repeated)
535 {
536 error (0, 0,
537 _("printing all duplicated lines and repeat counts is meaningless"));
538 usage (EXIT_FAILURE);
539 }
540
541 check_file (file[0], file[1]);
542
543 exit (EXIT_SUCCESS);
544}
Note: See TracBrowser for help on using the repository browser.