Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

code.c@ 3845

Visit:

Last change on this file since 3845 was 3170, checked in by bird, 19 years ago
findutils 4.3.2
File size: 6.6 KB

Line
1	/* code -- bigram- and front-encode filenames for locate
2	Copyright (C) 1994 Free Software Foundation, Inc.
3
4	This program is free software; you can redistribute it and/or modify
5	it under the terms of the GNU General Public License as published by
6	the Free Software Foundation; either version 2, or (at your option)
7	any later version.
8
9	This program is distributed in the hope that it will be useful,
10	but WITHOUT ANY WARRANTY; without even the implied warranty of
11	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	GNU General Public License for more details.
13
14	You should have received a copy of the GNU General Public License
15	along with this program; if not, write to the Free Software
16	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
17	USA.
18	*/
19
20	/* Compress a sorted list.
21	Works with `find' to encode a filename database to save space
22	and search time.
23
24	Usage:
25
26	bigram < file_list > bigrams
27	process-bigrams > most_common_bigrams
28	code most_common_bigrams < file_list > squeezed_list
29
30	Uses `front compression' (see ";login:", March 1983, p. 8).
31	The output begins with the 128 most common bigrams.
32	After that, the output format is, for each line,
33	an offset (from the previous line) differential count byte
34	followed by a (partially bigram-encoded) ASCII remainder.
35	The output lines have no terminating byte; the start of the next line
36	is indicated by its first byte having a value <= 30.
37
38	The encoding of the output bytes is:
39
40	0-28 likeliest differential counts + offset (14) to make nonnegative
41	30 escape code for out-of-range count to follow in next halfword
42	128-255 bigram codes (the 128 most common, as determined by `updatedb')
43	32-127 single character (printable) ASCII remainder
44
45	Written by James A. Woods <[email protected]>.
46	Modified by David MacKenzie <[email protected]>. */
47
48	#include <config.h>
49	#include <stdio.h>
50	#include <sys/types.h>
51
52	#if defined(HAVE_STRING_H) \|\| defined(STDC_HEADERS)
53	#include <string.h>
54	#else
55	#include <strings.h>
56	#endif
57
58	#ifdef STDC_HEADERS
59	#include <stdlib.h>
60	#endif
61
62	#if ENABLE_NLS
63	# include <libintl.h>
64	# define _(Text) gettext (Text)
65	#else
66	# define _(Text) Text
67	#define textdomain(Domain)
68	#define bindtextdomain(Package, Directory)
69	#endif
70	#ifdef gettext_noop
71	# define N_(String) gettext_noop (String)
72	#else
73	/* See locate.c for explanation as to why not use (String) */
74	# define N_(String) String
75	#endif
76
77	#include "locatedb.h"
78	#include <getline.h>
79	#include "closeout.h"
80
81	char *xmalloc PARAMS((size_t));
82
83	/* The name this program was run with. */
84	char *program_name;
85
86	/* The 128 most common bigrams in the file list, padded with NULs
87	if there are fewer. */
88	static char bigrams[257] = {0};
89
90	/* Return the offset of PATTERN in STRING, or -1 if not found. */
91
92	static int
93	strindex (char string, char pattern)
94	{
95	register char *s;
96
97	for (s = string; *s != '\0'; s++)
98	/* Fast first char check. */
99	if (s == pattern)
100	{
101	register char p2 = pattern + 1, s2 = s + 1;
102	while (p2 != '\0' && p2 == *s2)
103	p2++, s2++;
104	if (*p2 == '\0')
105	return s2 - strlen (pattern) - string;
106	}
107	return -1;
108	}
109
110	/* Return the length of the longest common prefix of strings S1 and S2. */
111
112	static int
113	prefix_length (char s1, char s2)
114	{
115	register char *start;
116
117	for (start = s1; s1 == s2 && *s1 != '\0'; s1++, s2++)
118	;
119	return s1 - start;
120	}
121
122	extern char *version_string;
123
124	static void
125	usage (FILE *stream)
126	{
127	fprintf (stream, _("\
128	Usage: %s [--version \| --help]\n\
129	or %s most_common_bigrams < file-list > locate-database\n"),
130	program_name, program_name);
131	fputs (_("\nReport bugs to <[email protected]>.\n"), stream);
132	}
133
134
135	int
136	main (int argc, char **argv)
137	{
138	char path; / The current input entry. */
139	char oldpath; / The previous input entry. */
140	size_t pathsize, oldpathsize; /* Amounts allocated for them. */
141	int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
142	char bigram[3]; /* Bigram to search for in table. */
143	int code; /* Index of `bigram' in bigrams table. */
144	FILE fp; / Most common bigrams file. */
145	int line_len; /* Length of input line. */
146
147	program_name = argv[0];
148	atexit (close_stdout);
149
150	bigram[2] = '\0';
151
152	if (argc != 2)
153	{
154	usage(stderr);
155	return 2;
156	}
157
158	if (0 == strcmp(argv[1], "--help"))
159	{
160	usage(stdout);
161	return 0;
162	}
163	else if (0 == strcmp(argv[1], "--version"))
164	{
165	printf (_("GNU findutils version %s\n"), version_string);
166	return 0;
167	}
168
169	fp = fopen (argv[1], "r");
170	if (fp == NULL)
171	{
172	fprintf (stderr, "%s: ", argv[0]);
173	perror (argv[1]);
174	return 1;
175	}
176
177	pathsize = oldpathsize = 1026; /* Increased as necessary by getline. */
178	path = xmalloc (pathsize);
179	oldpath = xmalloc (oldpathsize);
180
181	/* Set to anything not starting with a slash, to force the first
182	prefix count to 0. */
183	strcpy (oldpath, " ");
184	oldcount = 0;
185
186	/* Copy the list of most common bigrams to the output,
187	padding with NULs if there are <128 of them. */
188	fgets (bigrams, 257, fp);
189	fwrite (bigrams, 1, 256, stdout);
190	fclose (fp);
191
192	while ((line_len = getline (&path, &pathsize, stdin)) > 0)
193	{
194	char *pp;
195
196	path[line_len - 1] = '\0'; /* Remove newline. */
197
198	/* Squelch unprintable chars in path so as not to botch decoding. */
199	for (pp = path; *pp != '\0'; pp++)
200	{
201	if (!(pp >= 040 && pp < 0177))
202	*pp = '?';
203	}
204
205	count = prefix_length (oldpath, path);
206	diffcount = count - oldcount;
207	oldcount = count;
208	/* If the difference is small, it fits in one byte;
209	otherwise, two bytes plus a marker noting that fact. */
210	if (diffcount < -LOCATEDB_OLD_OFFSET \|\| diffcount > LOCATEDB_OLD_OFFSET)
211	{
212	putc (LOCATEDB_OLD_ESCAPE, stdout);
213	putw (diffcount + LOCATEDB_OLD_OFFSET, stdout);
214	}
215	else
216	putc (diffcount + LOCATEDB_OLD_OFFSET, stdout);
217
218	/* Look for bigrams in the remainder of the path. */
219	for (pp = path + count; *pp != '\0'; pp += 2)
220	{
221	if (pp[1] == '\0')
222	{
223	/* No bigram is possible; only one char is left. */
224	putchar (*pp);
225	break;
226	}
227	bigram[0] = *pp;
228	bigram[1] = pp[1];
229	/* Linear search for specific bigram in string table. */
230	code = strindex (bigrams, bigram);
231	if (code % 2 == 0)
232	putchar ((code / 2) \| 0200); /* It's a common bigram. */
233	else
234	fputs (bigram, stdout); /* Write the text as printable ASCII. */
235	}
236
237	{
238	/* Swap path and oldpath and their sizes. */
239	char *tmppath = oldpath;
240	size_t tmppathsize = oldpathsize;
241	oldpath = path;
242	oldpathsize = pathsize;
243	path = tmppath;
244	pathsize = tmppathsize;
245	}
246	}
247
248	free (path);
249	free (oldpath);
250
251	return 0;
252	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/sys-apps/findutils/locate/code.c@ 3845

Download in other formats: