| 1 | // RuleBasedCollator.java - Concrete class for locale-based string compare.
|
|---|
| 2 |
|
|---|
| 3 | /* Copyright (C) 1999, 2000, 2001 Free Software Foundation
|
|---|
| 4 |
|
|---|
| 5 | This file is part of libgcj.
|
|---|
| 6 |
|
|---|
| 7 | This software is copyrighted work licensed under the terms of the
|
|---|
| 8 | Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
|
|---|
| 9 | details. */
|
|---|
| 10 |
|
|---|
| 11 | package java.text;
|
|---|
| 12 |
|
|---|
| 13 | import java.util.Enumeration;
|
|---|
| 14 | import java.util.Hashtable;
|
|---|
| 15 | import java.util.Vector;
|
|---|
| 16 |
|
|---|
| 17 | /**
|
|---|
| 18 | * @author Tom Tromey <[email protected]>
|
|---|
| 19 | * @date March 25, 1999
|
|---|
| 20 | */
|
|---|
| 21 | /* Written using "Java Class Libraries", 2nd edition, plus online
|
|---|
| 22 | * API docs for JDK 1.2 from http://www.javasoft.com.
|
|---|
| 23 | * Status: Believed complete and correct
|
|---|
| 24 | */
|
|---|
| 25 |
|
|---|
| 26 | final class RBCElement
|
|---|
| 27 | {
|
|---|
| 28 | String key;
|
|---|
| 29 | char relation;
|
|---|
| 30 |
|
|---|
| 31 | RBCElement (String key, char relation)
|
|---|
| 32 | {
|
|---|
| 33 | this.key = key;
|
|---|
| 34 | this.relation = relation;
|
|---|
| 35 | }
|
|---|
| 36 | }
|
|---|
| 37 |
|
|---|
| 38 | public class RuleBasedCollator extends Collator
|
|---|
| 39 | {
|
|---|
| 40 | public Object clone ()
|
|---|
| 41 | {
|
|---|
| 42 | RuleBasedCollator c = (RuleBasedCollator) super.clone ();
|
|---|
| 43 | c.map = (Hashtable) map.clone ();
|
|---|
| 44 | c.prefixes = (Hashtable) map.clone ();
|
|---|
| 45 | return c;
|
|---|
| 46 | }
|
|---|
| 47 |
|
|---|
| 48 | // A helper for CollationElementIterator.next().
|
|---|
| 49 | int ceiNext (CollationElementIterator cei)
|
|---|
| 50 | {
|
|---|
| 51 | if (cei.lookahead_set)
|
|---|
| 52 | {
|
|---|
| 53 | cei.lookahead_set = false;
|
|---|
| 54 | return cei.lookahead;
|
|---|
| 55 | }
|
|---|
| 56 |
|
|---|
| 57 | int save = cei.index;
|
|---|
| 58 | int max = cei.text.length();
|
|---|
| 59 | String s = null;
|
|---|
| 60 |
|
|---|
| 61 | // It is possible to have a case where `abc' has a mapping, but
|
|---|
| 62 | // neither `ab' nor `abd' do. In this case we must treat `abd' as
|
|---|
| 63 | // nothing special.
|
|---|
| 64 | boolean found = false;
|
|---|
| 65 |
|
|---|
| 66 | int i;
|
|---|
| 67 | for (i = save + 1; i <= max; ++i)
|
|---|
| 68 | {
|
|---|
| 69 | s = cei.text.substring(save, i);
|
|---|
| 70 | if (prefixes.get(s) == null)
|
|---|
| 71 | break;
|
|---|
| 72 | found = true;
|
|---|
| 73 | }
|
|---|
| 74 | // Assume s != null.
|
|---|
| 75 |
|
|---|
| 76 | Object obj = map.get(s);
|
|---|
| 77 | // The special case.
|
|---|
| 78 | while (found && obj == null && s.length() > 1)
|
|---|
| 79 | {
|
|---|
| 80 | --i;
|
|---|
| 81 | s = cei.text.substring(save, i);
|
|---|
| 82 | obj = map.get(s);
|
|---|
| 83 | }
|
|---|
| 84 |
|
|---|
| 85 | // Update state.
|
|---|
| 86 | cei.index = i;
|
|---|
| 87 |
|
|---|
| 88 | if (obj == null)
|
|---|
| 89 | {
|
|---|
| 90 | // This idea, and the values, come from JDK.
|
|---|
| 91 | // assert (s.length() == 1)
|
|---|
| 92 | cei.lookahead_set = true;
|
|---|
| 93 | cei.lookahead = s.charAt(0) << 8;
|
|---|
| 94 | return 0x7fff << 16;
|
|---|
| 95 | }
|
|---|
| 96 |
|
|---|
| 97 | return ((Integer) obj).intValue();
|
|---|
| 98 | }
|
|---|
| 99 |
|
|---|
| 100 | // A helper for compareTo() that returns the next character that has
|
|---|
| 101 | // a nonzero ordering at the indicated strength. This is also used
|
|---|
| 102 | // in CollationKey.
|
|---|
| 103 | static final int next (CollationElementIterator iter, int strength)
|
|---|
| 104 | {
|
|---|
| 105 | while (true)
|
|---|
| 106 | {
|
|---|
| 107 | int os = iter.next();
|
|---|
| 108 | if (os == CollationElementIterator.NULLORDER)
|
|---|
| 109 | return os;
|
|---|
| 110 | int c = 0;
|
|---|
| 111 | switch (strength)
|
|---|
| 112 | {
|
|---|
| 113 | case PRIMARY:
|
|---|
| 114 | c = os & ~0xffff;
|
|---|
| 115 | break;
|
|---|
| 116 | case SECONDARY:
|
|---|
| 117 | c = os & ~0x00ff;
|
|---|
| 118 | break;
|
|---|
| 119 | case TERTIARY:
|
|---|
| 120 | case IDENTICAL:
|
|---|
| 121 | c = os;
|
|---|
| 122 | break;
|
|---|
| 123 | }
|
|---|
| 124 | if (c != 0)
|
|---|
| 125 | return c;
|
|---|
| 126 | }
|
|---|
| 127 | }
|
|---|
| 128 |
|
|---|
| 129 | public int compare (String source, String target)
|
|---|
| 130 | {
|
|---|
| 131 | CollationElementIterator cs, ct;
|
|---|
| 132 |
|
|---|
| 133 | cs = new CollationElementIterator (source, this);
|
|---|
| 134 | ct = new CollationElementIterator (target, this);
|
|---|
| 135 |
|
|---|
| 136 | while (true)
|
|---|
| 137 | {
|
|---|
| 138 | int os = next (cs, strength);
|
|---|
| 139 | int ot = next (ct, strength);
|
|---|
| 140 |
|
|---|
| 141 | if (os == CollationElementIterator.NULLORDER
|
|---|
| 142 | && ot == CollationElementIterator.NULLORDER)
|
|---|
| 143 | break;
|
|---|
| 144 | else if (os == CollationElementIterator.NULLORDER)
|
|---|
| 145 | {
|
|---|
| 146 | // Source string is shorter, so return "less than".
|
|---|
| 147 | return -1;
|
|---|
| 148 | }
|
|---|
| 149 | else if (ot == CollationElementIterator.NULLORDER)
|
|---|
| 150 | {
|
|---|
| 151 | // Target string is shorter, so return "greater than".
|
|---|
| 152 | return 1;
|
|---|
| 153 | }
|
|---|
| 154 |
|
|---|
| 155 | if (os != ot)
|
|---|
| 156 | return os - ot;
|
|---|
| 157 | }
|
|---|
| 158 |
|
|---|
| 159 | return 0;
|
|---|
| 160 | }
|
|---|
| 161 |
|
|---|
| 162 | public boolean equals (Object obj)
|
|---|
| 163 | {
|
|---|
| 164 | if (! (obj instanceof RuleBasedCollator) || ! super.equals(obj))
|
|---|
| 165 | return false;
|
|---|
| 166 | RuleBasedCollator rbc = (RuleBasedCollator) obj;
|
|---|
| 167 | // FIXME: this is probably wrong. Instead we should compare maps
|
|---|
| 168 | // directly.
|
|---|
| 169 | return (frenchAccents == rbc.frenchAccents
|
|---|
| 170 | && rules.equals(rbc.rules));
|
|---|
| 171 | }
|
|---|
| 172 |
|
|---|
| 173 | public CollationElementIterator getCollationElementIterator (String source)
|
|---|
| 174 | {
|
|---|
| 175 | StringBuffer expand = new StringBuffer (source.length());
|
|---|
| 176 | int max = source.length();
|
|---|
| 177 | for (int i = 0; i < max; ++i)
|
|---|
| 178 | decomposeCharacter (source.charAt(i), expand);
|
|---|
| 179 | return new CollationElementIterator (expand.toString(), this);
|
|---|
| 180 | }
|
|---|
| 181 |
|
|---|
| 182 | public CollationElementIterator getCollationElementIterator (CharacterIterator source)
|
|---|
| 183 | {
|
|---|
| 184 | StringBuffer expand = new StringBuffer ();
|
|---|
| 185 | for (char c = source.first ();
|
|---|
| 186 | c != CharacterIterator.DONE;
|
|---|
| 187 | c = source.next ())
|
|---|
| 188 | decomposeCharacter (c, expand);
|
|---|
| 189 |
|
|---|
| 190 | return new CollationElementIterator (expand.toString(), this);
|
|---|
| 191 | }
|
|---|
| 192 |
|
|---|
| 193 | public CollationKey getCollationKey (String source)
|
|---|
| 194 | {
|
|---|
| 195 | return new CollationKey (getCollationElementIterator (source), source,
|
|---|
| 196 | strength);
|
|---|
| 197 | }
|
|---|
| 198 |
|
|---|
| 199 | public String getRules ()
|
|---|
| 200 | {
|
|---|
| 201 | return rules;
|
|---|
| 202 | }
|
|---|
| 203 |
|
|---|
| 204 | public int hashCode ()
|
|---|
| 205 | {
|
|---|
| 206 | return (frenchAccents ? 1231 : 1237
|
|---|
| 207 | ^ rules.hashCode()
|
|---|
| 208 | ^ map.hashCode()
|
|---|
| 209 | ^ prefixes.hashCode());
|
|---|
| 210 | }
|
|---|
| 211 |
|
|---|
| 212 | private final boolean is_special (char c)
|
|---|
| 213 | {
|
|---|
| 214 | // Rules from JCL book.
|
|---|
| 215 | return ((c >= 0x0009 && c <= 0x000d)
|
|---|
| 216 | || (c >= 0x0020 && c <= 0x002f)
|
|---|
| 217 | || (c >= 0x003a && c <= 0x0040)
|
|---|
| 218 | || (c >= 0x005b && c <= 0x0060)
|
|---|
| 219 | || (c >= 0x007b && c <= 0x007e));
|
|---|
| 220 | }
|
|---|
| 221 |
|
|---|
| 222 | private final int text_argument (String rules, int index,
|
|---|
| 223 | StringBuffer result)
|
|---|
| 224 | {
|
|---|
| 225 | result.setLength(0);
|
|---|
| 226 | int len = rules.length();
|
|---|
| 227 | while (index < len)
|
|---|
| 228 | {
|
|---|
| 229 | char c = rules.charAt(index);
|
|---|
| 230 | if (c == '\'' && index + 2 < len
|
|---|
| 231 | && rules.charAt(index + 2) == '\''
|
|---|
| 232 | && is_special (rules.charAt(index + 1)))
|
|---|
| 233 | index += 2;
|
|---|
| 234 | else if (is_special (c) || Character.isWhitespace(c))
|
|---|
| 235 | return index;
|
|---|
| 236 | result.append(c);
|
|---|
| 237 | ++index;
|
|---|
| 238 | }
|
|---|
| 239 | return index;
|
|---|
| 240 | }
|
|---|
| 241 |
|
|---|
| 242 | public RuleBasedCollator (String rules) throws ParseException
|
|---|
| 243 | {
|
|---|
| 244 | this.rules = rules;
|
|---|
| 245 | this.frenchAccents = false;
|
|---|
| 246 |
|
|---|
| 247 | // We keep each rule in order in a vector. At the end we traverse
|
|---|
| 248 | // the vector and compute collation values from it.
|
|---|
| 249 | int insertion_index = 0;
|
|---|
| 250 | Vector vec = new Vector ();
|
|---|
| 251 |
|
|---|
| 252 | StringBuffer argument = new StringBuffer ();
|
|---|
| 253 |
|
|---|
| 254 | int len = rules.length();
|
|---|
| 255 | for (int index = 0; index < len; ++index)
|
|---|
| 256 | {
|
|---|
| 257 | char c = rules.charAt(index);
|
|---|
| 258 |
|
|---|
| 259 | // Just skip whitespace.
|
|---|
| 260 | if (Character.isWhitespace(c))
|
|---|
| 261 | continue;
|
|---|
| 262 |
|
|---|
| 263 | // Modifier.
|
|---|
| 264 | if (c == '@')
|
|---|
| 265 | {
|
|---|
| 266 | frenchAccents = true;
|
|---|
| 267 | continue;
|
|---|
| 268 | }
|
|---|
| 269 |
|
|---|
| 270 | // Check for relation or reset operator.
|
|---|
| 271 | if (! (c == '<' || c == ';' || c == ',' || c == '=' || c == '&'))
|
|---|
| 272 | throw new ParseException ("invalid character", index);
|
|---|
| 273 |
|
|---|
| 274 | ++index;
|
|---|
| 275 | while (index < len)
|
|---|
| 276 | {
|
|---|
| 277 | if (! Character.isWhitespace(rules.charAt(index)))
|
|---|
| 278 | break;
|
|---|
| 279 | ++index;
|
|---|
| 280 | }
|
|---|
| 281 | if (index == len)
|
|---|
| 282 | throw new ParseException ("missing argument", index);
|
|---|
| 283 |
|
|---|
| 284 | int save = index;
|
|---|
| 285 | index = text_argument (rules, index, argument);
|
|---|
| 286 | if (argument.length() == 0)
|
|---|
| 287 | throw new ParseException ("invalid character", save);
|
|---|
| 288 | String arg = argument.toString();
|
|---|
| 289 | int item_index = vec.indexOf(arg);
|
|---|
| 290 | if (c != '&')
|
|---|
| 291 | {
|
|---|
| 292 | // If the argument already appears in the vector, then we
|
|---|
| 293 | // must remove it in order to re-order.
|
|---|
| 294 | if (item_index != -1)
|
|---|
| 295 | {
|
|---|
| 296 | vec.removeElementAt(item_index);
|
|---|
| 297 | if (insertion_index >= item_index)
|
|---|
| 298 | --insertion_index;
|
|---|
| 299 | }
|
|---|
| 300 | RBCElement r = new RBCElement (arg, c);
|
|---|
| 301 | vec.insertElementAt(r, insertion_index);
|
|---|
| 302 | ++insertion_index;
|
|---|
| 303 | }
|
|---|
| 304 | else
|
|---|
| 305 | {
|
|---|
| 306 | // Reset.
|
|---|
| 307 | if (item_index == -1)
|
|---|
| 308 | throw
|
|---|
| 309 | new ParseException ("argument to reset not previously seen",
|
|---|
| 310 | save);
|
|---|
| 311 | insertion_index = item_index + 1;
|
|---|
| 312 | }
|
|---|
| 313 |
|
|---|
| 314 | // Ugly: in this case the resulting INDEX comes from
|
|---|
| 315 | // text_argument, which returns the index of the next
|
|---|
| 316 | // character we should examine.
|
|---|
| 317 | --index;
|
|---|
| 318 | }
|
|---|
| 319 |
|
|---|
| 320 | // Now construct a hash table that maps strings onto their
|
|---|
| 321 | // collation values.
|
|---|
| 322 | int primary = 0;
|
|---|
| 323 | int secondary = 0;
|
|---|
| 324 | int tertiary = 0;
|
|---|
| 325 | this.map = new Hashtable ();
|
|---|
| 326 | this.prefixes = new Hashtable ();
|
|---|
| 327 | Enumeration e = vec.elements();
|
|---|
| 328 | while (e.hasMoreElements())
|
|---|
| 329 | {
|
|---|
| 330 | RBCElement r = (RBCElement) e.nextElement();
|
|---|
| 331 | switch (r.relation)
|
|---|
| 332 | {
|
|---|
| 333 | case '<':
|
|---|
| 334 | ++primary;
|
|---|
| 335 | secondary = 0;
|
|---|
| 336 | tertiary = 0;
|
|---|
| 337 | break;
|
|---|
| 338 | case ';':
|
|---|
| 339 | ++secondary;
|
|---|
| 340 | tertiary = 0;
|
|---|
| 341 | break;
|
|---|
| 342 | case ',':
|
|---|
| 343 | ++tertiary;
|
|---|
| 344 | break;
|
|---|
| 345 | case '=':
|
|---|
| 346 | break;
|
|---|
| 347 | }
|
|---|
| 348 | // This must match CollationElementIterator.
|
|---|
| 349 | map.put(r.key, new Integer (primary << 16
|
|---|
| 350 | | secondary << 8 | tertiary));
|
|---|
| 351 |
|
|---|
| 352 | // Make a map of all lookaheads we might need.
|
|---|
| 353 | for (int i = r.key.length() - 1; i >= 1; --i)
|
|---|
| 354 | prefixes.put(r.key.substring(0, i), Boolean.TRUE);
|
|---|
| 355 | }
|
|---|
| 356 | }
|
|---|
| 357 |
|
|---|
| 358 | // True if we are using French-style accent ordering.
|
|---|
| 359 | private boolean frenchAccents;
|
|---|
| 360 |
|
|---|
| 361 | // It's easier to just save the rules than to try to recreate them.
|
|---|
| 362 | private String rules;
|
|---|
| 363 |
|
|---|
| 364 | // This maps strings onto collation values.
|
|---|
| 365 | private Hashtable map;
|
|---|
| 366 | // An entry in this hash means that more lookahead is required for
|
|---|
| 367 | // the prefix string.
|
|---|
| 368 | private Hashtable prefixes;
|
|---|
| 369 | }
|
|---|