| 1 | /*
|
|---|
| 2 | Data structures for encoding transformations.
|
|---|
| 3 |
|
|---|
| 4 | Perl works internally in either a native 'byte' encoding or
|
|---|
| 5 | in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t"
|
|---|
| 6 | representation. When we do we can use utf8_to_uv().
|
|---|
| 7 |
|
|---|
| 8 | Most character encodings are either simple byte mappings or
|
|---|
| 9 | variable length multi-byte encodings. UTF-8 can be viewed as a
|
|---|
| 10 | rather extreme case of the latter.
|
|---|
| 11 |
|
|---|
| 12 | So to solve an important part of perl's encode needs we need to solve the
|
|---|
| 13 | "multi-byte -> multi-byte" case. The simple byte forms are then just degenerate
|
|---|
| 14 | cases. (Where one of the multi-byte encodings will usually be UTF-8.)
|
|---|
| 15 |
|
|---|
| 16 | The other type of encoding is a shift encoding where a prefix sequence
|
|---|
| 17 | determines what subsequent bytes mean. Such encodings have state.
|
|---|
| 18 |
|
|---|
| 19 | We also need to handle the case where a character in one encoding has to be
|
|---|
| 20 | represented as multiple characters in the other. e.g. letter+diacritic.
|
|---|
| 21 |
|
|---|
| 22 | The process can be considered as pseudo perl:
|
|---|
| 23 |
|
|---|
| 24 | my $dst = '';
|
|---|
| 25 | while (length($src))
|
|---|
| 26 | {
|
|---|
| 27 | my $size = src_count($src);
|
|---|
| 28 | my $in_seq = substr($src,0,$size,'');
|
|---|
| 29 | my $out_seq = $s2d_hash{$in_seq};
|
|---|
| 30 | if (defined $out_seq)
|
|---|
| 31 | {
|
|---|
| 32 | $dst .= $out_seq;
|
|---|
| 33 | }
|
|---|
| 34 | else
|
|---|
| 35 | {
|
|---|
| 36 | # an error condition
|
|---|
| 37 | }
|
|---|
| 38 | }
|
|---|
| 39 | return $dst;
|
|---|
| 40 |
|
|---|
| 41 | That has the following components:
|
|---|
| 42 | &src_count - a "rule" for how many bytes make up the next character in the
|
|---|
| 43 | source.
|
|---|
| 44 | %s2d_hash - a mapping from input sequences to output sequences
|
|---|
| 45 |
|
|---|
| 46 | The problem with that scheme is that it does not allow the output
|
|---|
| 47 | character repertoire to affect the characters considered from the
|
|---|
| 48 | input.
|
|---|
| 49 |
|
|---|
| 50 | So we use a "trie" representation which can also be considered
|
|---|
| 51 | a state machine:
|
|---|
| 52 |
|
|---|
| 53 | my $dst = '';
|
|---|
| 54 | my $seq = \@s2d_seq;
|
|---|
| 55 | my $next = \@s2d_next;
|
|---|
| 56 | while (length($src))
|
|---|
| 57 | {
|
|---|
| 58 | my $byte = substr($src,0,1,'');
|
|---|
| 59 | my $out_seq = $seq->[$byte];
|
|---|
| 60 | if (defined $out_seq)
|
|---|
| 61 | {
|
|---|
| 62 | $dst .= $out_seq;
|
|---|
| 63 | }
|
|---|
| 64 | else
|
|---|
| 65 | {
|
|---|
| 66 | # an error condition
|
|---|
| 67 | }
|
|---|
| 68 | ($next,$seq) = @{$next->[$byte]} if $next;
|
|---|
| 69 | }
|
|---|
| 70 | return $dst;
|
|---|
| 71 |
|
|---|
| 72 | There is now a pair of data structures to represent everything.
|
|---|
| 73 | It is valid for the output sequence at a particular point to
|
|---|
| 74 | be defined but zero length, that just means "don't know yet".
|
|---|
| 75 | For the single byte case there is no 'next' so new tables will be the same as
|
|---|
| 76 | the original tables. For a multi-byte case a prefix byte will flip to the tables
|
|---|
| 77 | for the next page (adding nothing to the output), then the tables for the page
|
|---|
| 78 | will provide the actual output and set the tables back to the original base page.
|
|---|
| 79 |
|
|---|
| 80 | This scheme can also handle shift encodings.
|
|---|
| 81 |
|
|---|
| 82 | A slight enhancement to the scheme also allows for look-ahead - if
|
|---|
| 83 | we add a flag to re-add the removed byte to the source we could handle
|
|---|
| 84 | a" -> ä
|
|---|
| 85 | ab -> a (and take b back please)
|
|---|
| 86 |
|
|---|
| 87 | */
|
|---|
| 88 |
|
|---|
| 89 | #include <EXTERN.h>
|
|---|
| 90 | #include <perl.h>
|
|---|
| 91 | #define U8 U8
|
|---|
| 92 | #include "encode.h"
|
|---|
| 93 |
|
|---|
| 94 | int
|
|---|
| 95 | do_encode(encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
|
|---|
| 96 | STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
|
|---|
| 97 | {
|
|---|
| 98 | const U8 *s = src;
|
|---|
| 99 | const U8 *send = s + *slen;
|
|---|
| 100 | const U8 *last = s;
|
|---|
| 101 | U8 *d = dst;
|
|---|
| 102 | U8 *dend = d + dlen, *dlast = d;
|
|---|
| 103 | int code = 0;
|
|---|
| 104 | while (s < send) {
|
|---|
| 105 | encpage_t *e = enc;
|
|---|
| 106 | U8 byte = *s;
|
|---|
| 107 | while (byte > e->max)
|
|---|
| 108 | e++;
|
|---|
| 109 | if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
|
|---|
| 110 | const U8 *cend = s + (e->slen & 0x7f);
|
|---|
| 111 | if (cend <= send) {
|
|---|
| 112 | STRLEN n;
|
|---|
| 113 | if ((n = e->dlen)) {
|
|---|
| 114 | const U8 *out = e->seq + n * (byte - e->min);
|
|---|
| 115 | U8 *oend = d + n;
|
|---|
| 116 | if (dst) {
|
|---|
| 117 | if (oend <= dend) {
|
|---|
| 118 | while (d < oend)
|
|---|
| 119 | *d++ = *out++;
|
|---|
| 120 | }
|
|---|
| 121 | else {
|
|---|
| 122 | /* Out of space */
|
|---|
| 123 | code = ENCODE_NOSPACE;
|
|---|
| 124 | break;
|
|---|
| 125 | }
|
|---|
| 126 | }
|
|---|
| 127 | else
|
|---|
| 128 | d = oend;
|
|---|
| 129 | }
|
|---|
| 130 | enc = e->next;
|
|---|
| 131 | s++;
|
|---|
| 132 | if (s == cend) {
|
|---|
| 133 | if (approx && (e->slen & 0x80))
|
|---|
| 134 | code = ENCODE_FALLBACK;
|
|---|
| 135 | last = s;
|
|---|
| 136 | if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
|
|---|
| 137 | code = ENCODE_FOUND_TERM;
|
|---|
| 138 | break;
|
|---|
| 139 | }
|
|---|
| 140 | dlast = d;
|
|---|
| 141 | }
|
|---|
| 142 | }
|
|---|
| 143 | else {
|
|---|
| 144 | /* partial source character */
|
|---|
| 145 | code = ENCODE_PARTIAL;
|
|---|
| 146 | break;
|
|---|
| 147 | }
|
|---|
| 148 | }
|
|---|
| 149 | else {
|
|---|
| 150 | /* Cannot represent */
|
|---|
| 151 | code = ENCODE_NOREP;
|
|---|
| 152 | break;
|
|---|
| 153 | }
|
|---|
| 154 | }
|
|---|
| 155 | *slen = last - src;
|
|---|
| 156 | *dout = d - dst;
|
|---|
| 157 | return code;
|
|---|
| 158 | }
|
|---|