| 1 | /*
|
|---|
| 2 | $Id: Unicode.xs,v 2.1 2004/10/24 13:00:29 dankogai Exp $
|
|---|
| 3 | */
|
|---|
| 4 |
|
|---|
| 5 | #define PERL_NO_GET_CONTEXT
|
|---|
| 6 | #include "EXTERN.h"
|
|---|
| 7 | #include "perl.h"
|
|---|
| 8 | #include "XSUB.h"
|
|---|
| 9 | #define U8 U8
|
|---|
| 10 | #include "../Encode/encode.h"
|
|---|
| 11 |
|
|---|
| 12 | #define FBCHAR 0xFFFd
|
|---|
| 13 | #define BOM_BE 0xFeFF
|
|---|
| 14 | #define BOM16LE 0xFFFe
|
|---|
| 15 | #define BOM32LE 0xFFFe0000
|
|---|
| 16 | #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
|
|---|
| 17 | #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
|
|---|
| 18 | #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
|
|---|
| 19 | #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
|
|---|
| 20 |
|
|---|
| 21 | static UV
|
|---|
| 22 | enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
|
|---|
| 23 | {
|
|---|
| 24 | U8 *s = *sp;
|
|---|
| 25 | UV v = 0;
|
|---|
| 26 | if (s+size > e) {
|
|---|
| 27 | croak("Partial character %c",(char) endian);
|
|---|
| 28 | }
|
|---|
| 29 | switch(endian) {
|
|---|
| 30 | case 'N':
|
|---|
| 31 | v = *s++;
|
|---|
| 32 | v = (v << 8) | *s++;
|
|---|
| 33 | case 'n':
|
|---|
| 34 | v = (v << 8) | *s++;
|
|---|
| 35 | v = (v << 8) | *s++;
|
|---|
| 36 | break;
|
|---|
| 37 | case 'V':
|
|---|
| 38 | case 'v':
|
|---|
| 39 | v |= *s++;
|
|---|
| 40 | v |= (*s++ << 8);
|
|---|
| 41 | if (endian == 'v')
|
|---|
| 42 | break;
|
|---|
| 43 | v |= (*s++ << 16);
|
|---|
| 44 | v |= (*s++ << 24);
|
|---|
| 45 | break;
|
|---|
| 46 | default:
|
|---|
| 47 | croak("Unknown endian %c",(char) endian);
|
|---|
| 48 | break;
|
|---|
| 49 | }
|
|---|
| 50 | *sp = s;
|
|---|
| 51 | return v;
|
|---|
| 52 | }
|
|---|
| 53 |
|
|---|
| 54 | void
|
|---|
| 55 | enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
|
|---|
| 56 | {
|
|---|
| 57 | U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size+1);
|
|---|
| 58 | switch(endian) {
|
|---|
| 59 | case 'v':
|
|---|
| 60 | case 'V':
|
|---|
| 61 | d += SvCUR(result);
|
|---|
| 62 | SvCUR_set(result,SvCUR(result)+size);
|
|---|
| 63 | while (size--) {
|
|---|
| 64 | *d++ = (U8)(value & 0xFF);
|
|---|
| 65 | value >>= 8;
|
|---|
| 66 | }
|
|---|
| 67 | break;
|
|---|
| 68 | case 'n':
|
|---|
| 69 | case 'N':
|
|---|
| 70 | SvCUR_set(result,SvCUR(result)+size);
|
|---|
| 71 | d += SvCUR(result);
|
|---|
| 72 | while (size--) {
|
|---|
| 73 | *--d = (U8)(value & 0xFF);
|
|---|
| 74 | value >>= 8;
|
|---|
| 75 | }
|
|---|
| 76 | break;
|
|---|
| 77 | default:
|
|---|
| 78 | croak("Unknown endian %c",(char) endian);
|
|---|
| 79 | break;
|
|---|
| 80 | }
|
|---|
| 81 | }
|
|---|
| 82 |
|
|---|
| 83 | MODULE = Encode::Unicode PACKAGE = Encode::Unicode
|
|---|
| 84 |
|
|---|
| 85 | PROTOTYPES: DISABLE
|
|---|
| 86 |
|
|---|
| 87 | #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
|
|---|
| 88 | *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
|
|---|
| 89 |
|
|---|
| 90 | void
|
|---|
| 91 | decode_xs(obj, str, check = 0)
|
|---|
| 92 | SV * obj
|
|---|
| 93 | SV * str
|
|---|
| 94 | IV check
|
|---|
| 95 | CODE:
|
|---|
| 96 | {
|
|---|
| 97 | U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
|
|---|
| 98 | int size = SvIV(attr("size", 4));
|
|---|
| 99 | int ucs2 = SvTRUE(attr("ucs2", 4));
|
|---|
| 100 | int renewed = SvTRUE(attr("renewed", 7));
|
|---|
| 101 | SV *result = newSVpvn("",0);
|
|---|
| 102 | STRLEN ulen;
|
|---|
| 103 | U8 *s = (U8 *)SvPVbyte(str,ulen);
|
|---|
| 104 | U8 *e = (U8 *)SvEND(str);
|
|---|
| 105 | ST(0) = sv_2mortal(result);
|
|---|
| 106 | SvUTF8_on(result);
|
|---|
| 107 |
|
|---|
| 108 | if (!endian && s+size <= e) {
|
|---|
| 109 | UV bom;
|
|---|
| 110 | endian = (size == 4) ? 'N' : 'n';
|
|---|
| 111 | bom = enc_unpack(aTHX_ &s,e,size,endian);
|
|---|
| 112 | if (bom != BOM_BE) {
|
|---|
| 113 | if (bom == BOM16LE) {
|
|---|
| 114 | endian = 'v';
|
|---|
| 115 | }
|
|---|
| 116 | else if (bom == BOM32LE) {
|
|---|
| 117 | endian = 'V';
|
|---|
| 118 | }
|
|---|
| 119 | else {
|
|---|
| 120 | croak("%"SVf":Unrecognised BOM %"UVxf,
|
|---|
| 121 | *hv_fetch((HV *)SvRV(obj),"Name",4,0),
|
|---|
| 122 | bom);
|
|---|
| 123 | }
|
|---|
| 124 | }
|
|---|
| 125 | #if 1
|
|---|
| 126 | /* Update endian for next sequence */
|
|---|
| 127 | if (renewed) {
|
|---|
| 128 | hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
|
|---|
| 129 | }
|
|---|
| 130 | #endif
|
|---|
| 131 | }
|
|---|
| 132 | while (s < e && s+size <= e) {
|
|---|
| 133 | UV ord = enc_unpack(aTHX_ &s,e,size,endian);
|
|---|
| 134 | U8 *d;
|
|---|
| 135 | if (size != 4 && invalid_ucs2(ord)) {
|
|---|
| 136 | if (ucs2) {
|
|---|
| 137 | if (check) {
|
|---|
| 138 | croak("%"SVf":no surrogates allowed %"UVxf,
|
|---|
| 139 | *hv_fetch((HV *)SvRV(obj),"Name",4,0),
|
|---|
| 140 | ord);
|
|---|
| 141 | }
|
|---|
| 142 | if (s+size <= e) {
|
|---|
| 143 | /* skip the next one as well */
|
|---|
| 144 | enc_unpack(aTHX_ &s,e,size,endian);
|
|---|
| 145 | }
|
|---|
| 146 | ord = FBCHAR;
|
|---|
| 147 | }
|
|---|
| 148 | else {
|
|---|
| 149 | UV lo;
|
|---|
| 150 | if (!isHiSurrogate(ord)) {
|
|---|
| 151 | croak("%"SVf":Malformed HI surrogate %"UVxf,
|
|---|
| 152 | *hv_fetch((HV *)SvRV(obj),"Name",4,0),
|
|---|
| 153 | ord);
|
|---|
| 154 | }
|
|---|
| 155 | if (s+size > e) {
|
|---|
| 156 | /* Partial character */
|
|---|
| 157 | s -= size; /* back up to 1st half */
|
|---|
| 158 | break; /* And exit loop */
|
|---|
| 159 | }
|
|---|
| 160 | lo = enc_unpack(aTHX_ &s,e,size,endian);
|
|---|
| 161 | if (!isLoSurrogate(lo)){
|
|---|
| 162 | croak("%"SVf":Malformed LO surrogate %"UVxf,
|
|---|
| 163 | *hv_fetch((HV *)SvRV(obj),"Name",4,0),
|
|---|
| 164 | ord);
|
|---|
| 165 | }
|
|---|
| 166 | ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
|
|---|
| 167 | }
|
|---|
| 168 | }
|
|---|
| 169 | d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
|
|---|
| 170 | d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
|
|---|
| 171 | SvCUR_set(result,d - (U8 *)SvPVX(result));
|
|---|
| 172 | }
|
|---|
| 173 | if (s < e) {
|
|---|
| 174 | /* unlikely to happen because it's fixed-length -- dankogai */
|
|---|
| 175 | if (check & ENCODE_WARN_ON_ERR){
|
|---|
| 176 | Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
|
|---|
| 177 | *hv_fetch((HV *)SvRV(obj),"Name",4,0));
|
|---|
| 178 | }
|
|---|
| 179 | }
|
|---|
| 180 | if (check && !(check & ENCODE_LEAVE_SRC)){
|
|---|
| 181 | if (s < e) {
|
|---|
| 182 | Move(s,SvPVX(str),e-s,U8);
|
|---|
| 183 | SvCUR_set(str,(e-s));
|
|---|
| 184 | }
|
|---|
| 185 | else {
|
|---|
| 186 | SvCUR_set(str,0);
|
|---|
| 187 | }
|
|---|
| 188 | *SvEND(str) = '\0';
|
|---|
| 189 | }
|
|---|
| 190 | XSRETURN(1);
|
|---|
| 191 | }
|
|---|
| 192 |
|
|---|
| 193 | void
|
|---|
| 194 | encode_xs(obj, utf8, check = 0)
|
|---|
| 195 | SV * obj
|
|---|
| 196 | SV * utf8
|
|---|
| 197 | IV check
|
|---|
| 198 | CODE:
|
|---|
| 199 | {
|
|---|
| 200 | U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
|
|---|
| 201 | int size = SvIV(attr("size", 4));
|
|---|
| 202 | int ucs2 = SvTRUE(attr("ucs2", 4));
|
|---|
| 203 | int renewed = SvTRUE(attr("renewed", 7));
|
|---|
| 204 | SV *result = newSVpvn("",0);
|
|---|
| 205 | STRLEN ulen;
|
|---|
| 206 | U8 *s = (U8 *)SvPVutf8(utf8,ulen);
|
|---|
| 207 | U8 *e = (U8 *)SvEND(utf8);
|
|---|
| 208 | ST(0) = sv_2mortal(result);
|
|---|
| 209 | if (!endian) {
|
|---|
| 210 | endian = (size == 4) ? 'N' : 'n';
|
|---|
| 211 | enc_pack(aTHX_ result,size,endian,BOM_BE);
|
|---|
| 212 | #if 1
|
|---|
| 213 | /* Update endian for next sequence */
|
|---|
| 214 | if (renewed){
|
|---|
| 215 | hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
|
|---|
| 216 | }
|
|---|
| 217 | #endif
|
|---|
| 218 | }
|
|---|
| 219 | while (s < e && s+UTF8SKIP(s) <= e) {
|
|---|
| 220 | STRLEN len;
|
|---|
| 221 | UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
|
|---|
| 222 | s += len;
|
|---|
| 223 | if (size != 4 && invalid_ucs2(ord)) {
|
|---|
| 224 | if (!issurrogate(ord)){
|
|---|
| 225 | if (ucs2) {
|
|---|
| 226 | if (check) {
|
|---|
| 227 | croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
|
|---|
| 228 | *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
|
|---|
| 229 | }
|
|---|
| 230 | enc_pack(aTHX_ result,size,endian,FBCHAR);
|
|---|
| 231 | }else{
|
|---|
| 232 | UV hi = ((ord - 0x10000) >> 10) + 0xD800;
|
|---|
| 233 | UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
|
|---|
| 234 | enc_pack(aTHX_ result,size,endian,hi);
|
|---|
| 235 | enc_pack(aTHX_ result,size,endian,lo);
|
|---|
| 236 | }
|
|---|
| 237 | }
|
|---|
| 238 | else {
|
|---|
| 239 | /* not supposed to happen */
|
|---|
| 240 | enc_pack(aTHX_ result,size,endian,FBCHAR);
|
|---|
| 241 | }
|
|---|
| 242 | }
|
|---|
| 243 | else {
|
|---|
| 244 | enc_pack(aTHX_ result,size,endian,ord);
|
|---|
| 245 | }
|
|---|
| 246 | }
|
|---|
| 247 | if (s < e) {
|
|---|
| 248 | /* UTF-8 partial char happens often on PerlIO.
|
|---|
| 249 | Since this is okay and normal, we do not warn.
|
|---|
| 250 | But this is critical when you choose to LEAVE_SRC
|
|---|
| 251 | in which case we die */
|
|---|
| 252 | if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){
|
|---|
| 253 | Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
|
|---|
| 254 | "when CHECK = 0x%" UVuf,
|
|---|
| 255 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
|
|---|
| 256 | }
|
|---|
| 257 |
|
|---|
| 258 | }
|
|---|
| 259 | if (check && !(check & ENCODE_LEAVE_SRC)){
|
|---|
| 260 | if (s < e) {
|
|---|
| 261 | Move(s,SvPVX(utf8),e-s,U8);
|
|---|
| 262 | SvCUR_set(utf8,(e-s));
|
|---|
| 263 | }
|
|---|
| 264 | else {
|
|---|
| 265 | SvCUR_set(utf8,0);
|
|---|
| 266 | }
|
|---|
| 267 | *SvEND(utf8) = '\0';
|
|---|
| 268 | }
|
|---|
| 269 | XSRETURN(1);
|
|---|
| 270 | }
|
|---|
| 271 |
|
|---|