Geant4 11.1.1
Toolkit for the simulation of the passage of particles through matter
Loading...
Searching...
No Matches
xmltok.c
Go to the documentation of this file.
1/*
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000 Clark Cooper <[email protected]>
11 Copyright (c) 2001-2003 Fred L. Drake, Jr. <[email protected]>
12 Copyright (c) 2002 Greg Stein <[email protected]>
13 Copyright (c) 2002-2016 Karl Waclawek <[email protected]>
14 Copyright (c) 2005-2009 Steven Solie <[email protected]>
15 Copyright (c) 2016-2022 Sebastian Pipping <[email protected]>
16 Copyright (c) 2016 Pascal Cuoq <[email protected]>
17 Copyright (c) 2016 Don Lewis <[email protected]>
18 Copyright (c) 2017 Rhodri James <[email protected]>
19 Copyright (c) 2017 Alexander Bluhm <[email protected]>
20 Copyright (c) 2017 Benbuck Nason <[email protected]>
21 Copyright (c) 2017 José Gutiérrez de la Concha <[email protected]>
22 Copyright (c) 2019 David Loffredo <[email protected]>
23 Copyright (c) 2021 Dong-hee Na <[email protected]>
24 Copyright (c) 2022 Martin Ettl <[email protected]>
25 Licensed under the MIT license:
26
27 Permission is hereby granted, free of charge, to any person obtaining
28 a copy of this software and associated documentation files (the
29 "Software"), to deal in the Software without restriction, including
30 without limitation the rights to use, copy, modify, merge, publish,
31 distribute, sublicense, and/or sell copies of the Software, and to permit
32 persons to whom the Software is furnished to do so, subject to the
33 following conditions:
34
35 The above copyright notice and this permission notice shall be included
36 in all copies or substantial portions of the Software.
37
38 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
39 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
40 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
41 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
42 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
43 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
44 USE OR OTHER DEALINGS IN THE SOFTWARE.
45*/
46
47#include <expat_config.h>
48
49#include <stddef.h>
50#include <string.h> /* memcpy */
51#include <stdbool.h>
52
53#ifdef _WIN32
54# include "winconfig.h"
55#endif
56
57#include "expat_external.h"
58#include "internal.h"
59#include "xmltok.h"
60#include "nametab.h"
61
62#ifdef XML_DTD
63# define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
64#else
65# define IGNORE_SECTION_TOK_VTABLE /* as nothing */
66#endif
67
68#define VTABLE1 \
69 {PREFIX(prologTok), PREFIX(contentTok), \
70 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE}, \
71 {PREFIX(attributeValueTok), PREFIX(entityValueTok)}, \
72 PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS), \
73 PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName), \
74 PREFIX(updatePosition), PREFIX(isPublicId)
75
76#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
77
/* Test one bit of namingBitmap for a 2-byte character given as a high byte
   (hi) and a low byte (lo).  (pages)[(hi)] shifted left by three is the base
   index, the top three bits of lo are added to it to pick the bitmap word,
   and the low five bits of lo pick the bit inside that word.  Evaluates to a
   non-zero value when the bit is set.  Every argument is parenthesized so
   that expression arguments such as `tbl + 1` expand as intended. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F)))
80
81/* A 2 byte UTF-8 representation splits the character's 11 bits between
82 the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
83 pages, 3 bits to add to that index and 5 bits to generate the mask.
84*/
85#define UTF8_GET_NAMING2(pages, byte) \
86 (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
87 + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)] \
88 & (1u << (((byte)[1]) & 0x1F)))
89
90/* A 3 byte UTF-8 representation splits the character's 16 bits between
91 the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
92 into pages, 3 bits to add to that index and 5 bits to generate the
93 mask.
94*/
95#define UTF8_GET_NAMING3(pages, byte) \
96 (namingBitmap \
97 [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)] \
98 << 3) \
99 + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \
100 & (1u << (((byte)[2]) & 0x1F)))
101
102/* Detection of invalid UTF-8 sequences is based on Table 3.1B
103 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
104 with the additional restriction of not allowing the Unicode
105 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
106 Implementation details:
107 (A & 0x80) == 0 means A < 0x80
108 and
109 (A & 0xC0) == 0xC0 means A > 0xBF
110*/
111
/* Non-zero when the two bytes at p (a pointer to unsigned char) do not form
   an acceptable 2-byte UTF-8 sequence: the lead byte is below 0xC2, or the
   second byte is not in the range 0x80..0xBF.  The argument is fully
   parenthesized so that pointer expressions such as `buf + 1` are
   dereferenced as a whole. */
#define UTF8_INVALID2(p)                                                       \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
114
/* Non-zero when the three bytes at p (a pointer to unsigned char) do not
   form an acceptable 3-byte UTF-8 sequence.  Checks, in order:
     - the third byte must be 0x80..0xBF, and after the prefix EF,BF it must
       additionally be at most 0xBD (rejects EF,BF,BE and EF,BF,BF);
     - after lead byte 0xE0 the second byte must be 0xA0..0xBF;
     - after lead byte 0xED the second byte must be 0x80..0x9F;
     - otherwise the second byte must be 0x80..0xBF.
   The argument is fully parenthesized so that pointer expressions such as
   `buf + 1` are dereferenced as a whole. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((*(p)) == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
123
/* Non-zero when the four bytes at p (a pointer to unsigned char) do not form
   an acceptable 4-byte UTF-8 sequence.  The third and fourth bytes must be
   0x80..0xBF.  The second byte must be 0x90..0xBF after lead byte 0xF0,
   0x80..0x8F after lead byte 0xF4, and 0x80..0xBF otherwise.  The argument
   is fully parenthesized so that pointer expressions such as `buf + 1` are
   dereferenced as a whole. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*(p)) == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
131
132static int PTRFASTCALL
133isNever(const ENCODING *enc, const char *p) {
134 UNUSED_P(enc);
135 UNUSED_P(p);
136 return 0;
137}
138
139static int PTRFASTCALL
140utf8_isName2(const ENCODING *enc, const char *p) {
141 UNUSED_P(enc);
142 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
143}
144
145static int PTRFASTCALL
146utf8_isName3(const ENCODING *enc, const char *p) {
147 UNUSED_P(enc);
148 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
149}
150
151#define utf8_isName4 isNever
152
153static int PTRFASTCALL
154utf8_isNmstrt2(const ENCODING *enc, const char *p) {
155 UNUSED_P(enc);
156 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
157}
158
159static int PTRFASTCALL
160utf8_isNmstrt3(const ENCODING *enc, const char *p) {
161 UNUSED_P(enc);
162 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
163}
164
165#define utf8_isNmstrt4 isNever
166
167static int PTRFASTCALL
168utf8_isInvalid2(const ENCODING *enc, const char *p) {
169 UNUSED_P(enc);
170 return UTF8_INVALID2((const unsigned char *)p);
171}
172
173static int PTRFASTCALL
174utf8_isInvalid3(const ENCODING *enc, const char *p) {
175 UNUSED_P(enc);
176 return UTF8_INVALID3((const unsigned char *)p);
177}
178
179static int PTRFASTCALL
180utf8_isInvalid4(const ENCODING *enc, const char *p) {
181 UNUSED_P(enc);
182 return UTF8_INVALID4((const unsigned char *)p);
183}
184
187 unsigned char type[256];
188#ifdef XML_MIN_SIZE
189 int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
190 int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
191 int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
192 int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
193 int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
194#endif /* XML_MIN_SIZE */
195 int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
196 int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
197 int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
198 int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
199 int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
200 int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
201 int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
202 int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
203 int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
204};
205
206#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
207
208#ifdef XML_MIN_SIZE
209
210# define STANDARD_VTABLE(E) \
211 E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
212
213#else
214
215# define STANDARD_VTABLE(E) /* as nothing */
216
217#endif
218
219#define NORMAL_VTABLE(E) \
220 E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \
221 E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
222
223#define NULL_VTABLE \
224 /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \
225 /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \
226 /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
227
228static int FASTCALL checkCharRefNumber(int);
229
230#include "xmltok_impl.h"
231#include "ascii.h"
232
233#ifdef XML_MIN_SIZE
234# define sb_isNameMin isNever
235# define sb_isNmstrtMin isNever
236#endif
237
238#ifdef XML_MIN_SIZE
239# define MINBPC(enc) ((enc)->minBytesPerChar)
240#else
241/* minimum bytes per character */
242# define MINBPC(enc) 1
243#endif
244
/* Byte type of the single byte at p: indexes the type[] table of the
   normal_encoding that enc points to, using the byte as an unsigned value.
   Goes through AS_NORMAL_ENCODING so the const qualifier of enc is kept
   instead of being cast away. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)])
247
248#ifdef XML_MIN_SIZE
249static int PTRFASTCALL
250sb_byteType(const ENCODING *enc, const char *p) {
251 return SB_BYTE_TYPE(enc, p);
252}
253# define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
254#else
255# define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
256#endif
257
258#ifdef XML_MIN_SIZE
259# define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
260static int PTRFASTCALL
261sb_byteToAscii(const ENCODING *enc, const char *p) {
262 UNUSED_P(enc);
263 return *p;
264}
265#else
266# define BYTE_TO_ASCII(enc, p) (*(p))
267#endif
268
269#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
270#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
271#ifdef XML_MIN_SIZE
272# define IS_INVALID_CHAR(enc, p, n) \
273 (AS_NORMAL_ENCODING(enc)->isInvalid##n \
274 && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
275#else
276# define IS_INVALID_CHAR(enc, p, n) \
277 (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
278#endif
279
280#ifdef XML_MIN_SIZE
281# define IS_NAME_CHAR_MINBPC(enc, p) \
282 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
283# define IS_NMSTRT_CHAR_MINBPC(enc, p) \
284 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
285#else
286# define IS_NAME_CHAR_MINBPC(enc, p) (0)
287# define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
288#endif
289
290#ifdef XML_MIN_SIZE
291# define CHAR_MATCHES(enc, p, c) \
292 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
293static int PTRCALL
294sb_charMatches(const ENCODING *enc, const char *p, int c) {
295 UNUSED_P(enc);
296 return *p == c;
297}
298#else
299/* c is an ASCII character */
300# define CHAR_MATCHES(enc, p, c) (*(p) == (c))
301#endif
302
303#define PREFIX(ident) normal_##ident
304#define XML_TOK_IMPL_C
305#include "xmltok_impl.c"
306#undef XML_TOK_IMPL_C
307
308#undef MINBPC
309#undef BYTE_TYPE
310#undef BYTE_TO_ASCII
311#undef CHAR_MATCHES
312#undef IS_NAME_CHAR
313#undef IS_NAME_CHAR_MINBPC
314#undef IS_NMSTRT_CHAR
315#undef IS_NMSTRT_CHAR_MINBPC
316#undef IS_INVALID_CHAR
317
318enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
322 UTF8_cval4 = 0xf0
324
325void
327 const char **fromLimRef) {
328 const char *fromLim = *fromLimRef;
329 size_t walked = 0;
330 for (; fromLim > from; fromLim--, walked++) {
331 const unsigned char prev = (unsigned char)fromLim[-1];
332 if ((prev & 0xf8u)
333 == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
334 if (walked + 1 >= 4) {
335 fromLim += 4 - 1;
336 break;
337 } else {
338 walked = 0;
339 }
340 } else if ((prev & 0xf0u)
341 == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
342 if (walked + 1 >= 3) {
343 fromLim += 3 - 1;
344 break;
345 } else {
346 walked = 0;
347 }
348 } else if ((prev & 0xe0u)
349 == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
350 if (walked + 1 >= 2) {
351 fromLim += 2 - 1;
352 break;
353 } else {
354 walked = 0;
355 }
356 } else if ((prev & 0x80u)
357 == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
358 break;
359 }
360 }
361 *fromLimRef = fromLim;
362}
363
364static enum XML_Convert_Result PTRCALL
365utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
366 char **toP, const char *toLim) {
367 bool input_incomplete = false;
368 bool output_exhausted = false;
369
370 /* Avoid copying partial characters (due to limited space). */
371 const ptrdiff_t bytesAvailable = fromLim - *fromP;
372 const ptrdiff_t bytesStorable = toLim - *toP;
373 UNUSED_P(enc);
374 if (bytesAvailable > bytesStorable) {
375 fromLim = *fromP + bytesStorable;
376 output_exhausted = true;
377 }
378
379 /* Avoid copying partial characters (from incomplete input). */
380 {
381 const char *const fromLimBefore = fromLim;
383 if (fromLim < fromLimBefore) {
384 input_incomplete = true;
385 }
386 }
387
388 {
389 const ptrdiff_t bytesToCopy = fromLim - *fromP;
390 memcpy(*toP, *fromP, bytesToCopy);
391 *fromP += bytesToCopy;
392 *toP += bytesToCopy;
393 }
394
395 if (output_exhausted) /* needs to go first */
397 else if (input_incomplete)
399 else
401}
402
403static enum XML_Convert_Result PTRCALL
404utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
405 unsigned short **toP, const unsigned short *toLim) {
407 unsigned short *to = *toP;
408 const char *from = *fromP;
409 while (from < fromLim && to < toLim) {
410 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
411 case BT_LEAD2:
412 if (fromLim - from < 2) {
414 goto after;
415 }
416 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
417 from += 2;
418 break;
419 case BT_LEAD3:
420 if (fromLim - from < 3) {
422 goto after;
423 }
424 *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
425 | (from[2] & 0x3f));
426 from += 3;
427 break;
428 case BT_LEAD4: {
429 unsigned long n;
430 if (toLim - to < 2) {
432 goto after;
433 }
434 if (fromLim - from < 4) {
436 goto after;
437 }
438 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
439 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
440 n -= 0x10000;
441 to[0] = (unsigned short)((n >> 10) | 0xD800);
442 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
443 to += 2;
444 from += 4;
445 } break;
446 default:
447 *to++ = *from++;
448 break;
449 }
450 }
451 if (from < fromLim)
453after:
454 *fromP = from;
455 *toP = to;
456 return res;
457}
458
459#ifdef XML_NS
/* UTF-8 encoding object compiled only when XML_NS is defined.  The embedded
   ENCODING is initialized from VTABLE1 plus the utf8_toUtf8 / utf8_toUtf16
   converters; the byte-type table is asciitab.h followed by utf8tab.h, and
   the multi-byte predicates are the utf8_ family (NORMAL_VTABLE(utf8_)).
   NOTE(review): the trailing 1, 1, 0 are presumably minBytesPerChar, isUtf8
   and isUtf16 -- confirm against the ENCODING declaration in xmltok.h. */
460static const struct normal_encoding utf8_encoding_ns
461 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
462 {
463# include "asciitab.h"
464# include "utf8tab.h"
465 },
466 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
467#endif
468
/* UTF-8 encoding object that is always compiled.  Same initializer as
   utf8_encoding_ns, except that BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included, so table entries written as BT_COLON take the
   name-start byte type here.  The trailing 1, 1, 0 are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against xmltok.h. */
469static const struct normal_encoding utf8_encoding
470 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
471 {
472#define BT_COLON BT_NMSTRT
473#include "asciitab.h"
474#undef BT_COLON
475#include "utf8tab.h"
476 },
477 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
478
479#ifdef XML_NS
480
/* Variant of the UTF-8 encoding object whose byte-type table starts from
   iasciitab.h instead of asciitab.h; compiled only when XML_NS is defined.
   Converters and predicates are the same utf8_ functions.
   NOTE(review): how iasciitab.h differs from asciitab.h is not visible
   here -- see that header. */
481static const struct normal_encoding internal_utf8_encoding_ns
482 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
483 {
484# include "iasciitab.h"
485# include "utf8tab.h"
486 },
487 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
488
489#endif
490
/* Always-compiled variant of the UTF-8 encoding object built from
   iasciitab.h, with BT_COLON defined as BT_NMSTRT while that table is
   included so entries written as BT_COLON take the name-start byte type.
   Converters and predicates are the same utf8_ functions. */
491static const struct normal_encoding internal_utf8_encoding
492 = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
493 {
494#define BT_COLON BT_NMSTRT
495#include "iasciitab.h"
496#undef BT_COLON
497#include "utf8tab.h"
498 },
499 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
500
501static enum XML_Convert_Result PTRCALL
502latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
503 char **toP, const char *toLim) {
504 UNUSED_P(enc);
505 for (;;) {
506 unsigned char c;
507 if (*fromP == fromLim)
509 c = (unsigned char)**fromP;
510 if (c & 0x80) {
511 if (toLim - *toP < 2)
513 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
514 *(*toP)++ = (char)((c & 0x3f) | 0x80);
515 (*fromP)++;
516 } else {
517 if (*toP == toLim)
519 *(*toP)++ = *(*fromP)++;
520 }
521 }
522}
523
524static enum XML_Convert_Result PTRCALL
525latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
526 unsigned short **toP, const unsigned short *toLim) {
527 UNUSED_P(enc);
528 while (*fromP < fromLim && *toP < toLim)
529 *(*toP)++ = (unsigned char)*(*fromP)++;
530
531 if ((*toP == toLim) && (*fromP < fromLim))
533 else
535}
536
537#ifdef XML_NS
538
539static const struct normal_encoding latin1_encoding_ns
540 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
541 {
542# include "asciitab.h"
543# include "latin1tab.h"
544 },
546
547#endif
548
549static const struct normal_encoding latin1_encoding
550 = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
551 {
552#define BT_COLON BT_NMSTRT
553#include "asciitab.h"
554#undef BT_COLON
555#include "latin1tab.h"
556 },
558
559static enum XML_Convert_Result PTRCALL
560ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
561 char **toP, const char *toLim) {
562 UNUSED_P(enc);
563 while (*fromP < fromLim && *toP < toLim)
564 *(*toP)++ = *(*fromP)++;
565
566 if ((*toP == toLim) && (*fromP < fromLim))
568 else
570}
571
572#ifdef XML_NS
573
574static const struct normal_encoding ascii_encoding_ns
575 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
576 {
577# include "asciitab.h"
578 /* BT_NONXML == 0 */
579 },
581
582#endif
583
584static const struct normal_encoding ascii_encoding
585 = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
586 {
587#define BT_COLON BT_NMSTRT
588#include "asciitab.h"
589#undef BT_COLON
590 /* BT_NONXML == 0 */
591 },
593
594static int PTRFASTCALL
595unicode_byte_type(char hi, char lo) {
596 switch ((unsigned char)hi) {
597 /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
598 case 0xD8:
599 case 0xD9:
600 case 0xDA:
601 case 0xDB:
602 return BT_LEAD4;
603 /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
604 case 0xDC:
605 case 0xDD:
606 case 0xDE:
607 case 0xDF:
608 return BT_TRAIL;
609 case 0xFF:
610 switch ((unsigned char)lo) {
611 case 0xFF: /* noncharacter-FFFF */
612 case 0xFE: /* noncharacter-FFFE */
613 return BT_NONXML;
614 }
615 break;
616 }
617 return BT_NONASCII;
618}
619
/* Expands to the definition of E##toUtf8(), which converts the 2-byte units
   in [*fromP, fromLim) to UTF-8 written at *toP (bounded by toLim).  Each
   unit's bytes are read through the GET_LO / GET_HI macros in effect where
   this macro is expanded.  An odd trailing input byte is excluded from the
   range before the loop starts.
   Per unit:
     high byte 0 and low byte < 0x80  -> 1 output byte
     high byte 0x00..0x07 otherwise   -> 2 output bytes
     high byte 0xD8..0xDB             -> 4 output bytes, built from this unit
                                         and the following one
     any other high byte              -> 3 output bytes
   Returns XML_CONVERT_OUTPUT_EXHAUSTED when the next sequence does not fit
   in the output, XML_CONVERT_INPUT_INCOMPLETE when a 0xD8..0xDB unit has no
   following unit inside the range (both with *fromP left at that unit), and
   otherwise the outcome of the final `from < fromLim` test. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    while (from < fromLim) {                                                   \
      const unsigned char lo = GET_LO(from);                                   \
      const unsigned char hi = GET_HI(from);                                   \
      if (hi == 0 && lo < 0x80) {                                              \
        /* copied through as a single byte */                                  \
        if (*toP == toLim) {                                                   \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = lo;                                                        \
      } else if (hi <= 0x7) {                                                  \
        /* 11 bits divided 5, 6 amongst 2 bytes */                             \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
      } else if (hi >= 0xD8 && hi <= 0xDB) {                                   \
        /* this unit plus the next one become 4 bytes */                       \
        int plane;                                                             \
        unsigned char lo2;                                                     \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2; /* step onto the second unit of the pair */                 \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
      } else {                                                                 \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
      }                                                                        \
      from += 2;                                                               \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    return XML_CONVERT_COMPLETED;                                              \
  }
696
/* Expands to the definition of E##toUtf16(), which copies the 2-byte units
   in [*fromP, fromLim) into unsigned shorts at *toP (bounded by toLim),
   assembling each value as (GET_HI << 8) | GET_LO with the accessor macros
   in effect where this macro is expanded.  An odd trailing input byte is
   excluded first.  When the input is longer than the output can hold and
   the last input unit's high byte is in 0xD8..0xDF, that unit is excluded
   as well and the pending result becomes XML_CONVERT_INPUT_INCOMPLETE, so
   that the first half of a surrogate pair is not copied on its own.
   Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output filled up with input
   still remaining, otherwise the pending result (XML_CONVERT_COMPLETED by
   default). */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    while (*fromP < fromLim && *toP < toLim) {                                 \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
      *fromP += 2;                                                             \
    }                                                                          \
    if (*toP == toLim && *fromP < fromLim)                                     \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    return res;                                                                \
  }
717
718#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
719#define GET_LO(ptr) ((unsigned char)(ptr)[0])
720#define GET_HI(ptr) ((unsigned char)(ptr)[1])
721
722DEFINE_UTF16_TO_UTF8(little2_)
723DEFINE_UTF16_TO_UTF16(little2_)
724
725#undef SET2
726#undef GET_LO
727#undef GET_HI
728
729#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
730#define GET_LO(ptr) ((unsigned char)(ptr)[1])
731#define GET_HI(ptr) ((unsigned char)(ptr)[0])
732
735
736#undef SET2
737#undef GET_LO
738#undef GET_HI
739
/* Accessors for 2-byte characters stored low byte first: (p)[0] is the low
   byte and (p)[1] the high byte.

   LITTLE2_BYTE_TYPE: when the high byte is 0, the byte type comes from the
   encoding's type[] table indexed by the low byte; otherwise it comes from
   unicode_byte_type(high, low).  The table is reached via
   AS_NORMAL_ENCODING so the const qualifier of enc is not cast away.
   LITTLE2_BYTE_TO_ASCII: the low byte when the high byte is 0, else -1.
   LITTLE2_CHAR_MATCHES: true when the high byte is 0 and the low byte
   equals c.
   LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC: bitmap lookup
   via UCS2_GET_NAMING through namePages / nmstrtPages.

   The macro argument p is parenthesized at every use so that pointer
   expressions expand correctly. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)]            \
               : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
749
750#ifdef XML_MIN_SIZE
751
752static int PTRFASTCALL
753little2_byteType(const ENCODING *enc, const char *p) {
754 return LITTLE2_BYTE_TYPE(enc, p);
755}
756
757static int PTRFASTCALL
758little2_byteToAscii(const ENCODING *enc, const char *p) {
759 UNUSED_P(enc);
760 return LITTLE2_BYTE_TO_ASCII(p);
761}
762
763static int PTRCALL
764little2_charMatches(const ENCODING *enc, const char *p, int c) {
765 UNUSED_P(enc);
766 return LITTLE2_CHAR_MATCHES(p, c);
767}
768
769static int PTRFASTCALL
770little2_isNameMin(const ENCODING *enc, const char *p) {
771 UNUSED_P(enc);
773}
774
775static int PTRFASTCALL
776little2_isNmstrtMin(const ENCODING *enc, const char *p) {
777 UNUSED_P(enc);
779}
780
781# undef VTABLE
782# define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
783
784#else /* not XML_MIN_SIZE */
785
786# undef PREFIX
787# define PREFIX(ident) little2_##ident
788# define MINBPC(enc) 2
789/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
790# define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
791# define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
792# define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
793# define IS_NAME_CHAR(enc, p, n) 0
794# define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
795# define IS_NMSTRT_CHAR(enc, p, n) (0)
796# define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
797
798# define XML_TOK_IMPL_C
799# include "xmltok_impl.c"
800# undef XML_TOK_IMPL_C
801
802# undef MINBPC
803# undef BYTE_TYPE
804# undef BYTE_TO_ASCII
805# undef CHAR_MATCHES
806# undef IS_NAME_CHAR
807# undef IS_NAME_CHAR_MINBPC
808# undef IS_NMSTRT_CHAR
809# undef IS_NMSTRT_CHAR_MINBPC
810# undef IS_INVALID_CHAR
811
812#endif /* not XML_MIN_SIZE */
813
814#ifdef XML_NS
815
/* Encoding object for 2-byte, low-byte-first input; compiled only when
   XML_NS is defined.  The last value of the embedded ENCODING initializer
   is 1 only when BYTEORDER == 1234, otherwise 0.  The byte-type table is
   asciitab.h followed by latin1tab.h, and all nine isName / isNmstrt /
   isInvalid slots are NULL (NULL_VTABLE).
   NOTE(review): the values after VTABLE are presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against xmltok.h. */
816static const struct normal_encoding little2_encoding_ns
817 = {{VTABLE, 2, 0,
818# if BYTEORDER == 1234
819 1
820# else
821 0
822# endif
823 },
824 {
825# include "asciitab.h"
826# include "latin1tab.h"
827 },
828 STANDARD_VTABLE(little2_) NULL_VTABLE};
829
830#endif
831
/* Always-compiled encoding object for 2-byte, low-byte-first input.  Same
   layout as little2_encoding_ns, except that BT_COLON is defined as
   BT_NMSTRT while asciitab.h is included, so table entries written as
   BT_COLON take the name-start byte type.  The last value of the embedded
   ENCODING initializer is 1 only when BYTEORDER == 1234. */
832static const struct normal_encoding little2_encoding
833 = {{VTABLE, 2, 0,
834#if BYTEORDER == 1234
835 1
836#else
837 0
838#endif
839 },
840 {
841#define BT_COLON BT_NMSTRT
842#include "asciitab.h"
843#undef BT_COLON
844#include "latin1tab.h"
845 },
846 STANDARD_VTABLE(little2_) NULL_VTABLE};
847
848#if BYTEORDER != 4321
849
850# ifdef XML_NS
851
/* Variant of the low-byte-first encoding object built from iasciitab.h;
   present only when BYTEORDER != 4321 and XML_NS is defined.  The last
   value of the embedded ENCODING initializer is unconditionally 1, and all
   isName / isNmstrt / isInvalid slots are NULL (NULL_VTABLE). */
852static const struct normal_encoding internal_little2_encoding_ns
853 = {{VTABLE, 2, 0, 1},
854 {
855# include "iasciitab.h"
856# include "latin1tab.h"
857 },
858 STANDARD_VTABLE(little2_) NULL_VTABLE};
859
860# endif
861
/* Variant of the low-byte-first encoding object built from iasciitab.h with
   BT_COLON defined as BT_NMSTRT during that include; present only when
   BYTEORDER != 4321.  The last value of the embedded ENCODING initializer
   is unconditionally 1. */
862static const struct normal_encoding internal_little2_encoding
863 = {{VTABLE, 2, 0, 1},
864 {
865# define BT_COLON BT_NMSTRT
866# include "iasciitab.h"
867# undef BT_COLON
868# include "latin1tab.h"
869 },
870 STANDARD_VTABLE(little2_) NULL_VTABLE};
871
872#endif
873
/* Accessors for 2-byte characters stored high byte first: (p)[0] is the high
   byte and (p)[1] the low byte.

   BIG2_BYTE_TYPE: when the high byte is 0, the byte type comes from the
   encoding's type[] table indexed by the low byte; otherwise it comes from
   unicode_byte_type(high, low).  The table is reached via
   AS_NORMAL_ENCODING so the const qualifier of enc is not cast away.
   BIG2_BYTE_TO_ASCII: the low byte when the high byte is 0, else -1.
   BIG2_CHAR_MATCHES: true when the high byte is 0 and the low byte equals c.
   BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC: bitmap lookup via
   UCS2_GET_NAMING through namePages / nmstrtPages.

   The macro argument p is parenthesized at every use so that pointer
   expressions expand correctly. */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0 ? AS_NORMAL_ENCODING(enc)->type[(unsigned char)(p)[1]]          \
               : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
884
885#ifdef XML_MIN_SIZE
886
887static int PTRFASTCALL
888big2_byteType(const ENCODING *enc, const char *p) {
889 return BIG2_BYTE_TYPE(enc, p);
890}
891
892static int PTRFASTCALL
893big2_byteToAscii(const ENCODING *enc, const char *p) {
894 UNUSED_P(enc);
895 return BIG2_BYTE_TO_ASCII(p);
896}
897
898static int PTRCALL
899big2_charMatches(const ENCODING *enc, const char *p, int c) {
900 UNUSED_P(enc);
901 return BIG2_CHAR_MATCHES(p, c);
902}
903
904static int PTRFASTCALL
905big2_isNameMin(const ENCODING *enc, const char *p) {
906 UNUSED_P(enc);
907 return BIG2_IS_NAME_CHAR_MINBPC(p);
908}
909
910static int PTRFASTCALL
911big2_isNmstrtMin(const ENCODING *enc, const char *p) {
912 UNUSED_P(enc);
914}
915
916# undef VTABLE
917# define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
918
919#else /* not XML_MIN_SIZE */
920
921# undef PREFIX
922# define PREFIX(ident) big2_##ident
923# define MINBPC(enc) 2
924/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
925# define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
926# define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
927# define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
928# define IS_NAME_CHAR(enc, p, n) 0
929# define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
930# define IS_NMSTRT_CHAR(enc, p, n) (0)
931# define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
932
933# define XML_TOK_IMPL_C
934# include "xmltok_impl.c"
935# undef XML_TOK_IMPL_C
936
937# undef MINBPC
938# undef BYTE_TYPE
939# undef BYTE_TO_ASCII
940# undef CHAR_MATCHES
941# undef IS_NAME_CHAR
942# undef IS_NAME_CHAR_MINBPC
943# undef IS_NMSTRT_CHAR
944# undef IS_NMSTRT_CHAR_MINBPC
945# undef IS_INVALID_CHAR
946
947#endif /* not XML_MIN_SIZE */
948
949#ifdef XML_NS
950
951static const struct normal_encoding big2_encoding_ns
952 = {{VTABLE, 2, 0,
953# if BYTEORDER == 4321
954 1
955# else
956 0
957# endif
958 },
959 {
960# include "asciitab.h"
961# include "latin1tab.h"
962 },
964
965#endif
966
967static const struct normal_encoding big2_encoding
968 = {{VTABLE, 2, 0,
969#if BYTEORDER == 4321
970 1
971#else
972 0
973#endif
974 },
975 {
976#define BT_COLON BT_NMSTRT
977#include "asciitab.h"
978#undef BT_COLON
979#include "latin1tab.h"
980 },
982
983#if BYTEORDER != 1234
984
985# ifdef XML_NS
986
987static const struct normal_encoding internal_big2_encoding_ns
988 = {{VTABLE, 2, 0, 1},
989 {
990# include "iasciitab.h"
991# include "latin1tab.h"
992 },
994
995# endif
996
997static const struct normal_encoding internal_big2_encoding
998 = {{VTABLE, 2, 0, 1},
999 {
1000# define BT_COLON BT_NMSTRT
1001# include "iasciitab.h"
1002# undef BT_COLON
1003# include "latin1tab.h"
1004 },
1006
1007#endif
1008
1009#undef PREFIX
1010
1011static int FASTCALL
1012streqci(const char *s1, const char *s2) {
1013 for (;;) {
1014 char c1 = *s1++;
1015 char c2 = *s2++;
1016 if (ASCII_a <= c1 && c1 <= ASCII_z)
1017 c1 += ASCII_A - ASCII_a;
1018 if (ASCII_a <= c2 && c2 <= ASCII_z)
1019 /* The following line will never get executed. streqci() is
1020 * only called from two places, both of which guarantee to put
1021 * upper-case strings into s2.
1022 */
1023 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1024 if (c1 != c2)
1025 return 0;
1026 if (! c1)
1027 break;
1028 }
1029 return 1;
1030}
1031
1032static void PTRCALL
1033initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1034 POSITION *pos) {
1035 UNUSED_P(enc);
1036 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1037}
1038
1039static int
1040toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1041 char buf[1];
1042 char *p = buf;
1043 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1044 if (p == buf)
1045 return -1;
1046 else
1047 return buf[0];
1048}
1049
/* Returns 1 when c is one of the four characters 0x20, 0xD, 0xA or 0x9,
   and 0 for every other value. */
static int FASTCALL
isSpace(int c) {
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
1061
1062/* Return 1 if there's just optional white space or there's an S
1063 followed by name=val.
1064*/
1065static int
1066parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1067 const char **namePtr, const char **nameEndPtr,
1068 const char **valPtr, const char **nextTokPtr) {
1069 int c;
1070 char open;
1071 if (ptr == end) {
1072 *namePtr = NULL;
1073 return 1;
1074 }
1075 if (! isSpace(toAscii(enc, ptr, end))) {
1076 *nextTokPtr = ptr;
1077 return 0;
1078 }
1079 do {
1080 ptr += enc->minBytesPerChar;
1081 } while (isSpace(toAscii(enc, ptr, end)));
1082 if (ptr == end) {
1083 *namePtr = NULL;
1084 return 1;
1085 }
1086 *namePtr = ptr;
1087 for (;;) {
1088 c = toAscii(enc, ptr, end);
1089 if (c == -1) {
1090 *nextTokPtr = ptr;
1091 return 0;
1092 }
1093 if (c == ASCII_EQUALS) {
1094 *nameEndPtr = ptr;
1095 break;
1096 }
1097 if (isSpace(c)) {
1098 *nameEndPtr = ptr;
1099 do {
1100 ptr += enc->minBytesPerChar;
1101 } while (isSpace(c = toAscii(enc, ptr, end)));
1102 if (c != ASCII_EQUALS) {
1103 *nextTokPtr = ptr;
1104 return 0;
1105 }
1106 break;
1107 }
1108 ptr += enc->minBytesPerChar;
1109 }
1110 if (ptr == *namePtr) {
1111 *nextTokPtr = ptr;
1112 return 0;
1113 }
1114 ptr += enc->minBytesPerChar;
1115 c = toAscii(enc, ptr, end);
1116 while (isSpace(c)) {
1117 ptr += enc->minBytesPerChar;
1118 c = toAscii(enc, ptr, end);
1119 }
1120 if (c != ASCII_QUOT && c != ASCII_APOS) {
1121 *nextTokPtr = ptr;
1122 return 0;
1123 }
1124 open = (char)c;
1125 ptr += enc->minBytesPerChar;
1126 *valPtr = ptr;
1127 for (;; ptr += enc->minBytesPerChar) {
1128 c = toAscii(enc, ptr, end);
1129 if (c == open)
1130 break;
1131 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1132 && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1133 && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1134 *nextTokPtr = ptr;
1135 return 0;
1136 }
1137 }
1138 *nextTokPtr = ptr + enc->minBytesPerChar;
1139 return 1;
1140}
1141
1142static const char KW_version[]
1144
/* NUL-terminated keyword e-n-c-o-d-i-n-g spelled with ASCII_* constants;
   doParseXmlDecl() matches pseudo-attribute names against it with
   XmlNameMatchesAscii(). */
1145static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1146 ASCII_i, ASCII_n, ASCII_g, '\0'};
1147
1148static const char KW_standalone[]
1150 ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1151
/* NUL-terminated keyword y-e-s spelled with ASCII_* constants; when the
   standalone value matches it, doParseXmlDecl() stores 1 in *standalone. */
1152static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1153
/* NUL-terminated keyword n-o spelled with ASCII_* constants; when the
   standalone value matches it, doParseXmlDecl() stores 0 in *standalone. */
1154static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1155
1156static int
1157doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1158 const char *),
1159 int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1160 const char *end, const char **badPtr, const char **versionPtr,
1161 const char **versionEndPtr, const char **encodingName,
1162 const ENCODING **encoding, int *standalone) {
1163 const char *val = NULL;
1164 const char *name = NULL;
1165 const char *nameEnd = NULL;
1166 ptr += 5 * enc->minBytesPerChar;
1167 end -= 2 * enc->minBytesPerChar;
1168 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1169 || ! name) {
1170 *badPtr = ptr;
1171 return 0;
1172 }
1173 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1174 if (! isGeneralTextEntity) {
1175 *badPtr = name;
1176 return 0;
1177 }
1178 } else {
1179 if (versionPtr)
1180 *versionPtr = val;
1181 if (versionEndPtr)
1182 *versionEndPtr = ptr;
1183 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1184 *badPtr = ptr;
1185 return 0;
1186 }
1187 if (! name) {
1188 if (isGeneralTextEntity) {
1189 /* a TextDecl must have an EncodingDecl */
1190 *badPtr = ptr;
1191 return 0;
1192 }
1193 return 1;
1194 }
1195 }
1196 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1197 int c = toAscii(enc, val, end);
1198 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1199 *badPtr = val;
1200 return 0;
1201 }
1202 if (encodingName)
1203 *encodingName = val;
1204 if (encoding)
1205 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1206 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1207 *badPtr = ptr;
1208 return 0;
1209 }
1210 if (! name)
1211 return 1;
1212 }
1213 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1214 || isGeneralTextEntity) {
1215 *badPtr = name;
1216 return 0;
1217 }
1218 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1219 if (standalone)
1220 *standalone = 1;
1221 } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1222 if (standalone)
1223 *standalone = 0;
1224 } else {
1225 *badPtr = val;
1226 return 0;
1227 }
1228 while (isSpace(toAscii(enc, ptr, end)))
1229 ptr += enc->minBytesPerChar;
1230 if (ptr != end) {
1231 *badPtr = ptr;
1232 return 0;
1233 }
1234 return 1;
1235}
1236
1237static int FASTCALL
1238checkCharRefNumber(int result) {
1239 switch (result >> 8) {
1240 case 0xD8:
1241 case 0xD9:
1242 case 0xDA:
1243 case 0xDB:
1244 case 0xDC:
1245 case 0xDD:
1246 case 0xDE:
1247 case 0xDF:
1248 return -1;
1249 case 0:
1250 if (latin1_encoding.type[result] == BT_NONXML)
1251 return -1;
1252 break;
1253 case 0xFF:
1254 if (result == 0xFFFE || result == 0xFFFF)
1255 return -1;
1256 break;
1257 }
1258 return result;
1259}
1260
1261int FASTCALL
1262XmlUtf8Encode(int c, char *buf) {
1263 enum {
1264 /* minN is minimum legal resulting value for N byte sequence */
1265 min2 = 0x80,
1266 min3 = 0x800,
1267 min4 = 0x10000
1268 };
1269
1270 if (c < 0)
1271 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1272 if (c < min2) {
1273 buf[0] = (char)(c | UTF8_cval1);
1274 return 1;
1275 }
1276 if (c < min3) {
1277 buf[0] = (char)((c >> 6) | UTF8_cval2);
1278 buf[1] = (char)((c & 0x3f) | 0x80);
1279 return 2;
1280 }
1281 if (c < min4) {
1282 buf[0] = (char)((c >> 12) | UTF8_cval3);
1283 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1284 buf[2] = (char)((c & 0x3f) | 0x80);
1285 return 3;
1286 }
1287 if (c < 0x110000) {
1288 buf[0] = (char)((c >> 18) | UTF8_cval4);
1289 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1290 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1291 buf[3] = (char)((c & 0x3f) | 0x80);
1292 return 4;
1293 }
1294 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1295}
1296
1297int FASTCALL
1298XmlUtf16Encode(int charNum, unsigned short *buf) {
1299 if (charNum < 0)
1300 return 0;
1301 if (charNum < 0x10000) {
1302 buf[0] = (unsigned short)charNum;
1303 return 1;
1304 }
1305 if (charNum < 0x110000) {
1306 charNum -= 0x10000;
1307 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1308 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1309 return 2;
1310 }
1311 return 0;
1312}
1313
1318 unsigned short utf16[256];
1319 char utf8[256][4];
1320};
1321
1322#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1323
1324int
1326 return sizeof(struct unknown_encoding);
1327}
1328
1329static int PTRFASTCALL
1330unknown_isName(const ENCODING *enc, const char *p) {
1331 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1332 int c = uenc->convert(uenc->userData, p);
1333 if (c & ~0xFFFF)
1334 return 0;
1335 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1336}
1337
1338static int PTRFASTCALL
1339unknown_isNmstrt(const ENCODING *enc, const char *p) {
1340 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1341 int c = uenc->convert(uenc->userData, p);
1342 if (c & ~0xFFFF)
1343 return 0;
1344 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1345}
1346
1347static int PTRFASTCALL
1348unknown_isInvalid(const ENCODING *enc, const char *p) {
1349 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1350 int c = uenc->convert(uenc->userData, p);
1351 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1352}
1353
1354static enum XML_Convert_Result PTRCALL
1355unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1356 char **toP, const char *toLim) {
1357 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1358 char buf[XML_UTF8_ENCODE_MAX];
1359 for (;;) {
1360 const char *utf8;
1361 int n;
1362 if (*fromP == fromLim)
1363 return XML_CONVERT_COMPLETED;
1364 utf8 = uenc->utf8[(unsigned char)**fromP];
1365 n = *utf8++;
1366 if (n == 0) {
1367 int c = uenc->convert(uenc->userData, *fromP);
1368 n = XmlUtf8Encode(c, buf);
1369 if (n > toLim - *toP)
1371 utf8 = buf;
1372 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1373 - (BT_LEAD2 - 2));
1374 } else {
1375 if (n > toLim - *toP)
1377 (*fromP)++;
1378 }
1379 memcpy(*toP, utf8, n);
1380 *toP += n;
1381 }
1382}
1383
1384static enum XML_Convert_Result PTRCALL
1385unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1386 unsigned short **toP, const unsigned short *toLim) {
1387 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1388 while (*fromP < fromLim && *toP < toLim) {
1389 unsigned short c = uenc->utf16[(unsigned char)**fromP];
1390 if (c == 0) {
1391 c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1392 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1393 - (BT_LEAD2 - 2));
1394 } else
1395 (*fromP)++;
1396 *(*toP)++ = c;
1397 }
1398
1399 if ((*toP == toLim) && (*fromP < fromLim))
1401 else
1402 return XML_CONVERT_COMPLETED;
1403}
1404
1405ENCODING *
1407 void *userData) {
1408 int i;
1409 struct unknown_encoding *e = (struct unknown_encoding *)mem;
1410 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1411 for (i = 0; i < 128; i++)
1412 if (latin1_encoding.type[i] != BT_OTHER
1413 && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1414 return 0;
1415 for (i = 0; i < 256; i++) {
1416 int c = table[i];
1417 if (c == -1) {
1418 e->normal.type[i] = BT_MALFORM;
1419 /* This shouldn't really get used. */
1420 e->utf16[i] = 0xFFFF;
1421 e->utf8[i][0] = 1;
1422 e->utf8[i][1] = 0;
1423 } else if (c < 0) {
1424 if (c < -4)
1425 return 0;
1426 /* Multi-byte sequences need a converter function */
1427 if (! convert)
1428 return 0;
1429 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1430 e->utf8[i][0] = 0;
1431 e->utf16[i] = 0;
1432 } else if (c < 0x80) {
1433 if (latin1_encoding.type[c] != BT_OTHER
1434 && latin1_encoding.type[c] != BT_NONXML && c != i)
1435 return 0;
1436 e->normal.type[i] = latin1_encoding.type[c];
1437 e->utf8[i][0] = 1;
1438 e->utf8[i][1] = (char)c;
1439 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1440 } else if (checkCharRefNumber(c) < 0) {
1441 e->normal.type[i] = BT_NONXML;
1442 /* This shouldn't really get used. */
1443 e->utf16[i] = 0xFFFF;
1444 e->utf8[i][0] = 1;
1445 e->utf8[i][1] = 0;
1446 } else {
1447 if (c > 0xFFFF)
1448 return 0;
1449 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1450 e->normal.type[i] = BT_NMSTRT;
1451 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1452 e->normal.type[i] = BT_NAME;
1453 else
1454 e->normal.type[i] = BT_OTHER;
1455 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1456 e->utf16[i] = (unsigned short)c;
1457 }
1458 }
1459 e->userData = userData;
1460 e->convert = convert;
1461 if (convert) {
1462 e->normal.isName2 = unknown_isName;
1463 e->normal.isName3 = unknown_isName;
1464 e->normal.isName4 = unknown_isName;
1465 e->normal.isNmstrt2 = unknown_isNmstrt;
1466 e->normal.isNmstrt3 = unknown_isNmstrt;
1467 e->normal.isNmstrt4 = unknown_isNmstrt;
1468 e->normal.isInvalid2 = unknown_isInvalid;
1469 e->normal.isInvalid3 = unknown_isInvalid;
1470 e->normal.isInvalid4 = unknown_isInvalid;
1471 }
1472 e->normal.enc.utf8Convert = unknown_toUtf8;
1473 e->normal.enc.utf16Convert = unknown_toUtf16;
1474 return &(e->normal.enc);
1475}
1476
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed. */
enum {
  /* returned by getEncodingIndex for a name it does not recognise */
  UNKNOWN_ENC = -1,
  /* first entry of encodingNames, hence index 0 */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};
1490
1491static const char KW_ISO_8859_1[]
1494static const char KW_US_ASCII[]
1496 ASCII_C, ASCII_I, ASCII_I, '\0'};
1497static const char KW_UTF_8[]
1499static const char KW_UTF_16[]
1501static const char KW_UTF_16BE[]
1503 ASCII_6, ASCII_B, ASCII_E, '\0'};
1504static const char KW_UTF_16LE[]
1506 ASCII_6, ASCII_L, ASCII_E, '\0'};
1507
1508static int FASTCALL
1509getEncodingIndex(const char *name) {
1510 static const char *const encodingNames[] = {
1511 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1512 };
1513 int i;
1514 if (name == NULL)
1515 return NO_ENC;
1516 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1517 if (streqci(name, encodingNames[i]))
1518 return i;
1519 return UNKNOWN_ENC;
1520}
1521
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX
   stores it.  The index argument is parenthesised so that the cast
   applies to the whole argument expression, not just its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1528
1529/* This is what detects the encoding. encodingTable maps from
1530 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1531 the external (protocol) specified encoding; state is
1532 XML_CONTENT_STATE if we're parsing an external text entity, and
1533 XML_PROLOG_STATE otherwise.
1534*/
1535
1536static int
1537initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1538 int state, const char *ptr, const char *end, const char **nextTokPtr) {
1539 const ENCODING **encPtr;
1540
1541 if (ptr >= end)
1542 return XML_TOK_NONE;
1543 encPtr = enc->encPtr;
1544 if (ptr + 1 == end) {
1545 /* only a single byte available for auto-detection */
1546#ifndef XML_DTD /* FIXME */
1547 /* a well-formed document entity must have more than one byte */
1548 if (state != XML_CONTENT_STATE)
1549 return XML_TOK_PARTIAL;
1550#endif
1551 /* so we're parsing an external text entity... */
1552 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1553 switch (INIT_ENC_INDEX(enc)) {
1554 case UTF_16_ENC:
1555 case UTF_16LE_ENC:
1556 case UTF_16BE_ENC:
1557 return XML_TOK_PARTIAL;
1558 }
1559 switch ((unsigned char)*ptr) {
1560 case 0xFE:
1561 case 0xFF:
1562 case 0xEF: /* possibly first byte of UTF-8 BOM */
1563 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1564 break;
1565 /* fall through */
1566 case 0x00:
1567 case 0x3C:
1568 return XML_TOK_PARTIAL;
1569 }
1570 } else {
1571 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1572 case 0xFEFF:
1573 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1574 break;
1575 *nextTokPtr = ptr + 2;
1576 *encPtr = encodingTable[UTF_16BE_ENC];
1577 return XML_TOK_BOM;
1578 /* 00 3C is handled in the default case */
1579 case 0x3C00:
1580 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1581 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1582 && state == XML_CONTENT_STATE)
1583 break;
1584 *encPtr = encodingTable[UTF_16LE_ENC];
1585 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1586 case 0xFFFE:
1587 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1588 break;
1589 *nextTokPtr = ptr + 2;
1590 *encPtr = encodingTable[UTF_16LE_ENC];
1591 return XML_TOK_BOM;
1592 case 0xEFBB:
1593 /* Maybe a UTF-8 BOM (EF BB BF) */
1594 /* If there's an explicitly specified (external) encoding
1595 of ISO-8859-1 or some flavour of UTF-16
1596 and this is an external text entity,
1597 don't look for the BOM,
1598 because it might be a legal data.
1599 */
1600 if (state == XML_CONTENT_STATE) {
1601 int e = INIT_ENC_INDEX(enc);
1602 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1603 || e == UTF_16_ENC)
1604 break;
1605 }
1606 if (ptr + 2 == end)
1607 return XML_TOK_PARTIAL;
1608 if ((unsigned char)ptr[2] == 0xBF) {
1609 *nextTokPtr = ptr + 3;
1610 *encPtr = encodingTable[UTF_8_ENC];
1611 return XML_TOK_BOM;
1612 }
1613 break;
1614 default:
1615 if (ptr[0] == '\0') {
1616 /* 0 isn't a legal data character. Furthermore a document
1617 entity can only start with ASCII characters. So the only
1618 way this can fail to be big-endian UTF-16 if it it's an
1619 external parsed general entity that's labelled as
1620 UTF-16LE.
1621 */
1622 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1623 break;
1624 *encPtr = encodingTable[UTF_16BE_ENC];
1625 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1626 } else if (ptr[1] == '\0') {
1627 /* We could recover here in the case:
1628 - parsing an external entity
1629 - second byte is 0
1630 - no externally specified encoding
1631 - no encoding declaration
1632 by assuming UTF-16LE. But we don't, because this would mean when
1633 presented just with a single byte, we couldn't reliably determine
1634 whether we needed further bytes.
1635 */
1636 if (state == XML_CONTENT_STATE)
1637 break;
1638 *encPtr = encodingTable[UTF_16LE_ENC];
1639 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1640 }
1641 break;
1642 }
1643 }
1644 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1645 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1646}
1647
1648#define NS(x) x
1649#define ns(x) x
1650#define XML_TOK_NS_C
1651#include "xmltok_ns.c"
1652#undef XML_TOK_NS_C
1653#undef NS
1654#undef ns
1655
1656#ifdef XML_NS
1657
1658# define NS(x) x##NS
1659# define ns(x) x##_ns
1660
1661# define XML_TOK_NS_C
1662# include "xmltok_ns.c"
1663# undef XML_TOK_NS_C
1664
1665# undef NS
1666# undef ns
1667
1668ENCODING *
1669XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1670 void *userData) {
1671 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1672 if (enc)
1673 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1674 return enc;
1675}
1676
1677#endif /* XML_NS */
#define ASCII_l
Definition: ascii.h:74
#define ASCII_i
Definition: ascii.h:71
#define ASCII_F
Definition: ascii.h:41
#define ASCII_o
Definition: ascii.h:77
#define ASCII_E
Definition: ascii.h:40
#define ASCII_C
Definition: ascii.h:38
#define ASCII_O
Definition: ascii.h:50
#define ASCII_Z
Definition: ascii.h:61
#define ASCII_n
Definition: ascii.h:76
#define ASCII_s
Definition: ascii.h:81
#define ASCII_UNDERSCORE
Definition: ascii.h:116
#define ASCII_t
Definition: ascii.h:82
#define ASCII_APOS
Definition: ascii.h:106
#define ASCII_c
Definition: ascii.h:65
#define ASCII_PERIOD
Definition: ascii.h:108
#define ASCII_5
Definition: ascii.h:95
#define ASCII_I
Definition: ascii.h:44
#define ASCII_A
Definition: ascii.h:36
#define ASCII_z
Definition: ascii.h:88
#define ASCII_U
Definition: ascii.h:56
#define ASCII_9
Definition: ascii.h:99
#define ASCII_e
Definition: ascii.h:67
#define ASCII_d
Definition: ascii.h:66
#define ASCII_8
Definition: ascii.h:98
#define ASCII_r
Definition: ascii.h:80
#define ASCII_y
Definition: ascii.h:87
#define ASCII_COLON
Definition: ascii.h:109
#define ASCII_0
Definition: ascii.h:90
#define ASCII_QUOT
Definition: ascii.h:104
#define ASCII_L
Definition: ascii.h:47
#define ASCII_1
Definition: ascii.h:91
#define ASCII_a
Definition: ascii.h:63
#define ASCII_6
Definition: ascii.h:96
#define ASCII_B
Definition: ascii.h:37
#define ASCII_S
Definition: ascii.h:54
#define ASCII_g
Definition: ascii.h:69
#define ASCII_EQUALS
Definition: ascii.h:112
#define ASCII_T
Definition: ascii.h:55
#define ASCII_MINUS
Definition: ascii.h:107
#define ASCII_v
Definition: ascii.h:84
BT_OTHER
Definition: asciitab.h:44
BT_NMSTRT
Definition: asciitab.h:52
BT_NAME
Definition: asciitab.h:46
BT_NONXML
Definition: asciitab.h:35
#define PTRFASTCALL
Definition: internal.h:89
#define FASTCALL
Definition: internal.h:81
#define UNUSED_P(p)
Definition: internal.h:135
#define PTRCALL
Definition: internal.h:85
const char * name(G4int ptype)
const ENCODING ** encPtr
Definition: xmltok.h:281
int minBytesPerChar
Definition: xmltok.h:195
const char const char const char * int(PTRFASTCALL *isNmstrt2)(const ENCODING *
const char const char const char const char * int(PTRFASTCALL *isNmstrt3)(const ENCODING *
int(PTRFASTCALL *isName2)(const ENCODING *
const char * int(PTRFASTCALL *isName3)(const ENCODING *
unsigned char type[256]
Definition: xmltok.c:187
const char const char const char const char const char const char const char * int(PTRFASTCALL *isInvalid3)(const ENCODING *
const char const char const char const char const char * int(PTRFASTCALL *isNmstrt4)(const ENCODING *
const char const char const char const char const char const char const char const char * int(PTRFASTCALL *isInvalid4)(const ENCODING *
const char const char const char const char const char const char * int(PTRFASTCALL *isInvalid2)(const ENCODING *
const char const char * int(PTRFASTCALL *isName4)(const ENCODING *
ENCODING enc
Definition: xmltok.c:186
unsigned short utf16[256]
Definition: xmltok.c:1318
CONVERTER convert
Definition: xmltok.c:1316
void * userData
Definition: xmltok.c:1317
char utf8[256][4]
Definition: xmltok.c:1319
struct normal_encoding normal
Definition: xmltok.c:1315
BT_LEAD3
Definition: utf8tab.h:59
BT_LEAD4
Definition: utf8tab.h:63
BT_TRAIL
Definition: utf8tab.h:35
BT_MALFORM
Definition: utf8tab.h:66
BT_LEAD2
Definition: utf8tab.h:51
#define XmlInitUnknownEncodingNS
Definition: xmlparse.c:168
#define BIG2_CHAR_MATCHES(p, c)
Definition: xmltok.c:879
#define STANDARD_VTABLE(E)
Definition: xmltok.c:215
#define VTABLE1
Definition: xmltok.c:68
@ UTF8_cval4
Definition: xmltok.c:322
@ UTF8_cval1
Definition: xmltok.c:319
@ UTF8_cval2
Definition: xmltok.c:320
@ UTF8_cval3
Definition: xmltok.c:321
#define UTF8_GET_NAMING3(pages, byte)
Definition: xmltok.c:95
#define LITTLE2_BYTE_TO_ASCII(p)
Definition: xmltok.c:743
#define DEFINE_UTF16_TO_UTF8(E)
Definition: xmltok.c:620
#define BIG2_BYTE_TYPE(enc, p)
Definition: xmltok.c:874
#define NORMAL_VTABLE(E)
Definition: xmltok.c:219
void _INTERNAL_trim_to_complete_utf8_characters(const char *from, const char **fromLimRef)
Definition: xmltok.c:326
#define INIT_ENC_INDEX(enc)
Definition: xmltok.c:1526
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
Definition: xmltok.c:747
ENCODING * XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert, void *userData)
Definition: xmltok.c:1406
int XmlSizeOfUnknownEncoding(void)
Definition: xmltok.c:1325
#define NULL_VTABLE
Definition: xmltok.c:223
#define UTF8_INVALID2(p)
Definition: xmltok.c:112
#define BIG2_BYTE_TO_ASCII(p)
Definition: xmltok.c:878
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)
Definition: xmltok.c:745
#define SB_BYTE_TYPE(enc, p)
Definition: xmltok.c:245
#define VTABLE
Definition: xmltok.c:76
#define BIG2_IS_NAME_CHAR_MINBPC(p)
Definition: xmltok.c:880
@ NO_ENC
Definition: xmltok.c:1488
@ US_ASCII_ENC
Definition: xmltok.c:1482
@ ISO_8859_1_ENC
Definition: xmltok.c:1481
@ UTF_8_ENC
Definition: xmltok.c:1483
@ UTF_16_ENC
Definition: xmltok.c:1484
@ UNKNOWN_ENC
Definition: xmltok.c:1480
@ UTF_16BE_ENC
Definition: xmltok.c:1485
@ UTF_16LE_ENC
Definition: xmltok.c:1486
#define LITTLE2_BYTE_TYPE(enc, p)
Definition: xmltok.c:740
#define BT_COLON
#define UTF8_INVALID4(p)
Definition: xmltok.c:124
#define AS_UNKNOWN_ENCODING(enc)
Definition: xmltok.c:1322
#define UCS2_GET_NAMING(pages, hi, lo)
Definition: xmltok.c:78
#define UTF8_GET_NAMING2(pages, byte)
Definition: xmltok.c:85
int FASTCALL XmlUtf16Encode(int charNum, unsigned short *buf)
Definition: xmltok.c:1298
#define UTF8_INVALID3(p)
Definition: xmltok.c:115
#define DEFINE_UTF16_TO_UTF16(E)
Definition: xmltok.c:697
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)
Definition: xmltok.c:882
#define AS_NORMAL_ENCODING(enc)
Definition: xmltok.c:206
#define LITTLE2_CHAR_MATCHES(p, c)
Definition: xmltok.c:744
int FASTCALL XmlUtf8Encode(int c, char *buf)
Definition: xmltok.c:1262
#define XML_CONTENT_STATE
Definition: xmltok.h:129
#define XmlUtf8Convert(enc, fromP, fromLim, toP, toLim)
Definition: xmltok.h:273
#define XML_UTF8_ENCODE_MAX
Definition: xmltok.h:140
#define XML_TOK_PARTIAL
Definition: xmltok.h:54
#define XmlNameMatchesAscii(enc, ptr1, end1, ptr2)
Definition: xmltok.h:252
int(XMLCALL * CONVERTER)(void *userData, const char *p)
Definition: xmltok.h:297
XML_Convert_Result
Definition: xmltok.h:163
@ XML_CONVERT_OUTPUT_EXHAUSTED
Definition: xmltok.h:166
@ XML_CONVERT_COMPLETED
Definition: xmltok.h:164
@ XML_CONVERT_INPUT_INCOMPLETE
Definition: xmltok.h:165
#define XML_TOK_NONE
Definition: xmltok.h:50
#define XmlTok(enc, state, ptr, end, nextTokPtr)
Definition: xmltok.h:221
#define XML_TOK_BOM
Definition: xmltok.h:77
@ BT_NONASCII
Definition: xmltok_impl.h:64