Libparserutils
codec_8859.c
Go to the documentation of this file.
1 /*
2  * This file is part of LibParserUtils.
3  * Licensed under the MIT License,
4  * http://www.opensource.org/licenses/mit-license.php
5  * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
6  */
7 
8 #include <assert.h>
9 #include <stdlib.h>
10 #include <string.h>
11 
13 
15 #include "utils/endian.h"
16 #include "utils/utils.h"
17 
19 
20 static struct {
21  uint16_t mib;
22  const char *name;
23  size_t len;
24  uint32_t *table;
25 } known_charsets[] = {
26  { 0, "ISO-8859-1", SLEN("ISO-8859-1"), t1 },
27  { 0, "ISO-8859-2", SLEN("ISO-8859-2"), t2 },
28  { 0, "ISO-8859-3", SLEN("ISO-8859-3"), t3 },
29  { 0, "ISO-8859-4", SLEN("ISO-8859-4"), t4 },
30  { 0, "ISO-8859-5", SLEN("ISO-8859-5"), t5 },
31  { 0, "ISO-8859-6", SLEN("ISO-8859-6"), t6 },
32  { 0, "ISO-8859-7", SLEN("ISO-8859-7"), t7 },
33  { 0, "ISO-8859-8", SLEN("ISO-8859-8"), t8 },
34  { 0, "ISO-8859-9", SLEN("ISO-8859-9"), t9 },
35  { 0, "ISO-8859-10", SLEN("ISO-8859-10"), t10 },
36  { 0, "ISO-8859-11", SLEN("ISO-8859-11"), t11 },
37  { 0, "ISO-8859-13", SLEN("ISO-8859-13"), t13 },
38  { 0, "ISO-8859-14", SLEN("ISO-8859-14"), t14 },
39  { 0, "ISO-8859-15", SLEN("ISO-8859-15"), t15 },
40  { 0, "ISO-8859-16", SLEN("ISO-8859-16"), t16 }
41 };
42 
46 typedef struct charset_8859_codec {
49  uint32_t *table;
51 #define READ_BUFSIZE (8)
52  uint32_t read_buf[READ_BUFSIZE];
55  size_t read_len;
57 #define WRITE_BUFSIZE (8)
61  size_t write_len;
64 
65 static bool charset_8859_codec_handles_charset(const char *charset);
66 static parserutils_error charset_8859_codec_create(const char *charset,
72  const uint8_t **source, size_t *sourcelen,
73  uint8_t **dest, size_t *destlen);
76  const uint8_t **source, size_t *sourcelen,
77  uint8_t **dest, size_t *destlen);
82  const uint8_t **source, size_t *sourcelen,
83  uint8_t **dest, size_t *destlen);
86  uint32_t ucs4, uint8_t **dest, size_t *destlen);
88  uint32_t ucs4, uint8_t **s, size_t *len);
90  const uint8_t *s, size_t len, uint32_t *ucs4);
91 
98 bool charset_8859_codec_handles_charset(const char *charset)
99 {
100  uint32_t i;
101  uint16_t match = parserutils_charset_mibenum_from_name(charset,
102  strlen(charset));
103 
104  if (known_charsets[0].mib == 0) {
105  for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
106  known_charsets[i].mib =
108  known_charsets[i].name,
109  known_charsets[i].len);
110  }
111  }
112 
113  for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
114  if (known_charsets[i].mib == match)
115  return true;
116  }
117 
118  return false;
119 }
120 
132 {
133  uint32_t i;
135  uint16_t match = parserutils_charset_mibenum_from_name(
136  charset, strlen(charset));
137  uint32_t *table = NULL;
138 
139  for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
140  if (known_charsets[i].mib == match) {
141  table = known_charsets[i].table;
142  break;
143  }
144  }
145 
146  assert(table != NULL);
147 
148  c = malloc(sizeof(charset_8859_codec));
149  if (c == NULL)
150  return PARSERUTILS_NOMEM;
151 
152  c->table = table;
153 
154  c->read_buf[0] = 0;
155  c->read_len = 0;
156 
157  c->write_buf[0] = 0;
158  c->write_len = 0;
159 
160  /* Finally, populate vtable */
165 
166  *codec = (parserutils_charset_codec *) c;
167 
168  return PARSERUTILS_OK;
169 }
170 
178 {
179  UNUSED(codec);
180 
181  return PARSERUTILS_OK;
182 }
183 
212  const uint8_t **source, size_t *sourcelen,
213  uint8_t **dest, size_t *destlen)
214 {
215  charset_8859_codec *c = (charset_8859_codec *) codec;
216  uint32_t ucs4;
217  uint32_t *towrite;
218  size_t towritelen;
219  parserutils_error error;
220 
221  /* Process any outstanding characters from the previous call */
222  if (c->write_len > 0) {
223  uint32_t *pwrite = c->write_buf;
224 
225  while (c->write_len > 0) {
226  error = charset_8859_from_ucs4(c, pwrite[0],
227  dest, destlen);
228  if (error != PARSERUTILS_OK) {
229  uint32_t len;
230  assert(error == PARSERUTILS_NOMEM);
231 
232  for (len = 0; len < c->write_len; len++) {
233  c->write_buf[len] = pwrite[len];
234  }
235 
236  return error;
237  }
238 
239  pwrite++;
240  c->write_len--;
241  }
242  }
243 
244  /* Now process the characters for this call */
245  while (*sourcelen > 0) {
246  ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
247  towrite = &ucs4;
248  towritelen = 1;
249 
250  /* Output current characters */
251  while (towritelen > 0) {
252  error = charset_8859_from_ucs4(c, towrite[0], dest,
253  destlen);
254  if (error != PARSERUTILS_OK) {
255  uint32_t len;
256  if (error != PARSERUTILS_NOMEM) {
257  return error;
258  }
259 
260  /* Insufficient output space */
261  assert(towritelen < WRITE_BUFSIZE);
262 
263  c->write_len = towritelen;
264 
265  /* Copy pending chars to save area, for
266  * processing next call. */
267  for (len = 0; len < towritelen; len++)
268  c->write_buf[len] = towrite[len];
269 
270  /* Claim character we've just buffered,
271  * so it's not reprocessed */
272  *source += 4;
273  *sourcelen -= 4;
274 
275  return PARSERUTILS_NOMEM;
276  }
277 
278  towrite++;
279  towritelen--;
280  }
281 
282  *source += 4;
283  *sourcelen -= 4;
284  }
285 
286  return PARSERUTILS_OK;
287 }
288 
331  const uint8_t **source, size_t *sourcelen,
332  uint8_t **dest, size_t *destlen)
333 {
334  charset_8859_codec *c = (charset_8859_codec *) codec;
335  parserutils_error error;
336 
337  if (c->read_len > 0) {
338  /* Output left over from last decode */
339  uint32_t *pread = c->read_buf;
340 
341  while (c->read_len > 0 && *destlen >= c->read_len * 4) {
342  *((uint32_t *) (void *) *dest) =
343  endian_host_to_big(pread[0]);
344 
345  *dest += 4;
346  *destlen -= 4;
347 
348  pread++;
349  c->read_len--;
350  }
351 
352  if (*destlen < c->read_len * 4) {
353  /* Ran out of output buffer */
354  size_t i;
355 
356  /* Shuffle remaining output down */
357  for (i = 0; i < c->read_len; i++)
358  c->read_buf[i] = pread[i];
359 
360  return PARSERUTILS_NOMEM;
361  }
362  }
363 
364  /* Finally, the "normal" case; process all outstanding characters */
365  while (*sourcelen > 0) {
367  source, sourcelen, dest, destlen);
368  if (error != PARSERUTILS_OK) {
369  return error;
370  }
371  }
372 
373  return PARSERUTILS_OK;
374 }
375 
383 {
384  charset_8859_codec *c = (charset_8859_codec *) codec;
385 
386  c->read_buf[0] = 0;
387  c->read_len = 0;
388 
389  c->write_buf[0] = 0;
390  c->write_len = 0;
391 
392  return PARSERUTILS_OK;
393 }
394 
395 
425  const uint8_t **source, size_t *sourcelen,
426  uint8_t **dest, size_t *destlen)
427 {
428  uint32_t ucs4;
429  parserutils_error error;
430 
431  /* Convert a single character */
432  error = charset_8859_to_ucs4(c, *source, *sourcelen, &ucs4);
433  if (error == PARSERUTILS_OK) {
434  /* Read a character */
436  ucs4, dest, destlen);
437  if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
438  /* output succeeded; update source pointers */
439  *source += 1;
440  *sourcelen -= 1;
441  }
442 
443  return error;
444  } else if (error == PARSERUTILS_NEEDDATA) {
445  /* Can only happen if sourcelen == 0 */
446  return error;
447  } else if (error == PARSERUTILS_INVALID) {
448  /* Illegal input sequence */
449 
450  /* Strict errormode; simply flag invalid character */
451  if (c->base.errormode ==
453  return PARSERUTILS_INVALID;
454  }
455 
456  /* output U+FFFD and continue processing. */
458  0xFFFD, dest, destlen);
459  if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
460  /* output succeeded; update source pointers */
461  *source += 1;
462  *sourcelen -= 1;
463  }
464 
465  return error;
466  }
467 
468  return PARSERUTILS_OK;
469 }
470 
482  uint32_t ucs4, uint8_t **dest, size_t *destlen)
483 {
484  if (*destlen < 4) {
485  /* Run out of output buffer */
486  c->read_len = 1;
487  c->read_buf[0] = ucs4;
488 
489  return PARSERUTILS_NOMEM;
490  }
491 
492  *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
493  *dest += 4;
494  *destlen -= 4;
495 
496  return PARSERUTILS_OK;
497 }
498 
516  uint32_t ucs4, uint8_t **s, size_t *len)
517 {
518  uint8_t out = 0;
519 
520  if (*len < 1)
521  return PARSERUTILS_NOMEM;
522 
523  if (ucs4 < 0x80) {
524  /* ASCII */
525  out = ucs4;
526  } else {
527  uint32_t i;
528 
529  for (i = 0; i < 96; i++) {
530  if (ucs4 == c->table[i])
531  break;
532  }
533 
534  if (i == 96) {
535  if (c->base.errormode ==
537  return PARSERUTILS_INVALID;
538  else
539  out = '?';
540  } else {
541  out = 0xA0 + i;
542  }
543  }
544 
545  *(*s) = out;
546  (*s)++;
547  (*len)--;
548 
549  return PARSERUTILS_OK;
550 }
551 
564  const uint8_t *s, size_t len, uint32_t *ucs4)
565 {
566  uint32_t out;
567 
568  if (len < 1)
569  return PARSERUTILS_NEEDDATA;
570 
571  if (*s < 0x80) {
572  out = *s;
573  } else if (*s >= 0xA0) {
574  if (c->table[*s - 0xA0] == 0xFFFF)
575  return PARSERUTILS_INVALID;
576 
577  out = c->table[*s - 0xA0];
578  } else {
579  return PARSERUTILS_INVALID;
580  }
581 
582  *ucs4 = out;
583 
584  return PARSERUTILS_OK;
585 }
586 
590 };
591 
Codec factory component definition.
Definition: codec_impl.h:39
#define SLEN(s)
Definition: utils.h:21
static struct @1 known_charsets[]
static uint32_t t16[96]
Definition: 8859_tables.h:226
uint32_t read_buf[READ_BUFSIZE]
Buffer for partial output sequences (decode) (host-endian)
Definition: codec_8859.c:52
static uint32_t t7[96]
Definition: 8859_tables.h:106
uint32_t write_buf[WRITE_BUFSIZE]
Buffer for partial output sequences (encode) (host-endian)
Definition: codec_8859.c:58
static uint32_t t11[96]
Definition: 8859_tables.h:166
static uint32_t t4[96]
Definition: 8859_tables.h:61
static uint32_t t8[96]
Definition: 8859_tables.h:121
static uint32_t t14[96]
Definition: 8859_tables.h:196
static parserutils_error charset_8859_codec_destroy(parserutils_charset_codec *codec)
Destroy an ISO-8859-n codec.
Definition: codec_8859.c:177
const char * name
Definition: codec_8859.c:22
static uint32_t endian_host_to_big(uint32_t host)
Definition: endian.h:24
parserutils_error(* destroy)(parserutils_charset_codec *codec)
Definition: codec_impl.h:25
static uint32_t t1[96]
Definition: 8859_tables.h:16
ISO-8859-n charset codec.
Definition: codec_8859.c:46
#define UNUSED(x)
Definition: utils.h:25
size_t write_len
Character length of write_buf.
Definition: codec_8859.c:61
static uint32_t t3[96]
Definition: 8859_tables.h:46
static parserutils_error charset_8859_to_ucs4(charset_8859_codec *c, const uint8_t *s, size_t len, uint32_t *ucs4)
Convert an ISO-8859-n character to UCS4 (host endian)
Definition: codec_8859.c:563
parserutils_error
Definition: errors.h:18
uint32_t * table
Mapping table for 0xA0-0xFF.
Definition: codec_8859.c:49
static uint32_t t9[96]
Definition: 8859_tables.h:136
parserutils_error(* decode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition: codec_impl.h:29
static parserutils_error charset_8859_from_ucs4(charset_8859_codec *c, uint32_t ucs4, uint8_t **s, size_t *len)
Convert a UCS4 (host endian) character to ISO-8859-n.
Definition: codec_8859.c:515
static uint32_t t6[96]
Definition: 8859_tables.h:91
size_t len
Definition: codec_8859.c:23
size_t read_len
Character length of read_buf.
Definition: codec_8859.c:55
#define N_ELEMENTS(s)
Definition: utils.h:29
static uint32_t t2[96]
Definition: 8859_tables.h:31
static parserutils_error charset_8859_codec_reset(parserutils_charset_codec *codec)
Clear an ISO-8859-n codec's encoding state.
Definition: codec_8859.c:382
parserutils_charset_codec_errormode errormode
error mode
Definition: codec_impl.h:22
const parserutils_charset_handler charset_8859_codec_handler
Definition: codec_8859.c:587
static parserutils_error charset_8859_codec_decode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Decode a chunk of ISO-8859-n data into UCS-4 (big endian)
Definition: codec_8859.c:330
#define READ_BUFSIZE
Definition: codec_8859.c:51
static uint32_t endian_big_to_host(uint32_t big)
Definition: endian.h:32
static uint32_t t13[96]
Definition: 8859_tables.h:181
parserutils_charset_codec base
Base class.
Definition: codec_8859.c:47
static uint32_t t10[96]
Definition: 8859_tables.h:151
static bool charset_8859_codec_handles_charset(const char *charset)
Determine whether this codec handles a specific charset.
Definition: codec_8859.c:98
struct parserutils_charset_codec::@3 handler
Vtable for handler code.
static uint32_t t5[96]
Definition: 8859_tables.h:76
parserutils_error(* encode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition: codec_impl.h:26
Abort processing if unrepresentable character encountered.
Definition: codec.h:64
Core charset codec definition; implementations extend this.
Definition: codec_impl.h:19
uint16_t mib
Definition: codec_8859.c:21
static parserutils_error charset_8859_codec_create(const char *charset, parserutils_charset_codec **codec)
Create an ISO-8859-n codec.
Definition: codec_8859.c:130
parserutils_error(* reset)(parserutils_charset_codec *codec)
Definition: codec_impl.h:32
#define WRITE_BUFSIZE
Definition: codec_8859.c:57
static parserutils_error charset_8859_codec_encode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Encode a chunk of UCS-4 (big endian) data into ISO-8859-n.
Definition: codec_8859.c:211
static parserutils_error charset_8859_codec_output_decoded_char(charset_8859_codec *c, uint32_t ucs4, uint8_t **dest, size_t *destlen)
Output a UCS-4 character (big endian)
Definition: codec_8859.c:481
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
Retrieve the MIB enum value assigned to an encoding name.
Definition: aliases.c:107
static uint32_t t15[96]
Definition: 8859_tables.h:211
static parserutils_error charset_8859_codec_read_char(charset_8859_codec *c, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Read a character from the ISO-8859-n to UCS-4 (big endian)
Definition: codec_8859.c:424
struct charset_8859_codec charset_8859_codec
ISO-8859-n charset codec.
uint32_t * table
Definition: codec_8859.c:24