Leptonica 1.85.0
Image processing and image analysis suite
Loading...
Searching...
No Matches
encoding.c
1/*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 - This software is distributed in the hope that it will be
4 - useful, but with NO WARRANTY OF ANY KIND.
5 - No author or distributor accepts responsibility to anyone for the
6 - consequences of using this software, or for whether it serves any
7 - particular purpose or works at all, unless he or she says so in
8 - writing. Everyone is granted permission to copy, modify and
9 - redistribute this source code, for commercial or non-commercial
10 - purposes, with the following restrictions: (1) the origin of this
11 - source code must not be misrepresented; (2) modified versions must
12 - be plainly marked as such; and (3) this notice may not be removed
13 - or altered from any source or modified source distribution.
14 *====================================================================*/
15
16/*
17 * encoding.c
18 *
19 * Base64
20 * char *encodeBase64()
21 * l_uint8 *decodeBase64()
22 * static l_int32 isBase64()
23 * static l_int32 *genReverseTab64()
24 * static void byteConvert3to4()
25 * static void byteConvert4to3()
26 *
27 * Ascii85
28 * char *encodeAscii85()
29 * l_uint8 *decodeAscii85()
30 * static l_int32 convertChunkToAscii85()
31 *
32 * char *encodeAscii85WithComp()
33 * l_uint8 *decodeAscii85WithComp()
34 *
35 * String reformatting for base 64 encoded data
36 * char *reformatPacked64()
37 *
38 * Base64 encoding is useful for encoding binary data in a restricted set of
39 * 64 printable ascii symbols, that includes the 62 alphanumerics and '+'
40 * and '/'. Notably it does not include quotes, so that base64 encoded
41 * strings can be used in situations where quotes are used for formatting.
42 * 64 symbols was chosen because it is the smallest number that can be used
43 * in 4-for-3 byte encoding of binary data:
44 * log2(64) / log2(256) = 0.75 = 3/4
45 *
46 * Ascii85 encoding is used in PostScript and some pdf files for
47 * representing binary data (for example, a compressed image) in printable
48 * ascii symbols. It has a dictionary of 85 symbols; 85 was chosen because
49 * it is the smallest number that can be used in 5-for-4 byte encoding
50 * of binary data (256 possible input values). This can be seen from
51 * the max information content in such a sequence:
52 * log2(84) / log2(256) = 0.799 < 4/5
53 * log2(85) / log2(256) = 0.801 > 4/5
54 */
55
56#ifdef HAVE_CONFIG_H
57#include <config_auto.h>
58#endif /* HAVE_CONFIG_H */
59
60#include <ctype.h>
61#include <string.h>
62#include "allheaders.h"
63
64 /* Base64 encoding table in string representation */
65static const l_int32 MAX_BASE64_LINE = 72; /* max line length base64 */
66static const char *tablechar64 =
67 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
68 "abcdefghijklmnopqrstuvwxyz"
69 "0123456789+/";
70
71static l_int32 isBase64(char);
72static l_int32 *genReverseTab64(void);
73static void byteConvert3to4(l_uint8 *in3, l_uint8 *out4);
74static void byteConvert4to3(l_uint8 *in4, l_uint8 *out3);
75
76 /* Ascii85 encoding */
77static const l_int32 MAX_ASCII85_LINE = 64; /* max line length ascii85 */
78static const l_uint32 power85[5] = {1,
79 85,
80 85 * 85,
81 85 * 85 * 85,
82 85 * 85 * 85 * 85};
83
84static l_int32 convertChunkToAscii85(const l_uint8 *inarray, size_t insize,
85 l_int32 *pindex, char *outbuf,
86 l_int32 *pnbout);
87
88/*-------------------------------------------------------------*
89 * Utility for encoding and decoding data with base64 *
90 *-------------------------------------------------------------*/
106char *
107encodeBase64(const l_uint8 *inarray,
108 l_int32 insize,
109 l_int32 *poutsize)
110{
111char *chara;
112const l_uint8 *bytea;
113l_uint8 array3[3], array4[4];
114l_int32 outsize, i, j, index, linecount;
115
116 if (!poutsize)
117 return (char *)ERROR_PTR("&outsize not defined", __func__, NULL);
118 *poutsize = 0;
119 if (!inarray)
120 return (char *)ERROR_PTR("inarray not defined", __func__, NULL);
121 if (insize <= 0)
122 return (char *)ERROR_PTR("insize not > 0", __func__, NULL);
123
124 /* The output array is padded to a multiple of 4 bytes, not
125 * counting the newlines. We just need to allocate a large
126 * enough array, and add 4 bytes to make sure it is big enough. */
127 outsize = 4 * ((insize + 2) / 3); /* without newlines */
128 outsize += outsize / MAX_BASE64_LINE + 4; /* with the newlines */
129 if ((chara = (char *)LEPT_CALLOC(outsize, sizeof(char))) == NULL)
130 return (char *)ERROR_PTR("chara not made", __func__, NULL);
131
132 /* Read all the input data, and convert in sets of 3 input
133 * bytes --> 4 output bytes. */
134 i = index = linecount = 0;
135 bytea = inarray;
136 while (insize--) {
137 if (linecount == MAX_BASE64_LINE) {
138 chara[index++] = '\n';
139 linecount = 0;
140 }
141 array3[i++] = *bytea++;
142 if (i == 3) { /* convert 3 to 4 and save */
143 byteConvert3to4(array3, array4);
144 for (j = 0; j < 4; j++)
145 chara[index++] = tablechar64[array4[j]];
146 i = 0;
147 linecount += 4;
148 }
149 }
150
151 /* Suppose 1 or 2 bytes has been read but not yet processed.
152 * If 1 byte has been read, this will generate 2 bytes of
153 * output, with 6 bits to the first byte and 2 bits to the second.
154 * We will add two bytes of '=' for padding.
155 * If 2 bytes has been read, this will generate 3 bytes of output,
156 * with 6 bits to the first 2 bytes and 4 bits to the third, and
157 * we add a fourth padding byte ('='). */
158 if (i > 0) { /* left-over 1 or 2 input bytes */
159 for (j = i; j < 3; j++)
160 array3[j] = '\0'; /* zero the remaining input bytes */
161 byteConvert3to4(array3, array4);
162 for (j = 0; j <= i; j++)
163 chara[index++] = tablechar64[array4[j]];
164 for (j = i + 1; j < 4; j++)
165 chara[index++] = '=';
166 }
167 *poutsize = index;
168
169 return chara;
170}
171
172
192l_uint8 *
193decodeBase64(const char *inarray,
194 l_int32 insize,
195 l_int32 *poutsize)
196{
197char inchar;
198l_uint8 *bytea;
199l_uint8 array3[3], array4[4];
200l_int32 *rtable64;
201l_int32 i, j, outsize, in_index, out_index;
202
203 if (!poutsize)
204 return (l_uint8 *)ERROR_PTR("&outsize not defined", __func__, NULL);
205 *poutsize = 0;
206 if (!inarray)
207 return (l_uint8 *)ERROR_PTR("inarray not defined", __func__, NULL);
208 if (insize <= 0)
209 return (l_uint8 *)ERROR_PTR("insize not > 0", __func__, NULL);
210
211 /* Validate the input data */
212 for (i = 0; i < insize; i++) {
213 inchar = inarray[i];
214 if (inchar == '\n') continue;
215 if (isBase64(inchar) == 0 && inchar != '=')
216 return (l_uint8 *)ERROR_PTR("invalid char in inarray",
217 __func__, NULL);
218 }
219
220 /* The input array typically is made with a newline every
221 * MAX_BASE64_LINE input bytes. However, as a printed string, the
222 * newlines would be stripped. So when we allocate the output
223 * array, assume the input array is all data, but strip
224 * out the newlines during decoding. This guarantees that
225 * the allocated array is large enough. */
226 outsize = 3 * ((insize + 3) / 4) + 4;
227 if ((bytea = (l_uint8 *)LEPT_CALLOC(outsize, sizeof(l_uint8))) == NULL)
228 return (l_uint8 *)ERROR_PTR("bytea not made", __func__, NULL);
229
230 /* The number of encoded input data bytes is always a multiple of 4.
231 * Read all the data, until you reach either the end or
232 * the first pad character '='. The data is processed in
233 * units of 4 input bytes, generating 3 output decoded bytes
234 * of binary data. Newlines are ignored. If there are no
235 * pad bytes, i == 0 at the end of this section. */
236 rtable64 = genReverseTab64();
237 i = in_index = out_index = 0;
238 for (in_index = 0; in_index < insize; in_index++) {
239 inchar = inarray[in_index];
240 if (inchar == '\n') continue;
241 if (inchar == '=') break;
242 array4[i++] = rtable64[(unsigned char)inchar];
243 if (i < 4) {
244 continue;
245 } else { /* i == 4; convert 4 to 3 and save */
246 byteConvert4to3(array4, array3);
247 for (j = 0; j < 3; j++)
248 bytea[out_index++] = array3[j];
249 i = 0;
250 }
251 }
252
253 /* If i > 0, we ran into pad bytes ('='). If i == 2, there are
254 * two input pad bytes and one output data byte. If i == 3,
255 * there is one input pad byte and two output data bytes. */
256 if (i > 0) {
257 for (j = i; j < 4; j++)
258 array4[j] = '\0'; /* zero the remaining input bytes */
259 byteConvert4to3(array4, array3);
260 for (j = 0; j < i - 1; j++)
261 bytea[out_index++] = array3[j];
262 }
263 *poutsize = out_index;
264
265 LEPT_FREE(rtable64);
266 return bytea;
267}
268
269
273static l_int32
274isBase64(char c)
275{
276 return (isalnum(((int)c)) || ((c) == '+') || ((c) == '/')) ? 1 : 0;
277}
278
282static l_int32 *
283genReverseTab64(void)
284{
285l_int32 i;
286l_int32 *rtable64;
287
288 rtable64 = (l_int32 *)LEPT_CALLOC(128, sizeof(l_int32));
289 for (i = 0; i < 64; i++) {
290 rtable64[(unsigned char)tablechar64[i]] = i;
291 }
292 return rtable64;
293}
294
298static void
299byteConvert3to4(l_uint8 *in3,
300 l_uint8 *out4)
301{
302 out4[0] = in3[0] >> 2;
303 out4[1] = ((in3[0] & 0x03) << 4) | (in3[1] >> 4);
304 out4[2] = ((in3[1] & 0x0f) << 2) | (in3[2] >> 6);
305 out4[3] = in3[2] & 0x3f;
306 return;
307}
308
312static void
313byteConvert4to3(l_uint8 *in4,
314 l_uint8 *out3)
315{
316 out3[0] = (in4[0] << 2) | (in4[1] >> 4);
317 out3[1] = ((in4[1] & 0x0f) << 4) | (in4[2] >> 2);
318 out3[2] = ((in4[2] & 0x03) << 6) | in4[3];
319 return;
320}
321
322
323/*-------------------------------------------------------------*
324 * Utility for encoding and decoding data with ascii85 *
325 *-------------------------------------------------------------*/
341char *
342encodeAscii85(const l_uint8 *inarray,
343 size_t insize,
344 size_t *poutsize)
345{
346char *chara;
347char outbuf[8];
348l_int32 maxsize, i, index, linecount, nbout, eof;
349size_t outindex;
350
351 if (!poutsize)
352 return (char *)ERROR_PTR("&outsize not defined", __func__, NULL);
353 *poutsize = 0;
354 if (!inarray)
355 return (char *)ERROR_PTR("inarray not defined", __func__, NULL);
356 if (insize <= 0)
357 return (char *)ERROR_PTR("insize not > 0", __func__, NULL);
358
359 /* Accumulate results in char array */
360 maxsize = (l_int32)(80. + (insize * 5. / 4.) *
361 (1. + 2. / MAX_ASCII85_LINE));
362 if ((chara = (char *)LEPT_CALLOC(maxsize, sizeof(char))) == NULL)
363 return (char *)ERROR_PTR("chara not made", __func__, NULL);
364
365 linecount = 0;
366 index = 0;
367 outindex = 0;
368 while (1) {
369 eof = convertChunkToAscii85(inarray, insize, &index, outbuf, &nbout);
370 for (i = 0; i < nbout; i++) {
371 chara[outindex++] = outbuf[i];
372 linecount++;
373 if (linecount >= MAX_ASCII85_LINE) {
374 chara[outindex++] = '\n';
375 linecount = 0;
376 }
377 }
378 if (eof == TRUE) {
379 if (linecount != 0)
380 chara[outindex++] = '\n';
381 chara[outindex++] = '~';
382 chara[outindex++] = '>';
383 chara[outindex++] = '\n';
384 break;
385 }
386 }
387
388 *poutsize = outindex;
389 return chara;
390}
391
392
409static l_int32
410convertChunkToAscii85(const l_uint8 *inarray,
411 size_t insize,
412 l_int32 *pindex,
413 char *outbuf,
414 l_int32 *pnbout)
415{
416l_uint8 inbyte;
417l_uint32 inword, val;
418l_int32 eof, index, nread, nbout, i;
419
420 eof = FALSE;
421 index = *pindex;
422 nread = L_MIN(4, (insize - index));
423 if (insize == index + nread)
424 eof = TRUE;
425 *pindex += nread; /* save new index */
426
427 /* Read input data and save in l_uint32 */
428 inword = 0;
429 for (i = 0; i < nread; i++) {
430 inbyte = inarray[index + i];
431 inword += (l_uint32)inbyte << (8 * (3 - i));
432 }
433
434#if 0
435 lept_stderr("index = %d, nread = %d\n", index, nread);
436 lept_stderr("inword = %x\n", inword);
437 lept_stderr("eof = %d\n", eof);
438#endif
439
440 /* Special case: output 1 byte only */
441 if (inword == 0) {
442 outbuf[0] = 'z';
443 nbout = 1;
444 } else { /* output nread + 1 bytes */
445 for (i = 4; i >= 4 - nread; i--) {
446 val = inword / power85[i];
447 outbuf[4 - i] = (l_uint8)(val + '!');
448 inword -= val * power85[i];
449 }
450 nbout = nread + 1;
451 }
452 *pnbout = nbout;
453
454 return eof;
455}
456
457
474l_uint8 *
475decodeAscii85(const char *inarray,
476 size_t insize,
477 size_t *poutsize)
478{
479char inc;
480const char *pin;
481l_uint8 val;
482l_uint8 *outa;
483l_int32 maxsize, ocount, bytecount, index;
484l_uint32 oword;
485
486 if (!poutsize)
487 return (l_uint8 *)ERROR_PTR("&outsize not defined", __func__, NULL);
488 *poutsize = 0;
489 if (!inarray)
490 return (l_uint8 *)ERROR_PTR("inarray not defined", __func__, NULL);
491 if (insize <= 0)
492 return (l_uint8 *)ERROR_PTR("insize not > 0", __func__, NULL);
493
494 /* Accumulate results in outa */
495 maxsize = (l_int32)(80. + (insize * 4. / 5.)); /* plenty big */
496 if ((outa = (l_uint8 *)LEPT_CALLOC(maxsize, sizeof(l_uint8))) == NULL)
497 return (l_uint8 *)ERROR_PTR("outa not made", __func__, NULL);
498
499 pin = inarray;
500 ocount = 0; /* byte index into outa */
501 oword = 0;
502 for (index = 0, bytecount = 0; index < insize; index++, pin++) {
503 inc = *pin;
504
505 if (inc == ' ' || inc == '\t' || inc == '\n' ||
506 inc == '\f' || inc == '\r' || inc == '\v') /* ignore white space */
507 continue;
508
509 val = inc - '!';
510 if (val < 85) {
511 oword = oword * 85 + val;
512 if (bytecount < 4) {
513 bytecount++;
514 } else { /* we have all 5 input chars for the oword */
515 outa[ocount] = (oword >> 24) & 0xff;
516 outa[ocount + 1] = (oword >> 16) & 0xff;
517 outa[ocount + 2] = (oword >> 8) & 0xff;
518 outa[ocount + 3] = oword & 0xff;
519 ocount += 4;
520 bytecount = 0;
521 oword = 0;
522 }
523 } else if (inc == 'z' && bytecount == 0) {
524 outa[ocount] = 0;
525 outa[ocount + 1] = 0;
526 outa[ocount + 2] = 0;
527 outa[ocount + 3] = 0;
528 ocount += 4;
529 } else if (inc == '~') { /* end of data */
530 L_INFO(" %d extra bytes output\n", __func__, bytecount - 1);
531 switch (bytecount) {
532 case 0: /* normal eof */
533 case 1: /* error */
534 break;
535 case 2: /* 1 extra byte */
536 oword = oword * power85[3] + 0xffffff;
537 outa[ocount] = (oword >> 24) & 0xff;
538 break;
539 case 3: /* 2 extra bytes */
540 oword = oword * power85[2] + 0xffff;
541 outa[ocount] = (oword >> 24) & 0xff;
542 outa[ocount + 1] = (oword >> 16) & 0xff;
543 break;
544 case 4: /* 3 extra bytes */
545 oword = oword * 85 + 0xff;
546 outa[ocount] = (oword >> 24) & 0xff;
547 outa[ocount + 1] = (oword >> 16) & 0xff;
548 outa[ocount + 2] = (oword >> 8) & 0xff;
549 break;
550 }
551 if (bytecount > 1)
552 ocount += (bytecount - 1);
553 break;
554 }
555 }
556 *poutsize = ocount;
557
558 return outa;
559}
560
561
577char *
578encodeAscii85WithComp(const l_uint8 *indata,
579 size_t insize,
580 size_t *poutsize)
581{
582char *outstr;
583size_t size1;
584l_uint8 *data1;
585
586 if (!poutsize)
587 return (char *)ERROR_PTR("&outsize not defined", __func__, NULL);
588 *poutsize = 0;
589 if (!indata)
590 return (char *)ERROR_PTR("indata not defined", __func__, NULL);
591
592 if ((data1 = zlibCompress(indata, insize, &size1)) == NULL)
593 return (char *)ERROR_PTR("data1 not made", __func__, NULL);
594 outstr = encodeAscii85(data1, size1, poutsize);
595 LEPT_FREE(data1);
596 return outstr;
597}
598
599
616l_uint8 *
617decodeAscii85WithComp(const char *instr,
618 size_t insize,
619 size_t *poutsize)
620{
621size_t size1;
622l_uint8 *data1, *outdata;
623
624 if (!poutsize)
625 return (l_uint8 *)ERROR_PTR("&outsize not defined", __func__, NULL);
626 *poutsize = 0;
627 if (!instr)
628 return (l_uint8 *)ERROR_PTR("instr not defined", __func__, NULL);
629
630 if (insize == 0) insize = strlen(instr);
631 if ((data1 = decodeAscii85(instr, insize, &size1)) == NULL)
632 return (l_uint8 *)ERROR_PTR("data1 not made", __func__, NULL);
633 outdata = zlibUncompress(data1, size1, poutsize);
634 LEPT_FREE(data1);
635 return outdata;
636}
637
638
639/*-------------------------------------------------------------*
640 * String reformatting for base 64 encoded data *
641 *-------------------------------------------------------------*/
663char *
664reformatPacked64(const char *inarray,
665 l_int32 insize,
666 l_int32 leadspace,
667 l_int32 linechars,
668 l_int32 addquotes,
669 l_int32 *poutsize)
670{
671char *flata, *outa;
672l_int32 i, j, flatindex, flatsize, outindex, nlines, linewithpad, linecount;
673
674 if (!poutsize)
675 return (char *)ERROR_PTR("&outsize not defined", __func__, NULL);
676 *poutsize = 0;
677 if (!inarray)
678 return (char *)ERROR_PTR("inarray not defined", __func__, NULL);
679 if (insize <= 0)
680 return (char *)ERROR_PTR("insize not > 0", __func__, NULL);
681 if (leadspace < 0)
682 return (char *)ERROR_PTR("leadspace must be >= 0", __func__, NULL);
683 if (linechars % 4)
684 return (char *)ERROR_PTR("linechars % 4 must be 0", __func__, NULL);
685
686 /* Remove all white space */
687 if ((flata = (char *)LEPT_CALLOC(insize, sizeof(char))) == NULL)
688 return (char *)ERROR_PTR("flata not made", __func__, NULL);
689 for (i = 0, flatindex = 0; i < insize; i++) {
690 if (isBase64(inarray[i]) || inarray[i] == '=')
691 flata[flatindex++] = inarray[i];
692 }
693
694 /* Generate output string */
695 flatsize = flatindex;
696 nlines = (flatsize + linechars - 1) / linechars;
697 linewithpad = leadspace + linechars + 1; /* including newline */
698 if (addquotes) linewithpad += 2;
699 if ((outa = (char *)LEPT_CALLOC((size_t)nlines * linewithpad,
700 sizeof(char))) == NULL) {
701 LEPT_FREE(flata);
702 return (char *)ERROR_PTR("outa not made", __func__, NULL);
703 }
704 for (j = 0, outindex = 0; j < leadspace; j++)
705 outa[outindex++] = ' ';
706 if (addquotes) outa[outindex++] = '"';
707 for (i = 0, linecount = 0; i < flatsize; i++) {
708 if (linecount == linechars) {
709 if (addquotes) outa[outindex++] = '"';
710 outa[outindex++] = '\n';
711 for (j = 0; j < leadspace; j++)
712 outa[outindex++] = ' ';
713 if (addquotes) outa[outindex++] = '"';
714 linecount = 0;
715 }
716 outa[outindex++] = flata[i];
717 linecount++;
718 }
719 if (addquotes) outa[outindex++] = '"';
720 *poutsize = outindex;
721
722 LEPT_FREE(flata);
723 return outa;
724}