Leptonica 1.82.0
Image processing and image analysis suite
encoding.c
1/*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 - This software is distributed in the hope that it will be
4 - useful, but with NO WARRANTY OF ANY KIND.
5 - No author or distributor accepts responsibility to anyone for the
6 - consequences of using this software, or for whether it serves any
7 - particular purpose or works at all, unless he or she says so in
8 - writing. Everyone is granted permission to copy, modify and
9 - redistribute this source code, for commercial or non-commercial
10 - purposes, with the following restrictions: (1) the origin of this
11 - source code must not be misrepresented; (2) modified versions must
12 - be plainly marked as such; and (3) this notice may not be removed
13 - or altered from any source or modified source distribution.
14 *====================================================================*/
15
16/*
17 * encoding.c
18 *
19 * Base64
20 * char *encodeBase64()
21 * l_uint8 *decodeBase64()
22 * static l_int32 isBase64()
23 * static l_int32 *genReverseTab64()
24 * static void byteConvert3to4()
25 * static void byteConvert4to3()
26 *
27 * Ascii85
28 * char *encodeAscii85()
29 * l_uint8 *decodeAscii85()
30 * static l_int32 convertChunkToAscii85()
31 *
32 * char *encodeAscii85WithComp()
33 * l_uint8 *decodeAscii85WithComp()
34 *
35 * String reformatting for base 64 encoded data
36 * char *reformatPacked64()
37 *
38 * Base64 encoding is useful for encoding binary data in a restricted set of
39 * 64 printable ascii symbols, that includes the 62 alphanumerics and '+'
40 * and '/'. Notably it does not include quotes, so that base64 encoded
41 * strings can be used in situations where quotes are used for formatting.
42 * 64 symbols was chosen because it is the smallest number that can be used
43 * in 4-for-3 byte encoding of binary data:
44 * log2(64) / log2(256) = 0.75 = 3/4
45 *
46 * Ascii85 encoding is used in PostScript and some pdf files for
47 * representing binary data (for example, a compressed image) in printable
48 * ascii symbols. It has a dictionary of 85 symbols; 85 was chosen because
49 * it is the smallest number that can be used in 5-for-4 byte encoding
50 * of binary data (256 possible input values). This can be seen from
51 * the max information content in such a sequence:
52 * log2(84) / log2(256) = 0.799 < 4/5
53 * log2(85) / log2(256) = 0.801 > 4/5
54 */
55
56#ifdef HAVE_CONFIG_H
57#include <config_auto.h>
58#endif /* HAVE_CONFIG_H */
59
60#include <ctype.h>
61#include <string.h>
62#include "allheaders.h"
63
64 /* Base64 encoding table in string representation */
65static const l_int32 MAX_BASE64_LINE = 72; /* max line length base64 */
66static const char *tablechar64 =
67 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
68 "abcdefghijklmnopqrstuvwxyz"
69 "0123456789+/";
70
71static l_int32 isBase64(char);
72static l_int32 *genReverseTab64(void);
73static void byteConvert3to4(l_uint8 *in3, l_uint8 *out4);
74static void byteConvert4to3(l_uint8 *in4, l_uint8 *out3);
75
76 /* Ascii85 encoding */
77static const l_int32 MAX_ASCII85_LINE = 64; /* max line length ascii85 */
78static const l_uint32 power85[5] = {1,
79 85,
80 85 * 85,
81 85 * 85 * 85,
82 85 * 85 * 85 * 85};
83
84static l_int32 convertChunkToAscii85(const l_uint8 *inarray, size_t insize,
85 l_int32 *pindex, char *outbuf,
86 l_int32 *pnbout);
87
88/*-------------------------------------------------------------*
89 * Utility for encoding and decoding data with base64 *
90 *-------------------------------------------------------------*/
106char *
107encodeBase64(const l_uint8 *inarray,
108 l_int32 insize,
109 l_int32 *poutsize)
110{
111char *chara;
112const l_uint8 *bytea;
113l_uint8 array3[3], array4[4];
114l_int32 outsize, i, j, index, linecount;
115
116 PROCNAME("encodeBase64");
117
118 if (!poutsize)
119 return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
120 *poutsize = 0;
121 if (!inarray)
122 return (char *)ERROR_PTR("inarray not defined", procName, NULL);
123 if (insize <= 0)
124 return (char *)ERROR_PTR("insize not > 0", procName, NULL);
125
126 /* The output array is padded to a multiple of 4 bytes, not
127 * counting the newlines. We just need to allocate a large
128 * enough array, and add 4 bytes to make sure it is big enough. */
129 outsize = 4 * ((insize + 2) / 3); /* without newlines */
130 outsize += outsize / MAX_BASE64_LINE + 4; /* with the newlines */
131 if ((chara = (char *)LEPT_CALLOC(outsize, sizeof(char))) == NULL)
132 return (char *)ERROR_PTR("chara not made", procName, NULL);
133
134 /* Read all the input data, and convert in sets of 3 input
135 * bytes --> 4 output bytes. */
136 i = index = linecount = 0;
137 bytea = inarray;
138 while (insize--) {
139 if (linecount == MAX_BASE64_LINE) {
140 chara[index++] = '\n';
141 linecount = 0;
142 }
143 array3[i++] = *bytea++;
144 if (i == 3) { /* convert 3 to 4 and save */
145 byteConvert3to4(array3, array4);
146 for (j = 0; j < 4; j++)
147 chara[index++] = tablechar64[array4[j]];
148 i = 0;
149 linecount += 4;
150 }
151 }
152
153 /* Suppose 1 or 2 bytes has been read but not yet processed.
154 * If 1 byte has been read, this will generate 2 bytes of
155 * output, with 6 bits to the first byte and 2 bits to the second.
156 * We will add two bytes of '=' for padding.
157 * If 2 bytes has been read, this will generate 3 bytes of output,
158 * with 6 bits to the first 2 bytes and 4 bits to the third, and
159 * we add a fourth padding byte ('='). */
160 if (i > 0) { /* left-over 1 or 2 input bytes */
161 for (j = i; j < 3; j++)
162 array3[j] = '\0'; /* zero the remaining input bytes */
163 byteConvert3to4(array3, array4);
164 for (j = 0; j <= i; j++)
165 chara[index++] = tablechar64[array4[j]];
166 for (j = i + 1; j < 4; j++)
167 chara[index++] = '=';
168 }
169 *poutsize = index;
170
171 return chara;
172}
173
174
194l_uint8 *
195decodeBase64(const char *inarray,
196 l_int32 insize,
197 l_int32 *poutsize)
198{
199char inchar;
200l_uint8 *bytea;
201l_uint8 array3[3], array4[4];
202l_int32 *rtable64;
203l_int32 i, j, outsize, in_index, out_index;
204
205 PROCNAME("decodeBase64");
206
207 if (!poutsize)
208 return (l_uint8 *)ERROR_PTR("&outsize not defined", procName, NULL);
209 *poutsize = 0;
210 if (!inarray)
211 return (l_uint8 *)ERROR_PTR("inarray not defined", procName, NULL);
212 if (insize <= 0)
213 return (l_uint8 *)ERROR_PTR("insize not > 0", procName, NULL);
214
215 /* Validate the input data */
216 for (i = 0; i < insize; i++) {
217 inchar = inarray[i];
218 if (inchar == '\n') continue;
219 if (isBase64(inchar) == 0 && inchar != '=')
220 return (l_uint8 *)ERROR_PTR("invalid char in inarray",
221 procName, NULL);
222 }
223
224 /* The input array typically is made with a newline every
225 * MAX_BASE64_LINE input bytes. However, as a printed string, the
226 * newlines would be stripped. So when we allocate the output
227 * array, assume the input array is all data, but strip
228 * out the newlines during decoding. This guarantees that
229 * the allocated array is large enough. */
230 outsize = 3 * ((insize + 3) / 4) + 4;
231 if ((bytea = (l_uint8 *)LEPT_CALLOC(outsize, sizeof(l_uint8))) == NULL)
232 return (l_uint8 *)ERROR_PTR("bytea not made", procName, NULL);
233
234 /* The number of encoded input data bytes is always a multiple of 4.
235 * Read all the data, until you reach either the end or
236 * the first pad character '='. The data is processed in
237 * units of 4 input bytes, generating 3 output decoded bytes
238 * of binary data. Newlines are ignored. If there are no
239 * pad bytes, i == 0 at the end of this section. */
240 rtable64 = genReverseTab64();
241 i = in_index = out_index = 0;
242 for (in_index = 0; in_index < insize; in_index++) {
243 inchar = inarray[in_index];
244 if (inchar == '\n') continue;
245 if (inchar == '=') break;
246 array4[i++] = rtable64[(unsigned char)inchar];
247 if (i < 4) {
248 continue;
249 } else { /* i == 4; convert 4 to 3 and save */
250 byteConvert4to3(array4, array3);
251 for (j = 0; j < 3; j++)
252 bytea[out_index++] = array3[j];
253 i = 0;
254 }
255 }
256
257 /* If i > 0, we ran into pad bytes ('='). If i == 2, there are
258 * two input pad bytes and one output data byte. If i == 3,
259 * there is one input pad byte and two output data bytes. */
260 if (i > 0) {
261 for (j = i; j < 4; j++)
262 array4[j] = '\0'; /* zero the remaining input bytes */
263 byteConvert4to3(array4, array3);
264 for (j = 0; j < i - 1; j++)
265 bytea[out_index++] = array3[j];
266 }
267 *poutsize = out_index;
268
269 LEPT_FREE(rtable64);
270 return bytea;
271}
272
273
277static l_int32
278isBase64(char c)
279{
280 return (isalnum(((int)c)) || ((c) == '+') || ((c) == '/')) ? 1 : 0;
281}
282
286static l_int32 *
287genReverseTab64()
288{
289l_int32 i;
290l_int32 *rtable64;
291
292 rtable64 = (l_int32 *)LEPT_CALLOC(128, sizeof(l_int32));
293 for (i = 0; i < 64; i++) {
294 rtable64[(unsigned char)tablechar64[i]] = i;
295 }
296 return rtable64;
297}
298
302static void
303byteConvert3to4(l_uint8 *in3,
304 l_uint8 *out4)
305{
306 out4[0] = in3[0] >> 2;
307 out4[1] = ((in3[0] & 0x03) << 4) | (in3[1] >> 4);
308 out4[2] = ((in3[1] & 0x0f) << 2) | (in3[2] >> 6);
309 out4[3] = in3[2] & 0x3f;
310 return;
311}
312
316static void
317byteConvert4to3(l_uint8 *in4,
318 l_uint8 *out3)
319{
320 out3[0] = (in4[0] << 2) | (in4[1] >> 4);
321 out3[1] = ((in4[1] & 0x0f) << 4) | (in4[2] >> 2);
322 out3[2] = ((in4[2] & 0x03) << 6) | in4[3];
323 return;
324}
325
326
327/*-------------------------------------------------------------*
328 * Utility for encoding and decoding data with ascii85 *
329 *-------------------------------------------------------------*/
345char *
346encodeAscii85(const l_uint8 *inarray,
347 size_t insize,
348 size_t *poutsize)
349{
350char *chara;
351char outbuf[8];
352l_int32 maxsize, i, index, linecount, nbout, eof;
353size_t outindex;
354
355 PROCNAME("encodeAscii85");
356
357 if (!poutsize)
358 return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
359 *poutsize = 0;
360 if (!inarray)
361 return (char *)ERROR_PTR("inarray not defined", procName, NULL);
362 if (insize <= 0)
363 return (char *)ERROR_PTR("insize not > 0", procName, NULL);
364
365 /* Accumulate results in char array */
366 maxsize = (l_int32)(80. + (insize * 5. / 4.) *
367 (1. + 2. / MAX_ASCII85_LINE));
368 if ((chara = (char *)LEPT_CALLOC(maxsize, sizeof(char))) == NULL)
369 return (char *)ERROR_PTR("chara not made", procName, NULL);
370
371 linecount = 0;
372 index = 0;
373 outindex = 0;
374 while (1) {
375 eof = convertChunkToAscii85(inarray, insize, &index, outbuf, &nbout);
376 for (i = 0; i < nbout; i++) {
377 chara[outindex++] = outbuf[i];
378 linecount++;
379 if (linecount >= MAX_ASCII85_LINE) {
380 chara[outindex++] = '\n';
381 linecount = 0;
382 }
383 }
384 if (eof == TRUE) {
385 if (linecount != 0)
386 chara[outindex++] = '\n';
387 chara[outindex++] = '~';
388 chara[outindex++] = '>';
389 chara[outindex++] = '\n';
390 break;
391 }
392 }
393
394 *poutsize = outindex;
395 return chara;
396}
397
398
415static l_int32
416convertChunkToAscii85(const l_uint8 *inarray,
417 size_t insize,
418 l_int32 *pindex,
419 char *outbuf,
420 l_int32 *pnbout)
421{
422l_uint8 inbyte;
423l_uint32 inword, val;
424l_int32 eof, index, nread, nbout, i;
425
426 eof = FALSE;
427 index = *pindex;
428 nread = L_MIN(4, (insize - index));
429 if (insize == index + nread)
430 eof = TRUE;
431 *pindex += nread; /* save new index */
432
433 /* Read input data and save in l_uint32 */
434 inword = 0;
435 for (i = 0; i < nread; i++) {
436 inbyte = inarray[index + i];
437 inword += (l_uint32)inbyte << (8 * (3 - i));
438 }
439
440#if 0
441 lept_stderr("index = %d, nread = %d\n", index, nread);
442 lept_stderr("inword = %x\n", inword);
443 lept_stderr("eof = %d\n", eof);
444#endif
445
446 /* Special case: output 1 byte only */
447 if (inword == 0) {
448 outbuf[0] = 'z';
449 nbout = 1;
450 } else { /* output nread + 1 bytes */
451 for (i = 4; i >= 4 - nread; i--) {
452 val = inword / power85[i];
453 outbuf[4 - i] = (l_uint8)(val + '!');
454 inword -= val * power85[i];
455 }
456 nbout = nread + 1;
457 }
458 *pnbout = nbout;
459
460 return eof;
461}
462
463
480l_uint8 *
481decodeAscii85(const char *inarray,
482 size_t insize,
483 size_t *poutsize)
484{
485char inc;
486const char *pin;
487l_uint8 val;
488l_uint8 *outa;
489l_int32 maxsize, ocount, bytecount, index;
490l_uint32 oword;
491
492 PROCNAME("decodeAscii85");
493
494 if (!poutsize)
495 return (l_uint8 *)ERROR_PTR("&outsize not defined", procName, NULL);
496 *poutsize = 0;
497 if (!inarray)
498 return (l_uint8 *)ERROR_PTR("inarray not defined", procName, NULL);
499 if (insize <= 0)
500 return (l_uint8 *)ERROR_PTR("insize not > 0", procName, NULL);
501
502 /* Accumulate results in outa */
503 maxsize = (l_int32)(80. + (insize * 4. / 5.)); /* plenty big */
504 if ((outa = (l_uint8 *)LEPT_CALLOC(maxsize, sizeof(l_uint8))) == NULL)
505 return (l_uint8 *)ERROR_PTR("outa not made", procName, NULL);
506
507 pin = inarray;
508 ocount = 0; /* byte index into outa */
509 oword = 0;
510 for (index = 0, bytecount = 0; index < insize; index++, pin++) {
511 inc = *pin;
512
513 if (inc == ' ' || inc == '\t' || inc == '\n' ||
514 inc == '\f' || inc == '\r' || inc == '\v') /* ignore white space */
515 continue;
516
517 val = inc - '!';
518 if (val < 85) {
519 oword = oword * 85 + val;
520 if (bytecount < 4) {
521 bytecount++;
522 } else { /* we have all 5 input chars for the oword */
523 outa[ocount] = (oword >> 24) & 0xff;
524 outa[ocount + 1] = (oword >> 16) & 0xff;
525 outa[ocount + 2] = (oword >> 8) & 0xff;
526 outa[ocount + 3] = oword & 0xff;
527 ocount += 4;
528 bytecount = 0;
529 oword = 0;
530 }
531 } else if (inc == 'z' && bytecount == 0) {
532 outa[ocount] = 0;
533 outa[ocount + 1] = 0;
534 outa[ocount + 2] = 0;
535 outa[ocount + 3] = 0;
536 ocount += 4;
537 } else if (inc == '~') { /* end of data */
538 L_INFO(" %d extra bytes output\n", procName, bytecount - 1);
539 switch (bytecount) {
540 case 0: /* normal eof */
541 case 1: /* error */
542 break;
543 case 2: /* 1 extra byte */
544 oword = oword * power85[3] + 0xffffff;
545 outa[ocount] = (oword >> 24) & 0xff;
546 break;
547 case 3: /* 2 extra bytes */
548 oword = oword * power85[2] + 0xffff;
549 outa[ocount] = (oword >> 24) & 0xff;
550 outa[ocount + 1] = (oword >> 16) & 0xff;
551 break;
552 case 4: /* 3 extra bytes */
553 oword = oword * 85 + 0xff;
554 outa[ocount] = (oword >> 24) & 0xff;
555 outa[ocount + 1] = (oword >> 16) & 0xff;
556 outa[ocount + 2] = (oword >> 8) & 0xff;
557 break;
558 }
559 if (bytecount > 1)
560 ocount += (bytecount - 1);
561 break;
562 }
563 }
564 *poutsize = ocount;
565
566 return outa;
567}
568
569
585char *
586encodeAscii85WithComp(const l_uint8 *indata,
587 size_t insize,
588 size_t *poutsize)
589{
590char *outstr;
591size_t size1;
592l_uint8 *data1;
593
594 PROCNAME("encodeAscii85WithComp");
595
596 if (!poutsize)
597 return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
598 *poutsize = 0;
599 if (!indata)
600 return (char *)ERROR_PTR("indata not defined", procName, NULL);
601
602 if ((data1 = zlibCompress(indata, insize, &size1)) == NULL)
603 return (char *)ERROR_PTR("data1 not made", procName, NULL);
604 outstr = encodeAscii85(data1, size1, poutsize);
605 LEPT_FREE(data1);
606 return outstr;
607}
608
609
626l_uint8 *
627decodeAscii85WithComp(const char *instr,
628 size_t insize,
629 size_t *poutsize)
630{
631size_t size1;
632l_uint8 *data1, *outdata;
633
634 PROCNAME("decodeAscii85WithComp");
635
636 if (!poutsize)
637 return (l_uint8 *)ERROR_PTR("&outsize not defined", procName, NULL);
638 *poutsize = 0;
639 if (!instr)
640 return (l_uint8 *)ERROR_PTR("instr not defined", procName, NULL);
641
642 if (insize == 0) insize = strlen(instr);
643 if ((data1 = decodeAscii85(instr, insize, &size1)) == NULL)
644 return (l_uint8 *)ERROR_PTR("data1 not made", procName, NULL);
645 outdata = zlibUncompress(data1, size1, poutsize);
646 LEPT_FREE(data1);
647 return outdata;
648}
649
650
651/*-------------------------------------------------------------*
652 * String reformatting for base 64 encoded data *
653 *-------------------------------------------------------------*/
675char *
676reformatPacked64(const char *inarray,
677 l_int32 insize,
678 l_int32 leadspace,
679 l_int32 linechars,
680 l_int32 addquotes,
681 l_int32 *poutsize)
682{
683char *flata, *outa;
684l_int32 i, j, flatindex, flatsize, outindex, nlines, linewithpad, linecount;
685
686 PROCNAME("reformatPacked64");
687
688 if (!poutsize)
689 return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
690 *poutsize = 0;
691 if (!inarray)
692 return (char *)ERROR_PTR("inarray not defined", procName, NULL);
693 if (insize <= 0)
694 return (char *)ERROR_PTR("insize not > 0", procName, NULL);
695 if (leadspace < 0)
696 return (char *)ERROR_PTR("leadspace must be >= 0", procName, NULL);
697 if (linechars % 4)
698 return (char *)ERROR_PTR("linechars % 4 must be 0", procName, NULL);
699
700 /* Remove all white space */
701 if ((flata = (char *)LEPT_CALLOC(insize, sizeof(char))) == NULL)
702 return (char *)ERROR_PTR("flata not made", procName, NULL);
703 for (i = 0, flatindex = 0; i < insize; i++) {
704 if (isBase64(inarray[i]) || inarray[i] == '=')
705 flata[flatindex++] = inarray[i];
706 }
707
708 /* Generate output string */
709 flatsize = flatindex;
710 nlines = (flatsize + linechars - 1) / linechars;
711 linewithpad = leadspace + linechars + 1; /* including newline */
712 if (addquotes) linewithpad += 2;
713 if ((outa = (char *)LEPT_CALLOC((size_t)nlines * linewithpad,
714 sizeof(char))) == NULL) {
715 LEPT_FREE(flata);
716 return (char *)ERROR_PTR("outa not made", procName, NULL);
717 }
718 for (j = 0, outindex = 0; j < leadspace; j++)
719 outa[outindex++] = ' ';
720 if (addquotes) outa[outindex++] = '"';
721 for (i = 0, linecount = 0; i < flatsize; i++) {
722 if (linecount == linechars) {
723 if (addquotes) outa[outindex++] = '"';
724 outa[outindex++] = '\n';
725 for (j = 0; j < leadspace; j++)
726 outa[outindex++] = ' ';
727 if (addquotes) outa[outindex++] = '"';
728 linecount = 0;
729 }
730 outa[outindex++] = flata[i];
731 linecount++;
732 }
733 if (addquotes) outa[outindex++] = '"';
734 *poutsize = outindex;
735
736 LEPT_FREE(flata);
737 return outa;
738}
void lept_stderr(const char *fmt,...)
lept_stderr()
Definition: utils1.c:306
l_uint8 * zlibCompress(const l_uint8 *datain, size_t nin, size_t *pnout)
zlibCompress()
Definition: zlibmem.c:92
l_uint8 * zlibUncompress(const l_uint8 *datain, size_t nin, size_t *pnout)
zlibUncompress()
Definition: zlibmem.c:196