254a99729c
Resolves: #1404933
1229 lines
50 KiB
Diff
1229 lines
50 KiB
Diff
From 273a133110838ee5702e7eb6409a853c598211b2 Mon Sep 17 00:00:00 2001
|
|
From: Ken Sharp <ken.sharp@artifex.com>
|
|
Date: Thu, 29 Sep 2016 17:35:05 +0100
|
|
Subject: [PATCH] Remove (and re-implement) ConvertUTF.c
|
|
|
|
Bug #697122 " embedded ConvertUTF.c is buggy and licensed incompatibly with GPL/APGL"
|
|
|
|
It's not clear that this code is incompatible with GPL, nor do we think
|
|
any 'bugginess' in the code affects us, since we are using a comparatively
|
|
small part of the included code.
|
|
|
|
Nevertheless it's possible to remove the code, and re-implement the small
|
|
part we actually need, and that is done here.
|
|
|
|
Also removed the DSCEncodingToUnicode option which was insanely difficult
|
|
to use, and incorrectly documented.
|
|
|
|
This shows one difference, 692486_-_heap_overflow_in_pdf_to_ucs2.pdf
|
|
now correctly throws an error, because the PDF file contains document
|
|
information (Application) which has an invalid UTF16-BE sequence.
|
|
---
|
|
base/ConvertUTF.c | 539 -----------------------------------------
|
|
base/ConvertUTF.h | 155 ------------
|
|
base/lib.mak | 4 -
|
|
devices/devs.mak | 5 +-
|
|
devices/vector/gdevpdf.c | 16 +-
|
|
devices/vector/gdevpdfb.h | 1 -
|
|
devices/vector/gdevpdfe.c | 270 +++++++++++----------
|
|
devices/vector/gdevpdfp.c | 1 -
|
|
devices/vector/gdevpdfx.h | 17 +-
|
|
windows/ghostscript.vcproj | 8 -
|
|
windows/ghostscript_rt.vcxproj | 2 -
|
|
11 files changed, 155 insertions(+), 863 deletions(-)
|
|
delete mode 100644 base/ConvertUTF.c
|
|
delete mode 100644 base/ConvertUTF.h
|
|
|
|
diff --git a/base/ConvertUTF.c b/base/ConvertUTF.c
|
|
deleted file mode 100644
|
|
index cb2e2de..0000000
|
|
--- a/base/ConvertUTF.c
|
|
+++ /dev/null
|
|
@@ -1,539 +0,0 @@
|
|
-/*
|
|
- * Copyright 2001-2004 Unicode, Inc.
|
|
- *
|
|
- * Disclaimer
|
|
- *
|
|
- * This source code is provided as is by Unicode, Inc. No claims are
|
|
- * made as to fitness for any particular purpose. No warranties of any
|
|
- * kind are expressed or implied. The recipient agrees to determine
|
|
- * applicability of information provided. If this file has been
|
|
- * purchased on magnetic or optical media from Unicode, Inc., the
|
|
- * sole remedy for any claim will be exchange of defective media
|
|
- * within 90 days of receipt.
|
|
- *
|
|
- * Limitations on Rights to Redistribute This Code
|
|
- *
|
|
- * Unicode, Inc. hereby grants the right to freely use the information
|
|
- * supplied in this file in the creation of products supporting the
|
|
- * Unicode Standard, and to make copies of this file in any form
|
|
- * for internal or external distribution as long as this notice
|
|
- * remains attached.
|
|
- */
|
|
-
|
|
-
|
|
-/* ---------------------------------------------------------------------
|
|
-
|
|
- Conversions between UTF32, UTF-16, and UTF-8. Source code file.
|
|
- Author: Mark E. Davis, 1994.
|
|
- Rev History: Rick McGowan, fixes & updates May 2001.
|
|
- Sept 2001: fixed const & error conditions per
|
|
- mods suggested by S. Parent & A. Lillich.
|
|
- June 2002: Tim Dodd added detection and handling of incomplete
|
|
- source sequences, enhanced error detection, added casts
|
|
- to eliminate compiler warnings.
|
|
- July 2003: slight mods to back out aggressive FFFE detection.
|
|
- Jan 2004: updated switches in from-UTF8 conversions.
|
|
- Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
|
|
-
|
|
- See the header file "ConvertUTF.h" for complete documentation.
|
|
-
|
|
------------------------------------------------------------------------- */
|
|
-
|
|
-#include "ConvertUTF.h"
|
|
-#ifdef CVTUTF_DEBUG
|
|
-#include <stdio.h>
|
|
-#endif
|
|
-
|
|
-static const int halfShift = 10; /* used for shifting by 10 bits */
|
|
-
|
|
-static const UTF32 halfBase = 0x0010000UL;
|
|
-static const UTF32 halfMask = 0x3FFUL;
|
|
-
|
|
-#define UNI_SUR_HIGH_START (UTF32)0xD800
|
|
-#define UNI_SUR_HIGH_END (UTF32)0xDBFF
|
|
-#define UNI_SUR_LOW_START (UTF32)0xDC00
|
|
-#define UNI_SUR_LOW_END (UTF32)0xDFFF
|
|
-#define false 0
|
|
-#define true 1
|
|
-
|
|
-/* --------------------------------------------------------------------- */
|
|
-
|
|
-ConversionResult ConvertUTF32toUTF16 (
|
|
- const UTF32** sourceStart, const UTF32* sourceEnd,
|
|
- UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
|
|
- ConversionResult result = conversionOK;
|
|
- const UTF32* source = *sourceStart;
|
|
- UTF16* target = *targetStart;
|
|
- while (source < sourceEnd) {
|
|
- UTF32 ch;
|
|
- if (target >= targetEnd) {
|
|
- result = targetExhausted; break;
|
|
- }
|
|
- ch = *source++;
|
|
- if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
|
|
- /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
|
|
- if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
|
|
- if (flags == strictConversion) {
|
|
- --source; /* return to the illegal value itself */
|
|
- result = sourceIllegal;
|
|
- break;
|
|
- } else {
|
|
- *target++ = UNI_REPLACEMENT_CHAR;
|
|
- }
|
|
- } else {
|
|
- *target++ = (UTF16)ch; /* normal case */
|
|
- }
|
|
- } else if (ch > UNI_MAX_LEGAL_UTF32) {
|
|
- if (flags == strictConversion) {
|
|
- result = sourceIllegal;
|
|
- } else {
|
|
- *target++ = UNI_REPLACEMENT_CHAR;
|
|
- }
|
|
- } else {
|
|
- /* target is a character in range 0xFFFF - 0x10FFFF. */
|
|
- if (target + 1 >= targetEnd) {
|
|
- --source; /* Back up source pointer! */
|
|
- result = targetExhausted; break;
|
|
- }
|
|
- ch -= halfBase;
|
|
- *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
|
|
- *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
|
|
- }
|
|
- }
|
|
- *sourceStart = source;
|
|
- *targetStart = target;
|
|
- return result;
|
|
-}
|
|
-
|
|
-/* --------------------------------------------------------------------- */
|
|
-
|
|
-ConversionResult ConvertUTF16toUTF32 (
|
|
- const UTF16** sourceStart, const UTF16* sourceEnd,
|
|
- UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
|
|
- ConversionResult result = conversionOK;
|
|
- const UTF16* source = *sourceStart;
|
|
- UTF32* target = *targetStart;
|
|
- UTF32 ch, ch2;
|
|
- while (source < sourceEnd) {
|
|
- const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
|
|
- ch = *source++;
|
|
- /* If we have a surrogate pair, convert to UTF32 first. */
|
|
- if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
|
|
- /* If the 16 bits following the high surrogate are in the source buffer... */
|
|
- if (source < sourceEnd) {
|
|
- ch2 = *source;
|
|
- /* If it's a low surrogate, convert to UTF32. */
|
|
- if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
|
|
- ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
|
|
- + (ch2 - UNI_SUR_LOW_START) + halfBase;
|
|
- ++source;
|
|
- } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
|
|
- --source; /* return to the illegal value itself */
|
|
- result = sourceIllegal;
|
|
- break;
|
|
- }
|
|
- } else { /* We don't have the 16 bits following the high surrogate. */
|
|
- --source; /* return to the high surrogate */
|
|
- result = sourceExhausted;
|
|
- break;
|
|
- }
|
|
- } else if (flags == strictConversion) {
|
|
- /* UTF-16 surrogate values are illegal in UTF-32 */
|
|
- if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
|
|
- --source; /* return to the illegal value itself */
|
|
- result = sourceIllegal;
|
|
- break;
|
|
- }
|
|
- }
|
|
- if (target >= targetEnd) {
|
|
- source = oldSource; /* Back up source pointer! */
|
|
- result = targetExhausted; break;
|
|
- }
|
|
- *target++ = ch;
|
|
- }
|
|
- *sourceStart = source;
|
|
- *targetStart = target;
|
|
-#ifdef CVTUTF_DEBUG
|
|
-if (result == sourceIllegal) {
|
|
- fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
|
|
- fflush(stderr);
|
|
-}
|
|
-#endif
|
|
- return result;
|
|
-}
|
|
-
|
|
-/* --------------------------------------------------------------------- */
|
|
-
|
|
-/*
|
|
- * Index into the table below with the first byte of a UTF-8 sequence to
|
|
- * get the number of trailing bytes that are supposed to follow it.
|
|
- * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
|
|
- * left as-is for anyone who may want to do such conversion, which was
|
|
- * allowed in earlier algorithms.
|
|
- */
|
|
-static const char trailingBytesForUTF8[256] = {
|
|
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
|
|
-};
|
|
-
|
|
-/*
|
|
- * Magic values subtracted from a buffer value during UTF8 conversion.
|
|
- * This table contains as many values as there might be trailing bytes
|
|
- * in a UTF-8 sequence.
|
|
- */
|
|
-static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
|
|
- 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
|
|
-
|
|
-/*
|
|
- * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
|
|
- * into the first byte, depending on how many bytes follow. There are
|
|
- * as many entries in this table as there are UTF-8 sequence types.
|
|
- * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
|
|
- * for *legal* UTF-8 will be 4 or fewer bytes total.
|
|
- */
|
|
-static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
|
|
-
|
|
-/* --------------------------------------------------------------------- */
|
|
-
|
|
-/* The interface converts a whole buffer to avoid function-call overhead.
|
|
- * Constants have been gathered. Loops & conditionals have been removed as
|
|
- * much as possible for efficiency, in favor of drop-through switches.
|
|
- * (See "Note A" at the bottom of the file for equivalent code.)
|
|
- * If your compiler supports it, the "isLegalUTF8" call can be turned
|
|
- * into an inline function.
|
|
- */
|
|
-
|
|
-/* --------------------------------------------------------------------- */
|
|
-
|
|
-ConversionResult ConvertUTF16toUTF8 (
|
|
- const UTF16** sourceStart, const UTF16* sourceEnd,
|
|
- UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
|
|
- ConversionResult result = conversionOK;
|
|
- const UTF16* source = *sourceStart;
|
|
- UTF8* target = *targetStart;
|
|
- while (source < sourceEnd) {
|
|
- UTF32 ch;
|
|
- unsigned short bytesToWrite = 0;
|
|
- const UTF32 byteMask = 0xBF;
|
|
- const UTF32 byteMark = 0x80;
|
|
- const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
|
|
- ch = *source++;
|
|
- /* If we have a surrogate pair, convert to UTF32 first. */
|
|
- if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
|
|
- /* If the 16 bits following the high surrogate are in the source buffer... */
|
|
- if (source < sourceEnd) {
|
|
- UTF32 ch2 = *source;
|
|
- /* If it's a low surrogate, convert to UTF32. */
|
|
- if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
|
|
- ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
|
|
- + (ch2 - UNI_SUR_LOW_START) + halfBase;
|
|
- ++source;
|
|
- } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
|
|
- --source; /* return to the illegal value itself */
|
|
- result = sourceIllegal;
|
|
- break;
|
|
- }
|
|
- } else { /* We don't have the 16 bits following the high surrogate. */
|
|
- --source; /* return to the high surrogate */
|
|
- result = sourceExhausted;
|
|
- break;
|
|
- }
|
|
- } else if (flags == strictConversion) {
|
|
- /* UTF-16 surrogate values are illegal in UTF-32 */
|
|
- if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
|
|
- --source; /* return to the illegal value itself */
|
|
- result = sourceIllegal;
|
|
- break;
|
|
- }
|
|
- }
|
|
- /* Figure out how many bytes the result will require */
|
|
- if (ch < (UTF32)0x80) { bytesToWrite = 1;
|
|
- } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
|
|
- } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
|
|
- } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
|
|
- } else { bytesToWrite = 3;
|
|
- ch = UNI_REPLACEMENT_CHAR;
|
|
- }
|
|
-
|
|
- target += bytesToWrite;
|
|
- if (target > targetEnd) {
|
|
- source = oldSource; /* Back up source pointer! */
|
|
- target -= bytesToWrite; result = targetExhausted; break;
|
|
- }
|
|
- switch (bytesToWrite) { /* note: everything falls through. */
|
|
- case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
|
|
- case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
|
|
- case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
|
|
- case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
|
|
- }
|
|
- target += bytesToWrite;
|
|
- }
|
|
- *sourceStart = source;
|
|
- *targetStart = target;
|
|
- return result;
|
|
-}
|
|
-
|
|
-/* --------------------------------------------------------------------- */
|
|
-
|
|
-/*
|
|
- * Utility routine to tell whether a sequence of bytes is legal UTF-8.
|
|
- * This must be called with the length pre-determined by the first byte.
|
|
- * If not calling this from ConvertUTF8to*, then the length can be set by:
|
|
- * length = trailingBytesForUTF8[*source]+1;
|
|
- * and the sequence is illegal right away if there aren't that many bytes
|
|
- * available.
|
|
- * If presented with a length > 4, this returns false. The Unicode
|
|
- * definition of UTF-8 goes up to 4-byte sequences.
|
|
- */
|
|
-
|
|
-static Boolean isLegalUTF8(const UTF8 *source, int length) {
|
|
- UTF8 a;
|
|
- const UTF8 *srcptr = source+length;
|
|
- switch (length) {
|
|
- default: return false;
|
|
- /* Everything else falls through when "true"... */
|
|
- case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
|
|
- case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
|
|
- case 2: if ((a = (*--srcptr)) > 0xBF) return false;
|
|
-
|
|
- switch (*source) {
|
|
- /* no fall-through in this inner switch */
|
|
- case 0xE0: if (a < 0xA0) return false; break;
|
|
- case 0xED: if (a > 0x9F) return false; break;
|
|
- case 0xF0: if (a < 0x90) return false; break;
|
|
- case 0xF4: if (a > 0x8F) return false; break;
|
|
- default: if (a < 0x80) return false;
|
|
- }
|
|
-
|
|
- case 1: if (*source >= 0x80 && *source < 0xC2) return false;
|
|
- }
|
|
- if (*source > 0xF4) return false;
|
|
- return true;
|
|
-}
|
|
-
|
|
-/* --------------------------------------------------------------------- */
|
|
-
|
|
-/*
|
|
- * Exported function to return whether a UTF-8 sequence is legal or not.
|
|
- * This is not used here; it's just exported.
|
|
- */
|
|
-Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
|
|
- int length = trailingBytesForUTF8[*source]+1;
|
|
- if (source+length > sourceEnd) {
|
|
- return false;
|
|
- }
|
|
- return isLegalUTF8(source, length);
|
|
-}
|
|
-
|
|
-/* --------------------------------------------------------------------- */
|
|
-
|
|
-ConversionResult ConvertUTF8toUTF16 (
|
|
- const UTF8** sourceStart, const UTF8* sourceEnd,
|
|
- UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
|
|
- ConversionResult result = conversionOK;
|
|
- const UTF8* source = *sourceStart;
|
|
- UTF16* target = *targetStart;
|
|
- while (source < sourceEnd) {
|
|
- UTF32 ch = 0;
|
|
- unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
|
|
- if (source + extraBytesToRead >= sourceEnd) {
|
|
- result = sourceExhausted; break;
|
|
- }
|
|
- /* Do this check whether lenient or strict */
|
|
- if (! isLegalUTF8(source, extraBytesToRead+1)) {
|
|
- result = sourceIllegal;
|
|
- break;
|
|
- }
|
|
- /*
|
|
- * The cases all fall through. See "Note A" below.
|
|
- */
|
|
- switch (extraBytesToRead) {
|
|
- case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
|
|
- case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
|
|
- case 3: ch += *source++; ch <<= 6;
|
|
- case 2: ch += *source++; ch <<= 6;
|
|
- case 1: ch += *source++; ch <<= 6;
|
|
- case 0: ch += *source++;
|
|
- }
|
|
- ch -= offsetsFromUTF8[extraBytesToRead];
|
|
-
|
|
- if (target >= targetEnd) {
|
|
- source -= (extraBytesToRead+1); /* Back up source pointer! */
|
|
- result = targetExhausted; break;
|
|
- }
|
|
- if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
|
|
- /* UTF-16 surrogate values are illegal in UTF-32 */
|
|
- if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
|
|
- if (flags == strictConversion) {
|
|
- source -= (extraBytesToRead+1); /* return to the illegal value itself */
|
|
- result = sourceIllegal;
|
|
- break;
|
|
- } else {
|
|
- *target++ = UNI_REPLACEMENT_CHAR;
|
|
- }
|
|
- } else {
|
|
- *target++ = (UTF16)ch; /* normal case */
|
|
- }
|
|
- } else if (ch > UNI_MAX_UTF16) {
|
|
- if (flags == strictConversion) {
|
|
- result = sourceIllegal;
|
|
- source -= (extraBytesToRead+1); /* return to the start */
|
|
- break; /* Bail out; shouldn't continue */
|
|
- } else {
|
|
- *target++ = UNI_REPLACEMENT_CHAR;
|
|
- }
|
|
- } else {
|
|
- /* target is a character in range 0xFFFF - 0x10FFFF. */
|
|
- if (target + 1 >= targetEnd) {
|
|
- source -= (extraBytesToRead+1); /* Back up source pointer! */
|
|
- result = targetExhausted; break;
|
|
- }
|
|
- ch -= halfBase;
|
|
- *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
|
|
- *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
|
|
- }
|
|
- }
|
|
- *sourceStart = source;
|
|
- *targetStart = target;
|
|
- return result;
|
|
-}
|
|
-
|
|
-/* --------------------------------------------------------------------- */
|
|
-
|
|
-ConversionResult ConvertUTF32toUTF8 (
|
|
- const UTF32** sourceStart, const UTF32* sourceEnd,
|
|
- UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
|
|
- ConversionResult result = conversionOK;
|
|
- const UTF32* source = *sourceStart;
|
|
- UTF8* target = *targetStart;
|
|
- while (source < sourceEnd) {
|
|
- UTF32 ch;
|
|
- unsigned short bytesToWrite = 0;
|
|
- const UTF32 byteMask = 0xBF;
|
|
- const UTF32 byteMark = 0x80;
|
|
- ch = *source++;
|
|
- if (flags == strictConversion ) {
|
|
- /* UTF-16 surrogate values are illegal in UTF-32 */
|
|
- if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
|
|
- --source; /* return to the illegal value itself */
|
|
- result = sourceIllegal;
|
|
- break;
|
|
- }
|
|
- }
|
|
- /*
|
|
- * Figure out how many bytes the result will require. Turn any
|
|
- * illegally large UTF32 things (> Plane 17) into replacement chars.
|
|
- */
|
|
- if (ch < (UTF32)0x80) { bytesToWrite = 1;
|
|
- } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
|
|
- } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
|
|
- } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
|
|
- } else { bytesToWrite = 3;
|
|
- ch = UNI_REPLACEMENT_CHAR;
|
|
- result = sourceIllegal;
|
|
- }
|
|
-
|
|
- target += bytesToWrite;
|
|
- if (target > targetEnd) {
|
|
- --source; /* Back up source pointer! */
|
|
- target -= bytesToWrite; result = targetExhausted; break;
|
|
- }
|
|
- switch (bytesToWrite) { /* note: everything falls through. */
|
|
- case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
|
|
- case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
|
|
- case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
|
|
- case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
|
|
- }
|
|
- target += bytesToWrite;
|
|
- }
|
|
- *sourceStart = source;
|
|
- *targetStart = target;
|
|
- return result;
|
|
-}
|
|
-
|
|
-/* --------------------------------------------------------------------- */
|
|
-
|
|
-ConversionResult ConvertUTF8toUTF32 (
|
|
- const UTF8** sourceStart, const UTF8* sourceEnd,
|
|
- UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
|
|
- ConversionResult result = conversionOK;
|
|
- const UTF8* source = *sourceStart;
|
|
- UTF32* target = *targetStart;
|
|
- while (source < sourceEnd) {
|
|
- UTF32 ch = 0;
|
|
- unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
|
|
- if (source + extraBytesToRead >= sourceEnd) {
|
|
- result = sourceExhausted; break;
|
|
- }
|
|
- /* Do this check whether lenient or strict */
|
|
- if (! isLegalUTF8(source, extraBytesToRead+1)) {
|
|
- result = sourceIllegal;
|
|
- break;
|
|
- }
|
|
- /*
|
|
- * The cases all fall through. See "Note A" below.
|
|
- */
|
|
- switch (extraBytesToRead) {
|
|
- case 5: ch += *source++; ch <<= 6;
|
|
- case 4: ch += *source++; ch <<= 6;
|
|
- case 3: ch += *source++; ch <<= 6;
|
|
- case 2: ch += *source++; ch <<= 6;
|
|
- case 1: ch += *source++; ch <<= 6;
|
|
- case 0: ch += *source++;
|
|
- }
|
|
- ch -= offsetsFromUTF8[extraBytesToRead];
|
|
-
|
|
- if (target >= targetEnd) {
|
|
- source -= (extraBytesToRead+1); /* Back up the source pointer! */
|
|
- result = targetExhausted; break;
|
|
- }
|
|
- if (ch <= UNI_MAX_LEGAL_UTF32) {
|
|
- /*
|
|
- * UTF-16 surrogate values are illegal in UTF-32, and anything
|
|
- * over Plane 17 (> 0x10FFFF) is illegal.
|
|
- */
|
|
- if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
|
|
- if (flags == strictConversion) {
|
|
- source -= (extraBytesToRead+1); /* return to the illegal value itself */
|
|
- result = sourceIllegal;
|
|
- break;
|
|
- } else {
|
|
- *target++ = UNI_REPLACEMENT_CHAR;
|
|
- }
|
|
- } else {
|
|
- *target++ = ch;
|
|
- }
|
|
- } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
|
|
- result = sourceIllegal;
|
|
- *target++ = UNI_REPLACEMENT_CHAR;
|
|
- }
|
|
- }
|
|
- *sourceStart = source;
|
|
- *targetStart = target;
|
|
- return result;
|
|
-}
|
|
-
|
|
-/* ---------------------------------------------------------------------
|
|
-
|
|
- Note A.
|
|
- The fall-through switches in UTF-8 reading code save a
|
|
- temp variable, some decrements & conditionals. The switches
|
|
- are equivalent to the following loop:
|
|
- {
|
|
- int tmpBytesToRead = extraBytesToRead+1;
|
|
- do {
|
|
- ch += *source++;
|
|
- --tmpBytesToRead;
|
|
- if (tmpBytesToRead) ch <<= 6;
|
|
- } while (tmpBytesToRead > 0);
|
|
- }
|
|
- In UTF-8 writing code, the switches on "bytesToWrite" are
|
|
- similarly unrolled loops.
|
|
-
|
|
- --------------------------------------------------------------------- */
|
|
diff --git a/base/ConvertUTF.h b/base/ConvertUTF.h
|
|
deleted file mode 100644
|
|
index 538bec6..0000000
|
|
--- a/base/ConvertUTF.h
|
|
+++ /dev/null
|
|
@@ -1,155 +0,0 @@
|
|
-/*
|
|
- * Copyright 2001-2004 Unicode, Inc.
|
|
- *
|
|
- * Disclaimer
|
|
- *
|
|
- * This source code is provided as is by Unicode, Inc. No claims are
|
|
- * made as to fitness for any particular purpose. No warranties of any
|
|
- * kind are expressed or implied. The recipient agrees to determine
|
|
- * applicability of information provided. If this file has been
|
|
- * purchased on magnetic or optical media from Unicode, Inc., the
|
|
- * sole remedy for any claim will be exchange of defective media
|
|
- * within 90 days of receipt.
|
|
- *
|
|
- * Limitations on Rights to Redistribute This Code
|
|
- *
|
|
- * Unicode, Inc. hereby grants the right to freely use the information
|
|
- * supplied in this file in the creation of products supporting the
|
|
- * Unicode Standard, and to make copies of this file in any form
|
|
- * for internal or external distribution as long as this notice
|
|
- * remains attached.
|
|
- */
|
|
-
|
|
-
|
|
-#ifndef ConvertUTF_INCLUDED
|
|
-#define ConvertUTF_INCLUDED
|
|
-
|
|
-/* ---------------------------------------------------------------------
|
|
-
|
|
- Conversions between UTF32, UTF-16, and UTF-8. Header file.
|
|
-
|
|
- Several funtions are included here, forming a complete set of
|
|
- conversions between the three formats. UTF-7 is not included
|
|
- here, but is handled in a separate source file.
|
|
-
|
|
- Each of these routines takes pointers to input buffers and output
|
|
- buffers. The input buffers are const.
|
|
-
|
|
- Each routine converts the text between *sourceStart and sourceEnd,
|
|
- putting the result into the buffer between *targetStart and
|
|
- targetEnd. Note: the end pointers are *after* the last item: e.g.
|
|
- *(sourceEnd - 1) is the last item.
|
|
-
|
|
- The return result indicates whether the conversion was successful,
|
|
- and if not, whether the problem was in the source or target buffers.
|
|
- (Only the first encountered problem is indicated.)
|
|
-
|
|
- After the conversion, *sourceStart and *targetStart are both
|
|
- updated to point to the end of last text successfully converted in
|
|
- the respective buffers.
|
|
-
|
|
- Input parameters:
|
|
- sourceStart - pointer to a pointer to the source buffer.
|
|
- The contents of this are modified on return so that
|
|
- it points at the next thing to be converted.
|
|
- targetStart - similarly, pointer to pointer to the target buffer.
|
|
- sourceEnd, targetEnd - respectively pointers to the ends of the
|
|
- two buffers, for overflow checking only.
|
|
-
|
|
- These conversion functions take a ConversionFlags argument. When this
|
|
- flag is set to strict, both irregular sequences and isolated surrogates
|
|
- will cause an error. When the flag is set to lenient, both irregular
|
|
- sequences and isolated surrogates are converted.
|
|
-
|
|
- Whether the flag is strict or lenient, all illegal sequences will cause
|
|
- an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
|
|
- or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
|
|
- must check for illegal sequences.
|
|
-
|
|
- When the flag is set to lenient, characters over 0x10FFFF are converted
|
|
- to the replacement character; otherwise (when the flag is set to strict)
|
|
- they constitute an error.
|
|
-
|
|
- Output parameters:
|
|
- The value "sourceIllegal" is returned from some routines if the input
|
|
- sequence is malformed. When "sourceIllegal" is returned, the source
|
|
- value will point to the illegal value that caused the problem. E.g.,
|
|
- in UTF-8 when a sequence is malformed, it points to the start of the
|
|
- malformed sequence.
|
|
-
|
|
- Author: Mark E. Davis, 1994.
|
|
- Rev History: Rick McGowan, fixes & updates May 2001.
|
|
- Fixes & updates, Sept 2001.
|
|
-
|
|
------------------------------------------------------------------------- */
|
|
-
|
|
-/* ---------------------------------------------------------------------
|
|
- The following 4 definitions are compiler-specific.
|
|
- The C standard does not guarantee that wchar_t has at least
|
|
- 16 bits, so wchar_t is no less portable than unsigned short!
|
|
- All should be unsigned values to avoid sign extension during
|
|
- bit mask & shift operations.
|
|
------------------------------------------------------------------------- */
|
|
-
|
|
-typedef unsigned long UTF32; /* at least 32 bits */
|
|
-typedef unsigned short UTF16; /* at least 16 bits */
|
|
-typedef unsigned char UTF8; /* typically 8 bits */
|
|
-typedef unsigned char Boolean; /* 0 or 1 */
|
|
-
|
|
-/* Some fundamental constants */
|
|
-#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
|
|
-#define UNI_MAX_BMP (UTF32)0x0000FFFF
|
|
-#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
|
|
-#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
|
|
-#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
|
|
-
|
|
-typedef enum {
|
|
- conversionOK, /* conversion successful */
|
|
- sourceExhausted, /* partial character in source, but hit end */
|
|
- targetExhausted, /* insuff. room in target for conversion */
|
|
- sourceIllegal /* source sequence is illegal/malformed */
|
|
-} ConversionResult;
|
|
-
|
|
-typedef enum {
|
|
- strictConversion = 0,
|
|
- lenientConversion
|
|
-} ConversionFlags;
|
|
-
|
|
-/* This is for C++ and does no harm in C */
|
|
-#ifdef __cplusplus
|
|
-extern "C" {
|
|
-#endif
|
|
-
|
|
-ConversionResult ConvertUTF8toUTF16 (
|
|
- const UTF8** sourceStart, const UTF8* sourceEnd,
|
|
- UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
|
|
-
|
|
-ConversionResult ConvertUTF16toUTF8 (
|
|
- const UTF16** sourceStart, const UTF16* sourceEnd,
|
|
- UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
|
|
-
|
|
-ConversionResult ConvertUTF8toUTF32 (
|
|
- const UTF8** sourceStart, const UTF8* sourceEnd,
|
|
- UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
|
|
-
|
|
-ConversionResult ConvertUTF32toUTF8 (
|
|
- const UTF32** sourceStart, const UTF32* sourceEnd,
|
|
- UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
|
|
-
|
|
-ConversionResult ConvertUTF16toUTF32 (
|
|
- const UTF16** sourceStart, const UTF16* sourceEnd,
|
|
- UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
|
|
-
|
|
-ConversionResult ConvertUTF32toUTF16 (
|
|
- const UTF32** sourceStart, const UTF32* sourceEnd,
|
|
- UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
|
|
-
|
|
-Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
|
|
-
|
|
-#ifdef __cplusplus
|
|
-}
|
|
-#endif
|
|
-
|
|
-/* --------------------------------------------------------------------- */
|
|
-
|
|
-#endif /* ConvertUTF_INCLUDED */
|
|
diff --git a/base/lib.mak b/base/lib.mak
|
|
index 173e2c6..2de6565 100644
|
|
--- a/base/lib.mak
|
|
+++ b/base/lib.mak
|
|
@@ -52,7 +52,6 @@ GLLCMS2CC=$(CC) $(LCMS2_CFLAGS) $(CFLAGS) $(I_)$(GLI_) $(II)$(LCMS2SRCDIR)$(D)in
|
|
lcms2_h=$(LCMS2SRCDIR)$(D)include$(D)lcms2.h
|
|
lcms2_plugin_h=$(LCMS2SRCDIR)$(D)include$(D)lcms2_plugin.h
|
|
|
|
-ConvertUTF_h=$(GLSRC)ConvertUTF.h
|
|
gdevdcrd_h=$(GLSRC)gdevdcrd.h
|
|
gdevpccm_h=$(GLSRC)gdevpccm.h
|
|
|
|
@@ -1097,9 +1096,6 @@ $(GLOBJ)gdevpccm.$(OBJ) : $(GLSRC)gdevpccm.c $(AK)\
|
|
$(gx_h) $(gsmatrix_h) $(gxdevice_h) $(gdevpccm_h) $(LIB_MAK) $(MAKEDIRS)
|
|
$(GLCC) $(GLO_)gdevpccm.$(OBJ) $(C_) $(GLSRC)gdevpccm.c
|
|
|
|
-$(GLOBJ)ConvertUTF.$(OBJ) : $(GLSRC)ConvertUTF.c $(ConvertUTF_h) $(LIB_MAK) $(MAKEDIRS)
|
|
- $(GLCC) $(GLO_)ConvertUTF.$(OBJ) $(C_) $(GLSRC)ConvertUTF.c
|
|
-
|
|
### Memory devices
|
|
|
|
$(GLOBJ)gdevmem.$(OBJ) : $(GLSRC)gdevmem.c $(AK) $(gx_h) $(gserrors_h) \
|
|
diff --git a/devices/devs.mak b/devices/devs.mak
|
|
index ea27ab0..51ec363 100644
|
|
--- a/devices/devs.mak
|
|
+++ b/devices/devs.mak
|
|
@@ -835,9 +835,8 @@ pdfwrite5_=$(DEVOBJ)gdevpdfm.$(OBJ)
|
|
pdfwrite6_=$(DEVOBJ)gdevpdfo.$(OBJ) $(DEVOBJ)gdevpdfp.$(OBJ) $(DEVOBJ)gdevpdft.$(OBJ)
|
|
pdfwrite7_=$(DEVOBJ)gdevpdfr.$(OBJ)
|
|
pdfwrite8_=$(DEVOBJ)gdevpdfu.$(OBJ) $(DEVOBJ)gdevpdfv.$(OBJ) $(DEVOBJ)gdevagl.$(OBJ)
|
|
-pdfwrite9_= $(GLOBJ)ConvertUTF.$(OBJ)
|
|
-pdfwrite10_=$(DEVOBJ)gsflip.$(OBJ)
|
|
-pdfwrite11_=$(DEVOBJ)scantab.$(OBJ) $(DEVOBJ)sfilter2.$(OBJ)
|
|
+pdfwrite9_=$(DEVOBJ)gsflip.$(OBJ)
|
|
+pdfwrite10_=$(DEVOBJ)scantab.$(OBJ) $(DEVOBJ)sfilter2.$(OBJ)
|
|
pdfwrite_=$(pdfwrite1_) $(pdfwrite2_) $(pdfwrite3_) $(pdfwrite4_)\
|
|
$(pdfwrite5_) $(pdfwrite6_) $(pdfwrite7_) $(pdfwrite8_) $(pdfwrite9_)\
|
|
$(pdfwrite10_) $(pdfwrite11_)
|
|
diff --git a/devices/vector/gdevpdf.c b/devices/vector/gdevpdf.c
|
|
index 2b3186d..20e0ae8 100644
|
|
--- a/devices/vector/gdevpdf.c
|
|
+++ b/devices/vector/gdevpdf.c
|
|
@@ -111,14 +111,13 @@ ENUM_PTRS_WITH(device_pdfwrite_enum_ptrs, gx_device_pdf *pdev)
|
|
ENUM_PTR(32, gx_device_pdf, pres_soft_mask_dict);
|
|
ENUM_PTR(33, gx_device_pdf, PDFXTrimBoxToMediaBoxOffset.data);
|
|
ENUM_PTR(34, gx_device_pdf, PDFXBleedBoxToTrimBoxOffset.data);
|
|
- ENUM_PTR(35, gx_device_pdf, DSCEncodingToUnicode.data);
|
|
- ENUM_PTR(36, gx_device_pdf, Identity_ToUnicode_CMaps[0]);
|
|
- ENUM_PTR(37, gx_device_pdf, Identity_ToUnicode_CMaps[1]);
|
|
- ENUM_PTR(38, gx_device_pdf, vgstack);
|
|
- ENUM_PTR(39, gx_device_pdf, outline_levels);
|
|
- ENUM_PTR(40, gx_device_pdf, EmbeddedFiles);
|
|
- ENUM_PTR(41, gx_device_pdf, pdf_font_dir);
|
|
- ENUM_PTR(42, gx_device_pdf, ExtensionMetadata);
|
|
+ ENUM_PTR(35, gx_device_pdf, Identity_ToUnicode_CMaps[0]);
|
|
+ ENUM_PTR(36, gx_device_pdf, Identity_ToUnicode_CMaps[1]);
|
|
+ ENUM_PTR(37, gx_device_pdf, vgstack);
|
|
+ ENUM_PTR(38, gx_device_pdf, outline_levels);
|
|
+ ENUM_PTR(39, gx_device_pdf, EmbeddedFiles);
|
|
+ ENUM_PTR(40, gx_device_pdf, pdf_font_dir);
|
|
+ ENUM_PTR(41, gx_device_pdf, ExtensionMetadata);
|
|
#define e1(i,elt) ENUM_PARAM_STRING_PTR(i + gx_device_pdf_num_ptrs, gx_device_pdf, elt);
|
|
gx_device_pdf_do_param_strings(e1)
|
|
#undef e1
|
|
@@ -165,7 +164,6 @@ static RELOC_PTRS_WITH(device_pdfwrite_reloc_ptrs, gx_device_pdf *pdev)
|
|
RELOC_PTR(gx_device_pdf, pres_soft_mask_dict);
|
|
RELOC_PTR(gx_device_pdf, PDFXTrimBoxToMediaBoxOffset.data);
|
|
RELOC_PTR(gx_device_pdf, PDFXBleedBoxToTrimBoxOffset.data);
|
|
- RELOC_PTR(gx_device_pdf, DSCEncodingToUnicode.data);
|
|
RELOC_PTR(gx_device_pdf, Identity_ToUnicode_CMaps[0]);
|
|
RELOC_PTR(gx_device_pdf, Identity_ToUnicode_CMaps[1]);
|
|
RELOC_PTR(gx_device_pdf, vgstack);
|
|
diff --git a/devices/vector/gdevpdfb.h b/devices/vector/gdevpdfb.h
|
|
index 08f18c5..447f0f5 100644
|
|
--- a/devices/vector/gdevpdfb.h
|
|
+++ b/devices/vector/gdevpdfb.h
|
|
@@ -141,7 +141,6 @@ const gx_device_pdf PDF_DEVICE_IDENT =
|
|
12000, /* MaxClipPathSize */ /* HP LaserJet 1320 hangs with 14000. */
|
|
256000, /* MaxShadingBitmapSize */
|
|
PDF_DEVICE_MaxInlineImageSize, /* MaxInlineImageSize */
|
|
- {0, 0}, /* DSCEncodingToUnicode */
|
|
{0, 0, 0}, /* OwnerPassword */
|
|
{0, 0, 0}, /* UserPassword */
|
|
0, /* KeyLength */
|
|
diff --git a/devices/vector/gdevpdfe.c b/devices/vector/gdevpdfe.c
|
|
index 1aa1f25..f23a02d 100644
|
|
--- a/devices/vector/gdevpdfe.c
|
|
+++ b/devices/vector/gdevpdfe.c
|
|
@@ -26,7 +26,6 @@
|
|
#include "gdevpdfx.h"
|
|
#include "gdevpdfg.h"
|
|
#include "gdevpdfo.h"
|
|
-#include "ConvertUTF.h"
|
|
|
|
char PDFDocEncodingLookup [92] = {
|
|
0x20, 0x22, 0x20, 0x20, 0x20, 0x21, 0x20, 0x26,
|
|
@@ -343,155 +342,162 @@ decode_escape(const byte *data, int data_length, int *index)
|
|
return c; /* A wrong escapement sequence. */
|
|
}
|
|
|
|
-static int
|
|
-pdf_xmp_write_translated(gx_device_pdf *pdev, stream *s, const byte *data, int data_length,
|
|
- void(*write)(stream *s, const byte *data, int data_length))
|
|
+/*
|
|
+ * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
|
|
+ * into the first byte, depending on how many bytes follow. There are
|
|
+ * as many entries in this table as there are UTF-8 sequence types.
|
|
+ * (I.e., one byte sequence, two byte... etc.). Remember that sequences
|
|
+ * for *legal* UTF-8 will be 4 or fewer bytes total.
|
|
+ */
|
|
+static const char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
|
|
+
|
|
+static int gs_ConvertUTF16(char *UTF16, int UTF16Len, unsigned char **UTF8Start, int UTF8Len)
|
|
{
|
|
- if (pdev->DSCEncodingToUnicode.data == 0) {
|
|
- int i, j=0;
|
|
- unsigned char *buf0;
|
|
+ int i, bytes = 0;
|
|
+ short U16;
|
|
+ unsigned char *UTF8 = *UTF8Start;
|
|
+ unsigned char *UTF8End = UTF8 + UTF8Len;
|
|
|
|
- buf0 = (unsigned char *)gs_alloc_bytes(pdev->memory, data_length * sizeof(unsigned char),
|
|
- "pdf_xmp_write_translated");
|
|
- if (buf0 == NULL)
|
|
- return_error(gs_error_VMerror);
|
|
- for (i = 0; i < data_length; i++) {
|
|
- byte c = data[i];
|
|
+ if (fabs(UTF16Len % sizeof(short)) != 0)
|
|
+ return gs_note_error(gs_error_rangecheck);
|
|
+
|
|
+ for (i=0;i<UTF16Len / sizeof(short);i++)
|
|
+ {
|
|
+ U16 = (*UTF16++) << 8;
|
|
+ U16 += *UTF16++;
|
|
|
|
- if (c == '\\')
|
|
- c = decode_escape(data, data_length, &i);
|
|
- buf0[j] = c;
|
|
- j++;
|
|
+ if (U16 >= 0xD800 && U16 <= 0xDBFF) {
|
|
+ return gs_note_error(gs_error_rangecheck);
|
|
}
|
|
- if (buf0[0] != 0xfe || buf0[1] != 0xff) {
|
|
- unsigned char *buf1;
|
|
- /* We must assume that the information is PDFDocEncoding. In this case
|
|
- * we need to convert it into UTF-8. If we just convert it to UTF-16
|
|
- * then we can safely fall through to the code below.
|
|
- */
|
|
- /* NB the code below skips the BOM in positions 0 and 1, so we need
|
|
- * two extra bytes, to be ignored.
|
|
- */
|
|
- buf1 = (unsigned char *)gs_alloc_bytes(pdev->memory, (j * sizeof(UTF16)) + 2,
|
|
- "pdf_xmp_write_translated");
|
|
- if (buf1 == NULL) {
|
|
- gs_free_object(pdev->memory, buf0, "pdf_xmp_write_translated");
|
|
- return_error(gs_error_VMerror);
|
|
- }
|
|
- memset(buf1, 0x00, (j * sizeof(UTF16)) + 2);
|
|
- for (i = 0; i < j; i++) {
|
|
- if (buf0[i] <= 0x7f || buf0[i] >= 0xAE) {
|
|
- if (buf0[i] == 0x7f) {
|
|
- emprintf1(pdev->memory, "PDFDocEncoding %x cannot be represented in Unicode\n",
|
|
- buf0[i]);
|
|
- } else
|
|
- buf1[(i * 2) + 3] = buf0[i];
|
|
+ if (U16 >= 0xDC00 && U16 <= 0xDFFF) {
|
|
+ return gs_note_error(gs_error_rangecheck);
|
|
+ }
|
|
+
|
|
+ if(U16 < 0x80) {
|
|
+ bytes = 1;
|
|
+ } else {
|
|
+ if (U16 < 0x800) {
|
|
+ bytes = 2;
|
|
+ } else {
|
|
+ if (U16 < 0x10000) {
|
|
+ bytes = 3;
|
|
} else {
|
|
- buf1[(i * 2) + 2] = PDFDocEncodingLookup[(buf0[i] - 0x80) * 2];
|
|
- buf1[(i * 2) + 3] = PDFDocEncodingLookup[((buf0[i] - 0x80) * 2) + 1];
|
|
- if (PDFDocEncodingLookup[((buf0[i] - 0x80) * 2) + 1] == 0x00)
|
|
- emprintf1(pdev->memory, "PDFDocEncoding %x cannot be represented in Unicode\n",
|
|
- PDFDocEncodingLookup[((buf0[i] - 0x80) * 2) + 1]);
|
|
+ if (U16 < 0x111000) {
|
|
+ bytes = 4;
|
|
+ } else {
|
|
+ bytes = 3;
|
|
+ U16 = 0xFFFD;
|
|
+ }
|
|
}
|
|
}
|
|
+ }
|
|
+ if (UTF8 + bytes > UTF8End)
|
|
+ return gs_note_error(gs_error_VMerror);
|
|
+
|
|
+ /* Write from end to beginning, low bytes first */
|
|
+ UTF8 += bytes;
|
|
+
|
|
+ switch(bytes) {
|
|
+ case 4:
|
|
+ *--UTF8 = (unsigned char)((U16 | 0x80) & 0xBF);
|
|
+ U16 >>= 6;
|
|
+ case 3:
|
|
+ *--UTF8 = (unsigned char)((U16 | 0x80) & 0xBF);
|
|
+ U16 >>= 6;
|
|
+ case 2:
|
|
+ *--UTF8 = (unsigned char)((U16 | 0x80) & 0xBF);
|
|
+ U16 >>= 6;
|
|
+ case 1:
|
|
+ *--UTF8 = (unsigned char)(U16 | firstByteMark[bytes]);
|
|
+ break;
|
|
+ default:
|
|
+ return gs_note_error(gs_error_rangecheck);
|
|
+ }
|
|
+
|
|
+ /* Move to start of next set */
|
|
+ UTF8 += bytes;
|
|
+ }
|
|
+ *UTF8Start = UTF8;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int
|
|
+pdf_xmp_write_translated(gx_device_pdf *pdev, stream *s, const byte *data, int data_length,
|
|
+ void(*write)(stream *s, const byte *data, int data_length))
|
|
+{
|
|
+ int i, j=0;
|
|
+ unsigned char *buf0;
|
|
+
|
|
+ buf0 = (unsigned char *)gs_alloc_bytes(pdev->memory, data_length * sizeof(unsigned char),
|
|
+ "pdf_xmp_write_translated");
|
|
+ if (buf0 == NULL)
|
|
+ return_error(gs_error_VMerror);
|
|
+ for (i = 0; i < data_length; i++) {
|
|
+ byte c = data[i];
|
|
+
|
|
+ if (c == '\\')
|
|
+ c = decode_escape(data, data_length, &i);
|
|
+ buf0[j] = c;
|
|
+ j++;
|
|
+ }
|
|
+ if (buf0[0] != 0xfe || buf0[1] != 0xff) {
|
|
+ unsigned char *buf1;
|
|
+ /* We must assume that the information is PDFDocEncoding. In this case
|
|
+ * we need to convert it into UTF-8. If we just convert it to UTF-16
|
|
+ * then we can safely fall through to the code below.
|
|
+ */
|
|
+ /* NB the code below skips the BOM in positions 0 and 1, so we need
|
|
+ * two extra bytes, to be ignored.
|
|
+ */
|
|
+ buf1 = (unsigned char *)gs_alloc_bytes(pdev->memory, (j * sizeof(short)) + 2,
|
|
+ "pdf_xmp_write_translated");
|
|
+ if (buf1 == NULL) {
|
|
gs_free_object(pdev->memory, buf0, "pdf_xmp_write_translated");
|
|
- buf0 = buf1;
|
|
- data_length = j = (j * 2) + 2;
|
|
+ return_error(gs_error_VMerror);
|
|
}
|
|
- {
|
|
- /* Its a Unicode (UTF-16BE) string, convert to UTF-8 */
|
|
- UTF16 *buf0b, U16;
|
|
- UTF8 *buf1, *buf1b;
|
|
-
|
|
- /* A single UTF-16 (2 bytes) can end up as 4 bytes in UTF-8 */
|
|
- buf1 = (UTF8 *)gs_alloc_bytes(pdev->memory, data_length * 2 * sizeof(unsigned char),
|
|
- "pdf_xmp_write_translated");
|
|
- if (buf1 == NULL) {
|
|
- gs_free_object(pdev->memory, buf0, "pdf_xmp_write_translated");
|
|
- return_error(gs_error_VMerror);
|
|
- }
|
|
- buf1b = buf1;
|
|
- /* Skip the Byte Order Mark (0xfe 0xff) */
|
|
- buf0b = (UTF16 *)(buf0 + 2);
|
|
- /* ConvertUTF16to UTF8 expects a buffer of UTF16s in the local
|
|
- * endian-ness, but the data is big-endian. In case this is a little-endian
|
|
- * machine, process the buffer from big-endian to whatever is right for this platform.
|
|
- */
|
|
- for (i = 2; i < j; i+=2) {
|
|
- U16 = (buf0[i] << 8) + buf0[i + 1];
|
|
- *(buf0b++) = U16;
|
|
- }
|
|
- buf0b = (UTF16 *)(buf0 + 2);
|
|
- switch (ConvertUTF16toUTF8((const UTF16**)&buf0b, (UTF16 *)(buf0 + j),
|
|
- &buf1b, buf1 + (data_length * 2 * sizeof(unsigned char)), strictConversion)) {
|
|
- case conversionOK:
|
|
- write(s, buf1, buf1b - buf1);
|
|
- gs_free_object(pdev->memory, buf1, "pdf_xmp_write_translated");
|
|
- break;
|
|
- case sourceExhausted:
|
|
- case targetExhausted:
|
|
- case sourceIllegal:
|
|
- default:
|
|
- gs_free_object(pdev->memory, buf0, "pdf_xmp_write_translated");
|
|
- gs_free_object(pdev->memory, buf1, "pdf_xmp_write_translated");
|
|
- return_error(gs_error_rangecheck);
|
|
+ memset(buf1, 0x00, (j * sizeof(short)) + 2);
|
|
+ for (i = 0; i < j; i++) {
|
|
+ if (buf0[i] <= 0x7f || buf0[i] >= 0xAE) {
|
|
+ if (buf0[i] == 0x7f) {
|
|
+ emprintf1(pdev->memory, "PDFDocEncoding %x cannot be represented in Unicode\n",
|
|
+ buf0[i]);
|
|
+ } else
|
|
+ buf1[(i * 2) + 3] = buf0[i];
|
|
+ } else {
|
|
+ buf1[(i * 2) + 2] = PDFDocEncodingLookup[(buf0[i] - 0x80) * 2];
|
|
+ buf1[(i * 2) + 3] = PDFDocEncodingLookup[((buf0[i] - 0x80) * 2) + 1];
|
|
+ if (PDFDocEncodingLookup[((buf0[i] - 0x80) * 2) + 1] == 0x00)
|
|
+ emprintf1(pdev->memory, "PDFDocEncoding %x cannot be represented in Unicode\n",
|
|
+ PDFDocEncodingLookup[((buf0[i] - 0x80) * 2) + 1]);
|
|
}
|
|
}
|
|
gs_free_object(pdev->memory, buf0, "pdf_xmp_write_translated");
|
|
- return 0;
|
|
- } else {
|
|
- UTF16 *buf0;
|
|
- const UTF16 *buf0b;
|
|
- UTF8 *buf1, *buf1b;
|
|
- int i, j = 0;
|
|
-
|
|
- buf0 = (UTF16 *)gs_alloc_bytes(pdev->memory, data_length * sizeof(UTF16),
|
|
- "pdf_xmp_write_translated");
|
|
- if (buf0 == NULL)
|
|
- return_error(gs_error_VMerror);
|
|
- buf1 = (UTF8 *)gs_alloc_bytes(pdev->memory, data_length * 2,
|
|
- "pdf_xmp_write_translated");
|
|
+ buf0 = buf1;
|
|
+ data_length = j = (j * 2) + 2;
|
|
+ }
|
|
+ {
|
|
+        /* It's a Unicode (UTF-16BE) string, convert to UTF-8 */
|
|
+ short *buf0b;
|
|
+ char *buf1, *buf1b;
|
|
+ int code;
|
|
+
|
|
+ /* A single UTF-16 (2 bytes) can end up as 4 bytes in UTF-8 */
|
|
+ buf1 = (char *)gs_alloc_bytes(pdev->memory, data_length * 2 * sizeof(unsigned char),
|
|
+ "pdf_xmp_write_translated");
|
|
if (buf1 == NULL) {
|
|
gs_free_object(pdev->memory, buf0, "pdf_xmp_write_translated");
|
|
return_error(gs_error_VMerror);
|
|
}
|
|
- buf0b = buf0;
|
|
buf1b = buf1;
|
|
- for (i = 0; i < data_length; i++) {
|
|
- byte c = data[i];
|
|
- int v;
|
|
-
|
|
- if (c == '\\')
|
|
- c = decode_escape(data, data_length, &i);
|
|
- if (c > pdev->DSCEncodingToUnicode.size) {
|
|
- gs_free_object(pdev->memory, buf0, "pdf_xmp_write_translated");
|
|
- gs_free_object(pdev->memory, buf1, "pdf_xmp_write_translated");
|
|
- return_error(gs_error_rangecheck);
|
|
- }
|
|
-
|
|
- v = pdev->DSCEncodingToUnicode.data[c];
|
|
- if (v == -1)
|
|
- v = '?'; /* Arbitrary. */
|
|
- buf0[j] = v;
|
|
- j++;
|
|
- }
|
|
- switch (ConvertUTF16toUTF8(&buf0b, buf0 + j,
|
|
- &buf1b, buf1 + data_length * 2, strictConversion)) {
|
|
- case conversionOK:
|
|
- write(s, buf1, buf1b - buf1);
|
|
- break;
|
|
- case sourceExhausted:
|
|
- case targetExhausted:
|
|
- case sourceIllegal:
|
|
- default:
|
|
- gs_free_object(pdev->memory, buf0, "pdf_xmp_write_translated");
|
|
- gs_free_object(pdev->memory, buf1, "pdf_xmp_write_translated");
|
|
- return_error(gs_error_rangecheck);
|
|
- }
|
|
- gs_free_object(pdev->memory, buf0, "pdf_xmp_write_translated");
|
|
- gs_free_object(pdev->memory, buf1, "pdf_xmp_write_translated");
|
|
- return 0;
|
|
+ /* Skip the Byte Order Mark (0xfe 0xff) */
|
|
+ buf0b = (short *)(buf0 + 2);
|
|
+ code = gs_ConvertUTF16((char *)buf0b, j - 2, (unsigned char **)&buf1b, data_length * 2 * sizeof(unsigned char));
|
|
+ if (code < 0)
|
|
+ return code;
|
|
+ write(s, (const byte *)buf1, buf1b - buf1);
|
|
}
|
|
+ gs_free_object(pdev->memory, buf0, "pdf_xmp_write_translated");
|
|
+ return 0;
|
|
}
|
|
|
|
static int
|
|
diff --git a/devices/vector/gdevpdfp.c b/devices/vector/gdevpdfp.c
|
|
index 0fa07e3..6ebcb0d 100644
|
|
--- a/devices/vector/gdevpdfp.c
|
|
+++ b/devices/vector/gdevpdfp.c
|
|
@@ -77,7 +77,6 @@ static const gs_param_item_t pdf_param_items[] = {
|
|
pi("CompressStreams", gs_param_type_bool, CompressStreams),
|
|
pi("PrintStatistics", gs_param_type_bool, PrintStatistics),
|
|
pi("MaxInlineImageSize", gs_param_type_long, MaxInlineImageSize),
|
|
- pi("DSCEncodingToUnicode", gs_param_type_int_array, DSCEncodingToUnicode),
|
|
|
|
/* PDF Encryption */
|
|
pi("OwnerPassword", gs_param_type_string, OwnerPassword),
|
|
diff --git a/devices/vector/gdevpdfx.h b/devices/vector/gdevpdfx.h
|
|
index 308900a..c436220 100644
|
|
--- a/devices/vector/gdevpdfx.h
|
|
+++ b/devices/vector/gdevpdfx.h
|
|
@@ -601,7 +601,6 @@ struct gx_device_pdf_s {
|
|
a bitmap representation of a shading.
|
|
(Bigger shadings to be downsampled). */
|
|
long MaxInlineImageSize;
|
|
- gs_param_int_array DSCEncodingToUnicode;
|
|
/* Encryption parameters */
|
|
gs_param_string OwnerPassword;
|
|
gs_param_string UserPassword;
|
|
@@ -911,14 +910,14 @@ struct gx_device_pdf_s {
|
|
m(28,sbstack) m(29,substream_Resources) m(30,font3)\
|
|
m(31,accumulating_substream_resource) \
|
|
m(32,pres_soft_mask_dict) m(33,PDFXTrimBoxToMediaBoxOffset.data)\
|
|
- m(34,PDFXBleedBoxToTrimBoxOffset.data) m(35, DSCEncodingToUnicode.data)\
|
|
- m(36,Identity_ToUnicode_CMaps[0]) m(37,Identity_ToUnicode_CMaps[1])\
|
|
- m(38,vgstack)\
|
|
- m(39, outline_levels)
|
|
- m(40, gx_device_pdf, EmbeddedFiles);
|
|
- m(41, gx_device_pdf, pdf_font_dir);
|
|
- m(42, gx_device_pdf, Extension_Metadata);*/
|
|
-#define gx_device_pdf_num_ptrs 43
|
|
+ m(34,PDFXBleedBoxToTrimBoxOffset.data)
|
|
+ m(35,Identity_ToUnicode_CMaps[0]) m(36,Identity_ToUnicode_CMaps[1])\
|
|
+ m(37,vgstack)\
|
|
+ m(38, outline_levels)
|
|
+ m(39, gx_device_pdf, EmbeddedFiles);
|
|
+ m(40, gx_device_pdf, pdf_font_dir);
|
|
+        m(41, gx_device_pdf, ExtensionMetadata);*/
|
|
+#define gx_device_pdf_num_ptrs 42
|
|
#define gx_device_pdf_do_param_strings(m)\
|
|
m(0, OwnerPassword) m(1, UserPassword) m(2, NoEncrypt)\
|
|
m(3, DocumentUUID) m(4, InstanceUUID)
|
|
diff --git a/windows/ghostscript.vcproj b/windows/ghostscript.vcproj
|
|
index a96d317..450cb26 100644
|
|
--- a/windows/ghostscript.vcproj
|
|
+++ b/windows/ghostscript.vcproj
|
|
@@ -1794,10 +1794,6 @@
|
|
>
|
|
</File>
|
|
<File
|
|
- RelativePath="..\base\ConvertUTF.c"
|
|
- >
|
|
- </File>
|
|
- <File
|
|
RelativePath="..\base\echogs.c"
|
|
>
|
|
</File>
|
|
@@ -3330,10 +3326,6 @@
|
|
>
|
|
</File>
|
|
<File
|
|
- RelativePath="..\base\ConvertUTF.h"
|
|
- >
|
|
- </File>
|
|
- <File
|
|
RelativePath="..\base\ctype_.h"
|
|
>
|
|
</File>
|
|
diff --git a/windows/ghostscript_rt.vcxproj b/windows/ghostscript_rt.vcxproj
|
|
index 2348f08..fae2e1f 100644
|
|
--- a/windows/ghostscript_rt.vcxproj
|
|
+++ b/windows/ghostscript_rt.vcxproj
|
|
@@ -427,7 +427,6 @@
|
|
<ItemGroup>
|
|
<ClCompile Include="..\base\aes.c" />
|
|
<ClCompile Include="..\base\bench.c" />
|
|
- <ClCompile Include="..\base\ConvertUTF.c" />
|
|
<ClCompile Include="..\base\echogs.c" />
|
|
<ClCompile Include="..\base\gconf.c" />
|
|
<ClCompile Include="..\base\genarch.c" />
|
|
@@ -1689,7 +1688,6 @@
|
|
<ClInclude Include="..\jasper\src\libjasper\ras\ras_enc.h" />
|
|
<ClInclude Include="..\base\aes.h" />
|
|
<ClInclude Include="..\base\assert_.h" />
|
|
- <ClInclude Include="..\base\ConvertUTF.h" />
|
|
<ClInclude Include="..\base\ctype_.h" />
|
|
<ClInclude Include="..\base\dirent_.h" />
|
|
<ClInclude Include="..\base\dos_.h" />
|
|
--
|
|
2.9.3
|
|
|