From f2af959037e9c42a4673de2850a4461a5a223618 Mon Sep 17 00:00:00 2001 From: Harald Hoyer Date: Thu, 31 Jul 2014 13:27:40 +0200 Subject: [PATCH] add Unicode compressor --- .../main/java/org/unicode/scsu/Assert.java | 44 ++ .../main/java/org/unicode/scsu/Compress.java | 556 ++++++++++++++++++ .../org/unicode/scsu/EndOfInputException.java | 45 ++ .../unicode/scsu/EndOfOutputException.java | 46 ++ .../main/java/org/unicode/scsu/Expand.java | 388 ++++++++++++ .../unicode/scsu/IllegalInputException.java | 45 ++ app/src/main/java/org/unicode/scsu/SCSU.java | 260 ++++++++ 7 files changed, 1384 insertions(+) create mode 100644 app/src/main/java/org/unicode/scsu/Assert.java create mode 100644 app/src/main/java/org/unicode/scsu/Compress.java create mode 100644 app/src/main/java/org/unicode/scsu/EndOfInputException.java create mode 100644 app/src/main/java/org/unicode/scsu/EndOfOutputException.java create mode 100644 app/src/main/java/org/unicode/scsu/Expand.java create mode 100644 app/src/main/java/org/unicode/scsu/IllegalInputException.java create mode 100644 app/src/main/java/org/unicode/scsu/SCSU.java diff --git a/app/src/main/java/org/unicode/scsu/Assert.java b/app/src/main/java/org/unicode/scsu/Assert.java new file mode 100644 index 0000000..1044012 --- /dev/null +++ b/app/src/main/java/org/unicode/scsu/Assert.java @@ -0,0 +1,44 @@ +package org.unicode.scsu; + +/** + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. 
MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND + * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND + * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING + * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. + * + * @author Asmus Freytag + * + * @version 001 Dec 25 1996 + * @version 002 Jun 25 1997 + * @version 003 Jul 25 1997 + * @version 004 Aug 25 1997 + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., + * and are registered in some jurisdictions. + **/ + +/** + * Unchecked exception thrown when an internal assertion has failed + */ +public class Assert extends RuntimeException { + public Assert(String assertion) { + super("Assertion failed: " + assertion); + } + + public Assert() { + super(); + } +} \ No newline at end of file diff --git a/app/src/main/java/org/unicode/scsu/Compress.java b/app/src/main/java/org/unicode/scsu/Compress.java new file mode 100644 index 0000000..80dc46a --- /dev/null +++ b/app/src/main/java/org/unicode/scsu/Compress.java @@ -0,0 +1,556 @@ +package org.unicode.scsu; + +/** + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. 
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ * @author Asmus Freytag
+ *
+ * @version 001 Dec 25 1996
+ * @version 002 Jun 25 1997
+ * @version 003 Jul 25 1997
+ * @version 004 Aug 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+ * This class implements a simple compression algorithm
+ */
+/*
+ Note on exception handling
+ This compressor is designed so that it can be restarted after
+ an exception. All operations advancing input and/or output cursor
+ (iIn and iOut) either complete an action, or set a state (fUnicodeMode)
+ before updating the cursors.
+*/
+public class Compress extends SCSU {
+
+    static int iNextWindow = 3; // NOTE: static, so the next window slot is shared by all instances
+    /**
+     * index of the next input character to be read
+     */
+    private int iIn;
+    /**
+     * index of the next output byte to be written
+     */
+    private int iOut;
+    /**
+     * start index of Unicode mode in output array, or -1 if in single byte mode
+     */
+    private int iSCU = -1;
+    /**
+     * true if the next command byte is of the Uxx family
+     */
+    private boolean fUnicodeMode = false;
+
+    /**
+     * locate a window for a character given a table of offsets
+     *
+     * @param ch - character
+     * @param offsetTable - table of window offsets
+     * @return true if the character fits a window from the table of windows
+     */
+    private boolean locateWindow(int ch, int[] offsetTable) {
+        // always try the current window first
+        int iWin = getCurrentWindow();
+
+        // if the character fits the current window
+        // just use the current window
+        if (iWin != -1 && ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) {
+            return true;
+        }
+
+        // try all windows in order; the first match becomes the current window
+        for (iWin = 0; iWin < offsetTable.length; iWin++) {
+            if (ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) {
+                selectWindow(iWin);
+                return true;
+            }
+        }
+        // none found
+        return false;
+    }
+
+    /**
+     * returns true if the character is ASCII, but not a control other than CR, LF and TAB
+     */
+    private boolean isAsciiCrLfOrTab(int ch) {
+        return (ch >= 0x20 && ch <= 0x7F) // ASCII
+                || ch == 0x09 || ch == 0x0A || ch == 0x0D; // CR/LF or TAB
+
+    }
+
+    /**
+     * output a run of characters in single byte mode
+     * In single byte mode pass through characters in the ASCII range, but
+     * quote characters overlapping with compression command codes. Runs
+     * of characters fitting the current window are output as runs of bytes
+     * in the range 0x80-0xFF. Checks for and validates Surrogate Pairs.
+     * Uses and updates the current input and output cursors stored in
+     * the instance variables iIn and iOut.
+     *
+     * @param in - input character array
+     * @param out - output byte array
+     * @return the next character to be processed. This may be an extended character.
+     */
+    public int outputSingleByteRun(char[] in, byte[] out)
+            throws EndOfOutputException, EndOfInputException, IllegalInputException {
+        int iWin = getCurrentWindow();
+        while (iIn < in.length) {
+            int outlen = 0;
+            byte byte1 = 0;
+            byte byte2 = 0;
+
+            // get the input character
+            int ch = in[iIn];
+
+            int inlen = 1;
+
+            // Check input for Surrogate pair
+            if ((ch & 0xF800) == 0xD800) {
+                if ((ch & 0xFC00) == 0xDC00) {
+                    // low surrogate out of order
+                    throw new IllegalInputException("Unpaired low surrogate: " + iIn);
+                } else {
+                    // have high surrogate now get low surrogate
+                    if (iIn >= in.length - 1) {
+                        // premature end of input
+                        throw new EndOfInputException();
+                    }
+                    // get the char
+                    int ch2 = in[iIn + 1];
+
+                    // make sure it's a low surrogate
+                    if ((ch2 & 0xFC00) != 0xDC00) {
+                        // a low surrogate was required
+                        throw new IllegalInputException("Unpaired high surrogate: " + (iIn + 1));
+                    }
+
+                    // combine the two values
+                    ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
+                    // ch = ch<<10 + ch2 - 0x36F0000;
+
+                    inlen = 2;
+                }
+            }
+
+            // ASCII Letter, NUL, CR, LF and TAB are always passed through
+            if (isAsciiCrLfOrTab(ch) || ch == 0) {
+                // pass through directly
+                byte2 = (byte) (ch & 0x7F);
+                outlen = 1;
+            }
+
+            // All other control codes must be quoted
+            else if (ch < 0x20) {
+                byte1 = SQ0;
+                byte2 = (byte) (ch);
+                outlen = 2;
+            }
+
+            // Letters that fit the current dynamic window
+            else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) {
+                ch -= dynamicOffset[iWin];
+                byte2 = (byte) (ch | 0x80);
+                outlen = 1;
+            }
+
+            // check for room in the output array
+            // (conservative: an exactly-full array is treated as no room)
+            if (iOut + outlen >= out.length) {
+                throw new EndOfOutputException();
+            }
+
+            switch (outlen) {
+                default:
+                    // need to use some other compression mode for this
+                    // character so we terminate this loop
+
+                    return ch; // input not finished
+
+                // output the characters
+                case 2:
+                    out[iOut++] = byte1;
+                    // fall through
+                case 1:
+                    out[iOut++] = byte2;
+                    break;
+            }
+            // advance input pointer
+            iIn += inlen;
+        }
+        return 0; // input all used up
+    }
+
+    /**
+     * quote a single character in single byte mode
+     * Quoting a character (aka 'non-locking shift') gives efficient access
+     * to characters that occur in isolation--usually punctuation characters.
+     * When quoting a character from a dynamic window use 0x80 - 0xFF, when
+     * quoting a character from a static window use 0x00-0x7f.
+     *
+     * @param ch - character to be quoted
+     * @param out - output byte array
+     */
+
+    private void quoteSingleByte(int ch, byte[] out)
+            throws EndOfOutputException {
+        int iWin = getCurrentWindow();
+
+        // check for room in the output array
+        if (iOut >= out.length - 2) {
+            throw new EndOfOutputException();
+        }
+
+        // Output command byte followed by the quoted data byte
+        out[iOut++] = (byte) (SQ0 + iWin);
+
+        // Letter that fits the current dynamic window
+        if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) {
+            ch -= dynamicOffset[iWin];
+            out[iOut++] = (byte) (ch | 0x80);
+        }
+
+        // Letter that fits the current static window
+        else if (ch >= staticOffset[iWin] && ch < staticOffset[iWin] + 0x80) {
+            ch -= staticOffset[iWin];
+            out[iOut++] = (byte) ch;
+        } else {
+            throw new Assert("ch = " + ch + " not valid in quoteSingleByte. Internal Compressor Error");
+        }
+        // advance input pointer
+        iIn++;
+    }
+
+    /**
+     * output a run of characters in Unicode mode
+     * A run of Unicode mode consists of characters which are all in the
+     * range of non-compressible characters or isolated occurrence
+     * of any other characters. Characters in the range 0xE000-0xF2FF must
+     * be quoted to avoid overlap with the Unicode mode compression command codes.
+     * Uses and updates the current input and output cursors stored in
+     * the instance variables iIn and iOut.
+     * NOTE: Characters from surrogate pairs are passed through and unlike single
+     * byte mode no checks are made for unpaired surrogate characters.
+     *
+     * @param in - input character array
+     * @param out - output byte array
+     * @return the next input character to be processed
+     */
+    public char outputUnicodeRun(char[] in, byte[] out)
+            throws EndOfOutputException {
+        // current character
+        char ch = 0;
+
+        while (iIn < in.length) {
+            // get current input and set default output length
+            ch = in[iIn];
+            int outlen = 2;
+
+            // Characters in these ranges could potentially be compressed.
+            // We require 2 or more compressible characters to break the run
+            if (isCompressible(ch)) {
+                // check whether we can look ahead
+                if (iIn < in.length - 1) {
+                    char ch2 = in[iIn + 1];
+                    if (isCompressible(ch2)) {
+                        // at least 2 characters are compressible
+                        // break the run
+                        break;
+                    }
+                }
+                // If we get here, the current character is only character
+                // left in the input or it is followed by a non-compressible
+                // character. In neither case do we gain by breaking the
+                // run, so we proceed to output the character.
+                if (ch >= 0xE000 && ch <= 0xF2FF) {
+                    // Characters in this range need to be escaped
+                    outlen = 3;
+                }
+
+            }
+            // check that there is enough room to output the character
+            if (iOut >= out.length - outlen) {
+                // if we got here, we ran out of space in the output array
+                throw new EndOfOutputException();
+            }
+
+            // output any characters that cannot be compressed,
+            if (outlen == 3) {
+                // output the quote character
+                out[iOut++] = (byte) UQU;
+            }
+            // pass the Unicode character in MSB,LSB order
+            out[iOut++] = (byte) (ch >>> 8);
+            out[iOut++] = (byte) (ch & 0xFF);
+
+            // advance input cursor
+            iIn++;
+        }
+
+        // return the last character
+        return ch;
+    }
+
+    /**
+     * redefine a window so it surrounds a given character value
+     * The window slot to redefine is taken round-robin from iNextWindow;
+     * characters above 0xFFFF are defined with the extended (SDX/UDX) command.
+     *
+     * @param ch - character around which window is positioned
+     * @param out - output byte array
+     * @param fUnicodeMode - true to emit the Unicode mode (UDn/UDX) command forms
+     * @return true if a window was successfully defined
+     */
+    private boolean positionWindow(int ch, byte[] out, boolean fUnicodeMode)
+            throws IllegalInputException, EndOfOutputException {
+        int iWin = iNextWindow % 8; // round-robin slot selection (not a true LRU)
+        int iPosition = 0;
+
+        // iPosition 0 is a reserved value
+        if (ch < 0x80) {
+            throw new Assert("ch < 0x80");
+        }
+
+        // Check the fixed offsets
+        for (int i = 0; i < fixedOffset.length; i++) {
+            if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80) {
+                iPosition = i;
+                break;
+            }
+        }
+
+        if (iPosition != 0) {
+            // ch fits in a fixed offset window position
+            dynamicOffset[iWin] = fixedOffset[iPosition];
+            iPosition += 0xF9;
+        } else if (ch < 0x3400) {
+            // calculate a window position command and set the offset
+            iPosition = ch >>> 7;
+            dynamicOffset[iWin] = ch & 0xFF80;
+
+        } else if (ch < 0xE000) {
+            // attempt to place a window where none can go
+            return false;
+        } else if (ch <= 0xFFFF) {
+            // calculate a window position command, accounting
+            // for the gap in position values, and set the offset
+            iPosition = ((ch - gapOffset) >>> 7);
+
+            dynamicOffset[iWin] = ch & 0xFF80;
+
+        } else {
+            // if we get here, the character is in the extended range.
+            // The window index travels in the top 3 bits of the 16-bit argument
+
+            iPosition = (ch - 0x10000) >>> 7;
+
+            iPosition |= iWin << 13;
+            dynamicOffset[iWin] = ch & 0x1FFF80;
+        }
+
+        // Outputting window definition command for the general (BMP) cases; the test
+        // is on ch because an extended position with iWin == 0 can also be below 0x100
+        if (ch <= 0xFFFF && iOut < out.length - 1) {
+            out[iOut++] = (byte) ((fUnicodeMode ? UD0 : SD0) + iWin);
+            out[iOut++] = (byte) (iPosition & 0xFF);
+        }
+        // Output an extended window definition command
+        else if (ch > 0xFFFF && iOut < out.length - 2) {
+
+            out[iOut++] = (byte) (fUnicodeMode ? UDX : SDX);
+            out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF);
+            out[iOut++] = (byte) (iPosition & 0xFF);
+        } else {
+            throw new EndOfOutputException();
+        }
+        selectWindow(iWin);
+        iNextWindow = (iNextWindow + 1) % 8; // stay in 0..7 so the counter cannot overflow
+        return true;
+    }
+
+    /**
+     * compress a Unicode character array with some simplifying assumptions;
+     * input is read from in[iStartIn] on, output is written from out[iStartOut] on
+     * @return the number of bytes written to out by this call
+     */
+    public int simpleCompress(char[] in, int iStartIn, byte[] out, int iStartOut)
+            throws IllegalInputException, EndOfInputException, EndOfOutputException {
+        iIn = iStartIn;
+        iOut = iStartOut;
+
+        while (iIn < in.length) {
+            int ch;
+
+            // previously we switched to a Unicode run
+            if (iSCU != -1) {
+
+                // output characters as Unicode
+                ch = outputUnicodeRun(in, out);
+
+                // for single character Unicode runs (3 bytes) use quote
+                if (iOut - iSCU == 3) {
+                    // go back and fix up the SCU to an SQU instead
+                    out[iSCU] = (byte) SQU;
+                    iSCU = -1;
+                    continue;
+                } else {
+                    iSCU = -1;
+                    fUnicodeMode = true;
+                }
+            }
+            // next, try to output characters as single byte run
+            else {
+                ch = outputSingleByteRun(in, out);
+            }
+
+            // check whether we still have input
+            if (iIn == in.length) {
+                break; // no more input
+            }
+
+            // if we get here, we have a consistent value for ch, whether or
+            // not it is an regular or extended character. Locate or define a
+            // Window for the current character
+
+            // Check that we have enough room to output the command byte
+            if (iOut >= out.length - 1) {
+                throw new EndOfOutputException();
+            }
+
+            // In order to switch away from Unicode mode, it is necessary
+            // to select (or define) a window. If the characters that follow
+            // the Unicode range are ASCII characters, we can't use them
+            // to decide which window to select, since ASCII characters don't
+            // influence window settings. This loop looks ahead until it finds
+            // one compressible character that isn't in the ASCII range.
+            for (int ich = iIn; ch < 0x80; ich++) {
+                if (ich == in.length || !isCompressible(in[ich])) {
+                    // if there are only ASCII characters left,
+                    ch = in[iIn];
+                    break;
+                }
+                ch = in[ich]; // lookahead for next non-ASCII char
+            }
+            // The character value contained in ch here will only be used to select
+            // output modes. Actual output of characters starts with in[iIn] and
+            // only takes place near the top of the loop.
+
+            int iprevWindow = getCurrentWindow();
+
+            // try to locate a dynamic window
+            if (ch < 0x80 || locateWindow(ch, dynamicOffset)) {
+                // lookahead to use SQn instead of SCn for single
+                // character interruptions of runs in current window
+                if (!fUnicodeMode && iIn < in.length - 1) {
+                    char ch2 = in[iIn + 1];
+                    if (ch2 >= dynamicOffset[iprevWindow] &&
+                            ch2 < dynamicOffset[iprevWindow] + 0x80) {
+                        quoteSingleByte(ch, out);
+                        selectWindow(iprevWindow);
+                        continue;
+                    }
+                }
+
+                out[iOut++] = (byte) ((fUnicodeMode ? UC0 : SC0) + getCurrentWindow());
+                fUnicodeMode = false;
+            }
+            // try to locate a static window
+            else if (!fUnicodeMode && locateWindow(ch, staticOffset)) {
+                // static windows are not accessible from Unicode mode
+                quoteSingleByte(ch, out);
+                selectWindow(iprevWindow); // restore current Window settings
+                continue;
+            }
+            // try to define a window around ch
+            else if (positionWindow(ch, out, fUnicodeMode)) {
+                fUnicodeMode = false;
+            }
+            // If all else fails, start a Unicode run
+            else {
+                iSCU = iOut;
+                out[iOut++] = (byte) SCU;
+                continue;
+            }
+        }
+
+        return iOut - iStartOut;
+    }
+
+    /**
+     * compress a whole string, growing the output buffer whenever it fills up
+     *
+     * @return the compressed bytes, trimmed to the number of bytes written
+     */
+    public byte[] compress(String inStr)
+            throws IllegalInputException, EndOfInputException {
+        // Running out of room for output can cause non-optimal
+        // compression. In order to not slow down compression too
+        // much, not all intermediate state is constantly saved.
+
+        byte[] out = new byte[inStr.length() * 2];
+        char[] in = inStr.toCharArray();
+        reset();
+        while (true) {
+            try {
+                simpleCompress(in, charsRead(), out, bytesWritten());
+                // if we get here things went fine.
+                break;
+            } catch (EndOfOutputException e) {
+                // create a larger output buffer and continue
+                byte[] largerOut = new byte[out.length * 2];
+                System.arraycopy(out, 0, largerOut, 0, out.length);
+                out = largerOut;
+            }
+        }
+        byte[] trimmedOut = new byte[bytesWritten()];
+        System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length);
+        out = trimmedOut;
+
+        return out;
+    }
+
+    /**
+     * reset the compressor (window state, mode flags and both cursors) so
+     * that the same instance can be restarted with new input
+     */
+    public void reset() {
+        super.reset();
+        fUnicodeMode = false;
+        iSCU = -1;
+        iIn = 0;
+        iOut = 0;
+    }
+
+    /**
+     * returns the number of bytes written
+     */
+    public int bytesWritten() {
+        return iOut;
+    }
+
+    /**
+     * returns the number of input characters read
+     */
+    public int charsRead() {
+        return iIn;
+    }
+
+}
\ No newline at end of file
diff --git a/app/src/main/java/org/unicode/scsu/EndOfInputException.java b/app/src/main/java/org/unicode/scsu/EndOfInputException.java
new file mode 100644
index 0000000..0437c15
--- /dev/null
+++ b/app/src/main/java/org/unicode/scsu/EndOfInputException.java
@@ -0,0 +1,45 @@
+package org.unicode.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND + * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND + * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING + * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. + * + * @author Asmus Freytag + * + * @version 001 Dec 25 1996 + * @version 002 Jun 25 1997 + * @version 003 Jul 25 1997 + * @version 004 Aug 25 1997 + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., + * and are registered in some jurisdictions. + **/ + +/** + * Checked exception: the input string or input byte array ended prematurely + */ +public class EndOfInputException + extends java.lang.Exception { + public EndOfInputException() { + super("The input string or input byte array ended prematurely"); + } + + public EndOfInputException(String s) { + super(s); + } +} diff --git a/app/src/main/java/org/unicode/scsu/EndOfOutputException.java b/app/src/main/java/org/unicode/scsu/EndOfOutputException.java new file mode 100644 index 0000000..9ffe784 --- /dev/null +++ b/app/src/main/java/org/unicode/scsu/EndOfOutputException.java @@ -0,0 +1,46 @@ +package org.unicode.scsu; + +/** + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. 
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ * @author Asmus Freytag
+ *
+ * @version 001 Dec 25 1996
+ * @version 002 Jun 25 1997
+ * @version 003 Jul 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+ * Checked exception: the output byte array has no room left for more output
+ */
+public class EndOfOutputException
+        extends java.lang.Exception
+
+{
+    public EndOfOutputException() {
+        super("The output byte array is too small to hold the remaining output");
+    }
+
+    public EndOfOutputException(String s) {
+        super(s);
+    }
+}
diff --git a/app/src/main/java/org/unicode/scsu/Expand.java b/app/src/main/java/org/unicode/scsu/Expand.java
new file mode 100644
index 0000000..20bad5a
--- /dev/null
+++ b/app/src/main/java/org/unicode/scsu/Expand.java
@@ -0,0 +1,388 @@
+package org.unicode.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ * @author Asmus Freytag
+ *
+ * @version 001 Dec 25 1996
+ * @version 002 Jun 25 1997
+ * @version 003 Jul 25 1997
+ * @version 004 Aug 25 1997
+ * @version 005 Sep 30 1998
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+ * Reference decoder for the Standard Compression Scheme for Unicode (SCSU)
+ * <p>
+ * <h2>Notes on the Java implementation</h2>
+ * <p>
+ * A limitation of Java is the exclusive use of a signed byte data type.
+ * The following work arounds are required:
+ * <p>
+ * Copying a byte to an integer variable and adding 256 for 'negative'
+ * bytes gives an integer in the range 0-255.
+ * <p>
+ * Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
+ * char values is unsigned.
+ * <p>
+ * Extended characters require an int to store them. The sign is not an
+ * issue because only 1024*1024 + 65536 extended characters exist.
+ */
+public class Expand extends SCSU {
+    /**
+     * string buffer length used by the following functions
+     */
+    protected int iOut = 0;
+    /**
+     * input cursor used by the following functions
+     */
+    protected int iIn = 0;
+
+    /**
+     * assemble a char from two bytes
+     * In Java bytes are signed quantities, while chars are unsigned
+     *
+     * @param hi most significant byte
+     * @param lo least significant byte
+     * @return the character
+     */
+    public static char charFromTwoBytes(byte hi, byte lo) {
+        char ch = (char) (lo >= 0 ? lo : 256 + lo);
+        return (char) (ch + (char) ((hi >= 0 ? hi : 256 + hi) << 8));
+    }
+
+    /**
+     * (re-)define (and select) a dynamic window
+     * A sliding window position cannot start at any Unicode value,
+     * so rather than providing an absolute offset, this function takes
+     * an index value which selects among the possible starting values.
+     * <p>
+     * Most scripts in Unicode start on or near a half-block boundary
+     * so the default behaviour is to multiply the index by 0x80. Han,
+     * Hangul, Surrogates and other scripts between 0x3400 and 0xDFFF
+     * show very poor locality--therefore no sliding window can be set
+     * there. A gapOffset is added to the index value to skip that region,
+     * and only 167 index values total are required to select all eligible
+     * half-blocks.
+     * <p>
+     * Finally, a few scripts straddle half block boundaries. For them, a
+     * table of fixed offsets is used, and the index values from 0xF9 to
+     * 0xFF are used to select these special offsets.
+     * <p>
+     * After (re-)defining a windows location it is selected so it is ready
+     * for use.
+     * <p>
+     * Recall that all Windows are of the same length (128 code positions).
+     *
+     * @param iWindow - index of the window to be (re-)defined
+     * @param bOffset - index for the new offset value
+     */
+    // @005 protected <-- private here and elsewhere
+    protected void defineWindow(int iWindow, byte bOffset)
+            throws IllegalInputException {
+        int iOffset = (bOffset < 0 ? bOffset + 256 : bOffset);
+
+        // 0 is a reserved value
+        if (iOffset == 0) {
+            throw new IllegalInputException();
+        } else if (iOffset < gapThreshold) {
+            dynamicOffset[iWindow] = iOffset << 7;
+        } else if (iOffset < reservedStart) {
+            dynamicOffset[iWindow] = (iOffset << 7) + gapOffset;
+        } else if (iOffset < fixedThreshold) {
+            // more reserved values
+            throw new IllegalInputException("iOffset == " + iOffset);
+        } else {
+            dynamicOffset[iWindow] = fixedOffset[iOffset - fixedThreshold];
+        }
+
+        // make the redefined window the active one
+        selectWindow(iWindow);
+    }
+
+    /**
+     * (re-)define (and select) a window as an extended dynamic window
+     * The surrogate area in Unicode allows access to 2**20 codes beyond the
+     * first 64K codes by combining one of 1024 characters from the High
+     * Surrogate Area with one of 1024 characters from the Low Surrogate
+     * Area (see Unicode 2.0 for the details).
+     * <p>
+     * The tags SDX and UDX set the window such that each subsequent byte in
+     * the range 80 to FF represents a surrogate pair. The following diagram
+     * shows how the bits in the two bytes following the SDX or UDX, and a
+     * subsequent data byte, map onto the bits in the resulting surrogate pair.
+     * <p>
+     * hbyte             lbyte             data
+     * nnnwwwww          zzzzzyyy          1xxxxxxx
+     * <p>
+     * high-surrogate    low-surrogate
+     * 110110wwwwwzzzzz  110111yyyxxxxxxx
+     *
+     * @param chOffset - Since the three top bits of chOffset are not needed to
+     * set the location of the extended Window, they are used instead
+     * to select the window, thereby reducing the number of needed command codes.
+     * The bottom 13 bits of chOffset are used to calculate the offset relative to
+     * a 7 bit input data byte to yield the 20 bits expressed by each surrogate pair.
+     */
+    protected void defineExtendedWindow(char chOffset) {
+        // The top 3 bits of chOffset are the window index
+        int iWindow = chOffset >>> 13;
+
+        // Calculate the new offset
+        dynamicOffset[iWindow] = ((chOffset & 0x1FFF) << 7) + (1 << 16);
+
+        // make the redefined window the active one
+        selectWindow(iWindow);
+    }
+
+    /**
+     * expand input that is in Unicode mode
+     *
+     * @param in input byte array to be expanded
+     * @param iCur starting index
+     * @param sb string buffer to which to append expanded input
+     * @return the index for the last byte processed
+     */
+    protected int expandUnicode(byte[] in, int iCur, StringBuffer sb)
+            throws IllegalInputException, EndOfInputException {
+        for (; iCur < in.length - 1; iCur += 2) // step by 2:
+        {
+            byte b = in[iCur];
+
+            if (b >= UC0 && b <= UC7) {
+                selectWindow(b - UC0);
+                return iCur;
+            } else if (b >= UD0 && b <= UD7) {
+                defineWindow(b - UD0, in[iCur + 1]);
+                return iCur + 1;
+            } else if (b == UDX) {
+                if (iCur >= in.length - 2) {
+                    break; // buffer error
+                }
+                defineExtendedWindow(charFromTwoBytes(in[iCur + 1], in[iCur + 2]));
+                return iCur + 2;
+            } else if (b == UQU) {
+                if (iCur >= in.length - 2) {
+                    break; // error
+                }
+                // Skip command byte and output Unicode character
+                iCur++;
+            }
+
+            // output a Unicode character
+            char ch = charFromTwoBytes(in[iCur], in[iCur + 1]);
+            sb.append((char) ch);
+            iOut++;
+        }
+
+        if (iCur == in.length) {
+            return iCur;
+        }
+
+        // Error condition
+        throw new EndOfInputException();
+    }
+
+    /**
+     * expand portion of the input that is in single byte mode
+     *
+     * @param in compressed input bytes
+     * @return the expanded characters as a string
+     * @throws EndOfInputException if a command is cut off by the end of in
+     */
+    protected String expandSingleByte(byte[] in)
+            throws IllegalInputException, EndOfInputException {
+
+        /* Allocate the output buffer. Because of control codes, generally
+        each byte of input results in fewer than one character of
+        output. Using in.length as an initial allocation length should avoid
+        the need to reallocate in mid-stream. The exception to this rule are
+        surrogates. */
+        StringBuffer sb = new StringBuffer(in.length);
+        iOut = 0;
+
+        // Loop until all input is exhausted or an error occurred
+        int iCur;
+        Loop:
+        for (iCur = 0; iCur < in.length; iCur++) {
+            // Default behaviour is that ASCII characters are passed through
+            // (staticOffset[0] == 0) and characters with the high bit on are
+            // offset by the current dynamic (or sliding) window (this.iWindow)
+            int iStaticWindow = 0;
+            int iDynamicWindow = getCurrentWindow();
+
+            switch (in[iCur]) {
+                // Quote from a static Window
+                case SQ0:
+                case SQ1:
+                case SQ2:
+                case SQ3:
+                case SQ4:
+                case SQ5:
+                case SQ6:
+                case SQ7:
+                    // skip the command byte and check for length
+                    if (iCur >= in.length - 1) {
+                        break Loop; // buffer length error
+                    }
+                    // Select window pair to quote from
+                    iDynamicWindow = iStaticWindow = in[iCur] - SQ0;
+                    iCur++;
+
+                    // FALL THROUGH
+
+                default:
+                    // output as character
+                    if (in[iCur] >= 0) {
+                        // use static window
+                        int ch = in[iCur] + staticOffset[iStaticWindow];
+                        sb.append((char) ch);
+                        iOut++;
+                    } else {
+                        // use dynamic window
+                        int ch = (in[iCur] + 256); // adjust for signed bytes
+                        ch -= 0x80; // reduce to range 00..7F
+                        ch += dynamicOffset[iDynamicWindow];
+
+                        if (ch < 1 << 16) {
+                            // in Unicode range, output directly
+                            sb.append((char) ch);
+                            iOut++;
+                        } else {
+                            // this is an extension character
+
+                            // compute and append the two surrogates:
+                            // translate from 10000..10FFFF to 0..FFFFF
+                            ch -= 0x10000;
+
+                            // high surrogate = top 10 bits added to D800
+                            sb.append((char) (0xD800 + (ch >> 10)));
+                            iOut++;
+
+                            // low surrogate = bottom 10 bits added to DC00
+                            sb.append((char) (0xDC00 + (ch & ~0xFC00)));
+                            iOut++;
+                        }
+                    }
+                    break;
+
+                // define a dynamic window as extended
+                case SDX:
+                    iCur += 2;
+                    if (iCur >= in.length) {
+                        throw new EndOfInputException(); // iCur is already past the end, so fail here
+                    }
+                    defineExtendedWindow(charFromTwoBytes(in[iCur - 1], in[iCur]));
+                    break;
+
+                // Position a dynamic Window
+                case SD0:
+                case SD1:
+                case SD2:
+                case SD3:
+                case SD4:
+                case SD5:
+                case SD6:
+                case SD7:
+                    iCur++;
+                    if (iCur >= in.length) {
+                        throw new EndOfInputException(); // iCur is already past the end, so fail here
+                    }
+                    defineWindow(in[iCur - 1] - SD0, in[iCur]);
+                    break;
+
+                // Select a new dynamic Window
+                case SC0:
+                case SC1:
+                case SC2:
+                case SC3:
+                case SC4:
+                case SC5:
+                case SC6:
+                case SC7:
+                    selectWindow(in[iCur] - SC0);
+                    break;
+                case SCU:
+                    // switch to Unicode mode and continue parsing
+                    iCur = expandUnicode(in, iCur + 1, sb);
+                    break;
+
+                case SQU:
+                    // directly extract one Unicode character
+                    iCur += 2;
+                    if (iCur >= in.length) {
+                        throw new EndOfInputException(); // iCur is already past the end, so fail here
+                    } else {
+                        char ch = charFromTwoBytes(in[iCur - 1], in[iCur]);
+
+                        sb.append((char) ch);
+                        iOut++;
+                    }
+                    break;
+
+                case Srs:
+                    throw new IllegalInputException();
+            }
+        }
+
+        if (iCur >= in.length) {
+            //SUCCESS: all input used up
+            sb.setLength(iOut);
+            iIn = iCur;
+            return sb.toString();
+        }
+
+        //ERROR: premature end of input
+        throw new EndOfInputException();
+    }
+
+    /**
+     * expand a byte array containing compressed Unicode
+     */
+    public String expand(byte[] in)
+            throws IllegalInputException, EndOfInputException {
+        String str = expandSingleByte(in);
+        return str;
+    }
+
+
+    /**
+     * reset is called to start with new input, w/o creating a new
+     * instance
+     */
+    public void reset() {
+        iOut = 0;
+        iIn = 0;
+        super.reset();
+    }
+
+    /** returns the number of chars appended to the output so far */
+    public int charsWritten() {
+        return iOut;
+    }
+
+    /** returns the input index recorded when the last expansion finished */
+    public int bytesRead() {
+        return iIn;
+    }
+}
diff --git
a/app/src/main/java/org/unicode/scsu/IllegalInputException.java b/app/src/main/java/org/unicode/scsu/IllegalInputException.java new file mode 100644 index 0000000..a82e9bc --- /dev/null +++ b/app/src/main/java/org/unicode/scsu/IllegalInputException.java @@ -0,0 +1,45 @@ +package org.unicode.scsu; + +/** + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND + * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND + * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING + * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. + * + * @author Asmus Freytag + * + * @version 001 Dec 25 1996 + * @version 002 Jun 25 1997 + * @version 003 Jul 25 1997 + * @version 004 Aug 25 1997 + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., + * and are registered in some jurisdictions. 
/**
 * Thrown when the input character array or input byte array contains
 * illegal sequences of bytes or characters.
 */
public class IllegalInputException extends Exception {

    /** Detail message used when the caller does not supply one. */
    private static final String DEFAULT_MESSAGE =
            "The input character array or input byte array contained illegal sequences of bytes or characters";

    /**
     * Creates the exception with the standard detail message.
     */
    public IllegalInputException() {
        this(DEFAULT_MESSAGE);
    }

    /**
     * Creates the exception with a caller-supplied detail message.
     *
     * @param s the detail message
     */
    public IllegalInputException(String s) {
        super(s);
    }
}
/**
 * Encoding text data in Unicode often requires more storage than using
 * an existing 8-bit character set that is limited to the subset of characters
 * actually found in the text. The Unicode Compression Algorithm reduces
 * the necessary storage while retaining the universality of Unicode.
 * A full description of the algorithm can be found in document
 * http://www.unicode.org/unicode/reports/tr6.html
 *
 * <h2>Summary</h2>
 *
 * The goal of the Unicode Compression Algorithm is the ability to
 * <ul>
 * <li>Express all code points in Unicode</li>
 * <li>Approximate storage size for traditional character sets</li>
 * <li>Work well for short strings</li>
 * <li>Provide transparency for Latin-1 data</li>
 * <li>Support very simple decoders</li>
 * <li>Support simple as well as sophisticated encoders</li>
 * </ul>
 * <p>
 * If needed, further compression can be achieved by layering standard
 * file or disk-block based compression algorithms on top.
 *
 * <h2>Features</h2>
 *
 * Languages using small alphabets would contain runs of characters that
 * are coded close together in Unicode. These runs are interrupted only
 * by punctuation characters, which are themselves coded in proximity to
 * each other in Unicode (usually in the ASCII range).
 * <p>
 * Two basic mechanisms in the compression algorithm account for these two
 * cases, sliding windows and static windows. A window is an area of 128
 * consecutive characters in Unicode. In the compressed data stream, each
 * character from a sliding window would be represented as a byte between
 * 0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
 * TAB) would always mean an ASCII character (or control).
 *
 * <h2>Notes on the Java implementation</h2>
 *
 * A limitation of Java is the exclusive use of a signed byte data type.
 * The following workarounds are required:
 * <p>
 * Copying a byte to an integer variable and adding 256 for 'negative'
 * bytes gives an integer in the range 0-255.
 * <p>
 * Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
 * char values is unsigned.
 * <p>
 * Extended characters require an int to store them. The sign is not an
 * issue because only 1024*1024 + 65536 extended characters exist.
 */
public abstract class SCSU {

    // ---- Single Byte mode command values ----

    /*
     * SQn: Quote from Window.
     * If the following byte is less than 0x80, quote from
     * static window n, else quote from dynamic window n.
     */
    static final byte SQ0 = 0x01; // Quote from window pair 0
    static final byte SQ1 = 0x02; // Quote from window pair 1
    static final byte SQ2 = 0x03; // Quote from window pair 2
    static final byte SQ3 = 0x04; // Quote from window pair 3
    static final byte SQ4 = 0x05; // Quote from window pair 4
    static final byte SQ5 = 0x06; // Quote from window pair 5
    static final byte SQ6 = 0x07; // Quote from window pair 6
    static final byte SQ7 = 0x08; // Quote from window pair 7

    static final byte SDX = 0x0B; // Define a window as extended
    static final byte Srs = 0x0C; // reserved

    static final byte SQU = 0x0E; // Quote a single Unicode character
    static final byte SCU = 0x0F; // Change to Unicode mode

    /*
     * SCn: Change to Window n.
     * If the following bytes are less than 0x80, interpret them
     * as command bytes or pass them through, else add the offset
     * for dynamic window n.
     */
    static final byte SC0 = 0x10; // Select window 0
    static final byte SC1 = 0x11; // Select window 1
    static final byte SC2 = 0x12; // Select window 2
    static final byte SC3 = 0x13; // Select window 3
    static final byte SC4 = 0x14; // Select window 4
    static final byte SC5 = 0x15; // Select window 5
    static final byte SC6 = 0x16; // Select window 6
    static final byte SC7 = 0x17; // Select window 7
    static final byte SD0 = 0x18; // Define and select window 0
    static final byte SD1 = 0x19; // Define and select window 1
    static final byte SD2 = 0x1A; // Define and select window 2
    static final byte SD3 = 0x1B; // Define and select window 3
    static final byte SD4 = 0x1C; // Define and select window 4
    static final byte SD5 = 0x1D; // Define and select window 5
    static final byte SD6 = 0x1E; // Define and select window 6
    static final byte SD7 = 0x1F; // Define and select window 7

    // ---- U-prefixed command values (high bit set, hence the byte casts) ----

    static final byte UC0 = (byte) 0xE0; // Select window 0
    static final byte UC1 = (byte) 0xE1; // Select window 1
    static final byte UC2 = (byte) 0xE2; // Select window 2
    static final byte UC3 = (byte) 0xE3; // Select window 3
    static final byte UC4 = (byte) 0xE4; // Select window 4
    static final byte UC5 = (byte) 0xE5; // Select window 5
    static final byte UC6 = (byte) 0xE6; // Select window 6
    static final byte UC7 = (byte) 0xE7; // Select window 7
    static final byte UD0 = (byte) 0xE8; // Define and select window 0
    static final byte UD1 = (byte) 0xE9; // Define and select window 1
    static final byte UD2 = (byte) 0xEA; // Define and select window 2
    static final byte UD3 = (byte) 0xEB; // Define and select window 3
    static final byte UD4 = (byte) 0xEC; // Define and select window 4
    static final byte UD5 = (byte) 0xED; // Define and select window 5
    static final byte UD6 = (byte) 0xEE; // Define and select window 6
    static final byte UD7 = (byte) 0xEF; // Define and select window 7

    static final byte UQU = (byte) 0xF0; // Quote a single Unicode character
    static final byte UDX = (byte) 0xF1; // Define a Window as extended
    static final byte Urs = (byte) 0xF2; // reserved

    /**
     * Constant offsets for the 8 static windows.
     */
    static final int[] staticOffset = {
        0x0000, // ASCII for quoted tags
        0x0080, // Latin - 1 Supplement (for access to punctuation)
        0x0100, // Latin Extended-A
        0x0300, // Combining Diacritical Marks
        0x2000, // General Punctuation
        0x2080, // Currency Symbols
        0x2100, // Letterlike Symbols and Number Forms
        0x3000  // CJK Symbols and punctuation
    };

    /**
     * Initial offsets for the 8 dynamic (sliding) windows.
     */
    static final int[] initialDynamicOffset = {
        0x0080, // Latin-1
        0x00C0, // Latin Extended A   //@005 fixed from 0x0100
        0x0400, // Cyrillic
        0x0600, // Arabic
        0x0900, // Devanagari
        0x3040, // Hiragana
        0x30A0, // Katakana
        0xFF00  // Fullwidth ASCII
    };

    /**
     * Dynamic window offsets of this instance, initialized to the default
     * values. This is a private copy, so redefining a window never touches
     * {@link #initialDynamicOffset}.
     */
    int[] dynamicOffset = initialDynamicOffset.clone();

    // The following values are common to encoder and decoder.

    /**
     * Unicode code points from 3400 to E000 are not addressable by a
     * dynamic window, since in these areas no short run alphabets are
     * found. Therefore {@link #gapOffset} is added to all values from
     * {@code gapThreshold}.
     */
    static final int gapThreshold = 0x68;

    /** Offset added to values from {@link #gapThreshold}. */
    static final int gapOffset = 0xAC00;

    /** Values between {@code reservedStart} and {@link #fixedThreshold} are reserved. */
    static final int reservedStart = 0xA8;

    /**
     * Values from {@code fixedThreshold} use the table of predefined fixed
     * offsets, {@link #fixedOffset}. These values are used in defineWindow.
     */
    static final int fixedThreshold = 0xF9;

    /**
     * Table of fixed predefined offsets; each entry is annotated with the
     * byte value that indexes into it.
     */
    static final int[] fixedOffset = {
        /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
        /* 0xFA */ 0x0250, // IPA extensions
        /* 0xFB */ 0x0370, // Greek
        /* 0xFC */ 0x0530, // Armenian
        /* 0xFD */ 0x3040, // Hiragana
        /* 0xFE */ 0x30A0, // Katakana
        /* 0xFF */ 0xFF60  // Halfwidth Katakana
    };

    /** Index of the currently active dynamic window. */
    private int iWindow = 0;

    /**
     * Tells whether a character is compressible, i.e. lies outside the
     * range 0x3400..0xDFFF.
     *
     * @param ch the character to test
     * @return {@code true} if {@code ch < 0x3400} or {@code ch >= 0xE000}
     */
    public static boolean isCompressible(char ch) {
        return ch < 0x3400 || ch >= 0xE000;
    }

    /**
     * Selects the active dynamic window.
     *
     * @param iWindow index of the window to make active; stored as given,
     *                without range checking
     */
    protected void selectWindow(int iWindow) {
        this.iWindow = iWindow;
    }

    /**
     * Returns the index of the active dynamic window.
     *
     * @return the index most recently passed to {@link #selectWindow(int)},
     *         or 0 initially and after {@link #reset()}
     */
    protected int getCurrentWindow() {
        return this.iWindow;
    }

    /**
     * Restores the initial state: every dynamic window gets its initial
     * offset again and window 0 becomes the active window. Only needed to
     * bail out after an exception and restart with new input.
     */
    protected void reset() {
        System.arraycopy(initialDynamicOffset, 0, dynamicOffset, 0, dynamicOffset.length);
        this.iWindow = 0;
    }
}