add Unicode compressor

2014-07-31 13:27:40 +02:00 · 2014-07-31 13:27:40 +02:00 · f2af959037
commit f2af959037
parent 1bb906f660
7 changed files with 1384 additions and 0 deletions
--- a/app/src/main/java/org/unicode/scsu/Assert.java
+++ b/app/src/main/java/org/unicode/scsu/Assert.java
@ -0,0 +1,44 @@
+package org.unicode.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *  @version 004 Aug 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+ * Unchecked exception signalling that an internal assertion of the
+ * SCSU sample code did not hold.
+ */
+public class Assert extends RuntimeException {
+	/**
+	 * Creates an assertion failure without a detail message.
+	 */
+	public Assert() {
+	}
+
+	/**
+	 * Creates an assertion failure describing the violated condition.
+	 *
+	 * @param assertion text of the assertion that failed; it is appended to
+	 *                  the fixed prefix "Assertion failed: "
+	 */
+	public Assert(String assertion) {
+		super("Assertion failed: ".concat(String.valueOf(assertion)));
+	}
+}
--- a/app/src/main/java/org/unicode/scsu/Compress.java
+++ b/app/src/main/java/org/unicode/scsu/Compress.java
@ -0,0 +1,556 @@
+package org.unicode.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *  @version 004 Aug 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+ * This class implements a simple compression algorithm
+ */
+/*
+	Note on exception handling
+        This compressor is designed so that it can be restarted after
+        an exception. All operations advancing input and/or output cursor
+        (iIn and iOut) either complete an action, or set a state (fUnicodeMode)
+        before updating the cursors.
+*/
+public class Compress extends SCSU {
+
+	/**
+	 * counter used by positionWindow to pick the dynamic window to redefine
+	 * next (it is taken modulo 8 and incremented after each definition).
+	 * Being static, it is shared by all Compress instances.
+	 */
+	static int iNextWindow = 3;
+	/**
+	 * index in the input array of the next input character to be read
+	 */
+	private int iIn;
+	/**
+	 * index in the output array of the next output byte to be written
+	 */
+	private int iOut;
+	/**
+	 * index in the output array of the SCU tag that opened a pending Unicode
+	 * run, or -1 if no Unicode run is pending
+	 */
+	private int iSCU = -1;
+	/**
+	 * true if the next command byte is of the Uxx family
+	 */
+	private boolean fUnicodeMode = false;
+
+	/**
+	 * find a window in a table of offsets whose 128 positions contain a character
+	 * The current window is tried first and, if it fits, simply kept. Otherwise
+	 * the table is scanned from index 0 and the first window that fits is
+	 * selected.
+	 *
+	 * @param ch          - character to look up
+	 * @param offsetTable - table of window offsets
+	 * @return true if some window from the table contains the character
+	 */
+	private boolean locateWindow(int ch, int[] offsetTable) {
+		// prefer the window that is already active: nothing to select then
+		final int active = getCurrentWindow();
+		if (active != -1) {
+			final int base = offsetTable[active];
+			if (base <= ch && ch < base + 0x80) {
+				return true;
+			}
+		}
+
+		// otherwise take the first window, in table order, that fits
+		int candidate = 0;
+		while (candidate < offsetTable.length) {
+			final int base = offsetTable[candidate];
+			if (base <= ch && ch < base + 0x80) {
+				selectWindow(candidate);
+				return true;
+			}
+			candidate++;
+		}
+
+		// no window contains the character
+		return false;
+	}
+
+	/**
+	 * tells whether a character can be passed through unchanged in single
+	 * byte mode: TAB, LF, CR, or any value in the range 0x20-0x7F
+	 */
+	private boolean isAsciiCrLfOrTab(int ch) {
+		switch (ch) {
+			case 0x09: // TAB
+			case 0x0A: // LF
+			case 0x0D: // CR
+				return true;
+			default:
+				// ASCII from space up to and including 0x7F
+				return 0x20 <= ch && ch <= 0x7F;
+		}
+	}
+
+	/**
+	 * output a run of characters in single byte mode
+	 * In single byte mode pass through characters in the ASCII range, but
+	 * quote characters overlapping with compression command codes. Runs
+	 * of characters fitting the current window are output as runs of bytes
+	 * in the range 0x80-0xFF. Checks for and validates Surrogate Pairs.
+	 * Uses and updates the current input and output cursors stored in
+	 * the instance variables <i>iIn</i> and <i>iOut</i>.
+	 *
+	 * @param in  - input character array
+	 * @param out - output byte array
+	 * @return the next character to be processed (this may be an extended
+	 * character combined from a surrogate pair), or 0 if all input was used up
+	 * @throws EndOfOutputException  if the room check on the output array fails
+	 * @throws EndOfInputException   if the input ends right after a high surrogate
+	 * @throws IllegalInputException if a surrogate is not properly paired
+	 */
+	public int outputSingleByteRun(char[] in, byte[] out)
+			throws EndOfOutputException, EndOfInputException, IllegalInputException {
+		int iWin = getCurrentWindow();
+		loop:
+		while (iIn < in.length) {
+			int outlen = 0;
+			byte byte1 = 0;
+			byte byte2 = 0;
+
+			// get the input character
+			int ch = in[iIn];
+
+			int inlen = 1;
+
+			// Check input for Surrogate pair
+			if ((ch & 0xF800) == 0xD800) {
+				if ((ch & 0xFC00) == 0xDC00) {
+					// low surrogate out of order
+					throw new IllegalInputException("Unpaired low surrogate: " + iIn);
+				} else {
+					// have high surrogate now get low surrogate
+					if (iIn >= in.length - 1) {
+						// premature end of input
+						throw new EndOfInputException();
+					}
+					// get the char
+					int ch2 = in[iIn + 1];
+
+					// make sure it's a low surrogate
+					if ((ch2 & 0xFC00) != 0xDC00) {
+						// a low surrogate was required
+						throw new IllegalInputException("Unpaired high surrogate: " + (iIn + 1));
+					}
+
+					// combine the two values
+					ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
+					// ch = ch<<10 + ch2 - 0x36F0000;
+
+					inlen = 2;
+				}
+			}
+
+			// ASCII Letter, NUL, CR, LF and TAB are always passed through
+			if (isAsciiCrLfOrTab(ch) || ch == 0) {
+				// pass through directly
+				byte2 = (byte) (ch & 0x7F);
+				outlen = 1;
+			}
+
+			// All other control codes must be quoted
+			else if (ch < 0x20) {
+				byte1 = SQ0;
+				byte2 = (byte) (ch);
+				outlen = 2;
+			}
+
+			// Letters that fit the current dynamic window
+			else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) {
+				ch -= dynamicOffset[iWin];
+				byte2 = (byte) (ch | 0x80);
+				outlen = 1;
+			}
+
+			// check for room in the output array
+			// (the test is made even when outlen is 0, and it also fails when
+			// the bytes would exactly fill the array)
+			if (iOut + outlen >= out.length) {
+				throw new EndOfOutputException();
+			}
+
+			switch (outlen) {
+				default:
+					// need to use some other compression mode for this
+					// character so we terminate this loop
+
+					return ch; // input not finished
+
+				// output the characters
+				case 2:
+					out[iOut++] = byte1;
+					// fall through
+				case 1:
+					out[iOut++] = byte2;
+					break;
+			}
+			// advance input pointer
+			iIn += inlen;
+		}
+		return 0; // input all used up
+	}
+
+	/**
+	 * emit one character as a quoted sequence in single byte mode
+	 * A quote (aka 'non-locking shift') is a command byte SQn for the current
+	 * window n followed by one data byte. It gives efficient access to
+	 * characters that occur in isolation--usually punctuation characters.
+	 * A character from the dynamic window is written as 0x80-0xFF, a
+	 * character from the static window as 0x00-0x7F. The dynamic window
+	 * is tried first. The input cursor <i>iIn</i> is advanced by one.
+	 *
+	 * @param ch  - character to be quoted
+	 * @param out - output byte array
+	 * @throws EndOfOutputException if iOut >= out.length - 2
+	 */
+	private void quoteSingleByte(int ch, byte[] out)
+			throws EndOfOutputException {
+		final int window = getCurrentWindow();
+
+		// make sure there is room before writing anything
+		if (iOut >= out.length - 2) {
+			throw new EndOfOutputException();
+		}
+
+		// command byte first
+		out[iOut++] = (byte) (SQ0 + window);
+
+		final int dynamicBase = dynamicOffset[window];
+		final boolean inDynamicWindow = ch >= dynamicBase && ch < dynamicBase + 0x80;
+		if (inDynamicWindow) {
+			// data byte with the high bit on
+			out[iOut++] = (byte) ((ch - dynamicBase) | 0x80);
+		} else {
+			final int staticBase = staticOffset[window];
+			if (ch < staticBase || ch >= staticBase + 0x80) {
+				throw new Assert("ch = " + ch + " not valid in quoteSingleByte. Internal Compressor Error");
+			}
+			// data byte with the high bit off
+			out[iOut++] = (byte) (ch - staticBase);
+		}
+
+		// the quoted character has been consumed
+		iIn++;
+	}
+
+	/**
+	 * output a run of characters in Unicode mode
+	 * Each character is written as two bytes, most significant byte first.
+	 * The run goes on until the input is used up or until two compressible
+	 * characters in a row are found; a single compressible character is not
+	 * worth breaking the run for. Compressible characters in the range
+	 * 0xE000-0xF2FF are preceded by the quote tag UQU to avoid overlap with the
+	 * Unicode mode compression command codes.
+	 * Uses and updates the current input and output cursors stored in
+	 * the instance variables <i>iIn</i> and <i>iOut</i>.
+	 * NOTE: Characters from surrogate pairs are passed through and unlike single
+	 * byte mode no checks are made for unpaired surrogate characters.
+	 *
+	 * @param in  - input character array
+	 * @param out - output byte array
+	 * @return the last character looked at: the one that broke the run, or the
+	 * last one written if the input was used up (0 if there was no input left)
+	 * @throws EndOfOutputException if iOut >= out.length - (bytes needed)
+	 */
+	public char outputUnicodeRun(char[] in, byte[] out)
+			throws EndOfOutputException {
+		char current = 0;
+
+		for (; iIn < in.length; iIn++) {
+			current = in[iIn];
+			boolean needsQuote = false;
+
+			if (isCompressible(current)) {
+				// two compressible characters in a row end the run
+				final boolean hasNext = iIn < in.length - 1;
+				if (hasNext && isCompressible(in[iIn + 1])) {
+					break;
+				}
+				// an isolated compressible character stays in the run, but
+				// it must be escaped if it collides with the command codes
+				needsQuote = current >= 0xE000 && current <= 0xF2FF;
+			}
+
+			// two data bytes, plus one for the quote tag if required
+			final int needed = needsQuote ? 3 : 2;
+			if (iOut >= out.length - needed) {
+				throw new EndOfOutputException();
+			}
+
+			if (needsQuote) {
+				out[iOut++] = (byte) UQU;
+			}
+			// MSB, LSB order
+			out[iOut++] = (byte) (current >>> 8);
+			out[iOut++] = (byte) (current & 0xFF);
+		}
+
+		return current;
+	}
+
+	/**
+	 * redefine a window so it surrounds a given character value
+	 * The window to be redefined is picked in turn (iNextWindow modulo 8).
+	 * The new offset is stored, and the window selected, only after the
+	 * definition command has been written: if the output array is too small
+	 * the compressor state is left untouched, so that a restart after an
+	 * EndOfOutputException cannot locate a window whose definition was never
+	 * output. Characters above 0xFFFF always produce the extended command
+	 * (SDX or UDX), whichever window is being redefined.
+	 *
+	 * @param ch           - character around which window is positioned
+	 * @param out          - output byte array
+	 * @param fUnicodeMode - true to write the Unicode mode commands (UDn, UDX),
+	 *                     false to write the single byte mode commands (SDn, SDX)
+	 * @return true if a window was successfully defined, false if no window
+	 * can be placed around the character
+	 * @throws EndOfOutputException if there is no room for the command; no
+	 *                              state has been changed in that case
+	 */
+	private boolean positionWindow(int ch, byte[] out, boolean fUnicodeMode)
+			throws IllegalInputException, EndOfOutputException {
+		int iWin = iNextWindow % 8; // windows are reused in turn
+		int iPosition = 0;
+		int iNewOffset;             // offset to store once the command is out
+		boolean fExtended = false;  // true if SDX/UDX is required
+
+		// iPosition 0 is a reserved value
+		if (ch < 0x80) {
+			throw new Assert("ch < 0x80");
+		}
+
+		// Check the fixed offsets
+		for (int i = 0; i < fixedOffset.length; i++) {
+			if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80) {
+				iPosition = i;
+				break;
+			}
+		}
+
+		if (iPosition != 0) {
+			// ch fits in a fixed offset window position
+			iNewOffset = fixedOffset[iPosition];
+			iPosition += 0xF9;
+		} else if (ch < 0x3400) {
+			// calculate a window position command and the offset
+			iPosition = ch >>> 7;
+			iNewOffset = ch & 0xFF80;
+		} else if (ch < 0xE000) {
+			// attempt to place a window where none can go
+			return false;
+		} else if (ch <= 0xFFFF) {
+			// calculate a window position command, accounting
+			// for the gap in position values, and the offset
+			iPosition = ((ch - gapOffset) >>> 7);
+			iNewOffset = ch & 0xFF80;
+		} else {
+			// if we get here, the character is in the extended range.
+			// The top 3 bits of the 16 bit position carry the window index,
+			// so the value alone cannot tell the two command forms apart
+			// (window 0 gives small values): remember the form explicitly.
+			iPosition = ((ch - 0x10000) >>> 7) | (iWin << 13);
+			iNewOffset = ch & 0x1FFF80;
+			fExtended = true;
+		}
+
+		// make sure the whole command fits before changing any state
+		int cbCommand = fExtended ? 3 : 2;
+		if (iOut > out.length - cbCommand) {
+			throw new EndOfOutputException();
+		}
+
+		if (fExtended) {
+			// Output an extended window definition command
+			out[iOut++] = (byte) (fUnicodeMode ? UDX : SDX);
+			out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF);
+			out[iOut++] = (byte) (iPosition & 0xFF);
+		} else {
+			// Output the window definition command for the general cases
+			out[iOut++] = (byte) ((fUnicodeMode ? UD0 : SD0) + iWin);
+			out[iOut++] = (byte) (iPosition & 0xFF);
+		}
+
+		// the command is out: now the state may follow
+		dynamicOffset[iWin] = iNewOffset;
+		selectWindow(iWin);
+		iNextWindow++;
+		return true;
+	}
+
+	/**
+	 * compress a Unicode character array with some simplifying assumptions
+	 * Sets the cursors <i>iIn</i> and <i>iOut</i> from the start indices, then
+	 * alternates between single byte runs and Unicode runs until the input is
+	 * used up, locating, quoting from, or defining windows in between.
+	 *
+	 * @param in        - input character array
+	 * @param iStartIn  - index in the input array at which to start reading
+	 * @param out       - output byte array
+	 * @param iStartOut - index in the output array at which to start writing
+	 * @return the number of bytes written by this call (iOut - iStartOut)
+	 */
+	public int simpleCompress(char[] in, int iStartIn, byte[] out, int iStartOut)
+			throws IllegalInputException, EndOfInputException, EndOfOutputException {
+		iIn = iStartIn;
+		iOut = iStartOut;
+
+
+		while (iIn < in.length) {
+			int ch;
+
+			// previously we switched to a Unicode run
+			if (iSCU != -1) {
+
+
+				// output characters as Unicode
+				ch = outputUnicodeRun(in, out);
+
+				// for single character Unicode runs (3 bytes) use quote
+				if (iOut - iSCU == 3) {
+					// go back and fix up the SCU to an SQU instead
+					out[iSCU] = (byte) SQU;
+					iSCU = -1;
+					continue;
+				} else {
+					iSCU = -1;
+					fUnicodeMode = true;
+				}
+			}
+			// next, try to output characters as single byte run
+			else {
+				ch = outputSingleByteRun(in, out);
+			}
+
+			// check whether we still have input
+			if (iIn == in.length) {
+				break; // no more input
+			}
+
+			// if we get here, we have a consistent value for ch, whether or
+			// not it is a regular or extended character. Locate or define a
+			// Window for the current character
+
+			// Check that we have enough room to output the command byte
+			if (iOut >= out.length - 1) {
+				throw new EndOfOutputException();
+			}
+
+			// In order to switch away from Unicode mode, it is necessary
+			// to select (or define) a window. If the characters that follow
+			// the Unicode range are ASCII characters, we can't use them
+			// to decide which window to select, since ASCII characters don't
+			// influence window settings. This loop looks ahead until it finds
+			// one compressible character that isn't in the ASCII range.
+			for (int ich = iIn; ch < 0x80; ich++) {
+				if (ich == in.length || !isCompressible(in[ich])) {
+					// if there are only ASCII characters left,
+					// fall back to the character at the input cursor
+					ch = in[iIn];
+					break;
+				}
+				ch = in[ich]; // lookahead for next non-ASCII char
+			}
+			// The character value contained in ch here will only be used to select
+			// output modes. Actual output of characters starts with in[iIn] and
+			// only takes place near the top of the loop.
+
+			int iprevWindow = getCurrentWindow();
+
+			// try to locate a dynamic window
+			if (ch < 0x80 || locateWindow(ch, dynamicOffset)) {
+				// lookahead to use SQn instead of SCn for single
+				// character interruptions of runs in current window
+				if (!fUnicodeMode && iIn < in.length - 1) {
+					char ch2 = in[iIn + 1];
+					if (ch2 >= dynamicOffset[iprevWindow] &&
+							ch2 < dynamicOffset[iprevWindow] + 0x80) {
+						quoteSingleByte(ch, out);
+						selectWindow(iprevWindow);
+						continue;
+					}
+				}
+
+				out[iOut++] = (byte) ((fUnicodeMode ? UC0 : SC0) + getCurrentWindow());
+				fUnicodeMode = false;
+			}
+			// try to locate a static window
+			else if (!fUnicodeMode && locateWindow(ch, staticOffset)) {
+				// static windows are not accessible from Unicode mode
+				quoteSingleByte(ch, out);
+				selectWindow(iprevWindow); // restore current Window settings
+				continue;
+			}
+			// try to define a window around ch
+			else if (positionWindow(ch, out, fUnicodeMode)) {
+				fUnicodeMode = false;
+			}
+			// If all else fails, start a Unicode run
+			else {
+				// remember where the SCU tag is, so that it can be turned
+				// into SQU if the run ends after a single character
+				iSCU = iOut;
+				out[iOut++] = (byte) SCU;
+				continue;
+			}
+		}
+
+		return iOut - iStartOut;
+	}
+
+	/**
+	 * compress a string into a newly allocated byte array
+	 * The output buffer starts at twice the number of input characters and is
+	 * doubled each time the compressor runs out of room; compression then
+	 * resumes at the cursors reported by charsRead() and bytesWritten().
+	 * Running out of room for output can cause non-optimal compression. In
+	 * order to not slow down compression too much, not all intermediate state
+	 * is constantly saved.
+	 *
+	 * @param inStr - string to be compressed
+	 * @return the compressed bytes, trimmed to the number of bytes written
+	 */
+	public byte[] compress(String inStr)
+			throws IllegalInputException, EndOfInputException {
+		char[] in = inStr.toCharArray();
+		byte[] out = new byte[in.length * 2];
+
+		reset();
+		// Start at the beginning of both arrays. The cursors still hold the
+		// end positions of any previous call on this instance, and starting
+		// from those would skip input and leave a gap in the output.
+		iIn = 0;
+		iOut = 0;
+
+		while (true) {
+			try {
+				simpleCompress(in, charsRead(), out, bytesWritten());
+				// if we get here things went fine.
+				break;
+			} catch (EndOfOutputException e) {
+				// continue where we stopped, with a buffer twice as large
+				out = java.util.Arrays.copyOf(out, out.length * 2);
+			}
+		}
+
+		return java.util.Arrays.copyOf(out, bytesWritten());
+	}
+
+	/**
+	 * reset is only needed to bail out after an exception and
+	 * restart with new input
+	 * Besides the mode flags, the input and output cursors are set back to 0,
+	 * so that charsRead() and bytesWritten() do not report positions that
+	 * belong to the previous input.
+	 */
+	public void reset() {
+		super.reset();
+		fUnicodeMode = false;
+		iSCU = -1;
+		iIn = 0;
+		iOut = 0;
+	}
+
+	/**
+	 * returns the output cursor, i.e. the index in the output array of the
+	 * next byte to be written
+	 */
+	public int bytesWritten() {
+		return iOut;
+	}
+
+	/**
+	 * returns the input cursor, i.e. the index in the input array of the
+	 * next character to be read
+	 */
+	public int charsRead() {
+		return iIn;
+	}
+
+}
--- a/app/src/main/java/org/unicode/scsu/EndOfInputException.java
+++ b/app/src/main/java/org/unicode/scsu/EndOfInputException.java
@ -0,0 +1,45 @@
+package org.unicode.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *  @version 004 Aug 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+ * Signals that the input string or input byte array ran out before a
+ * complete unit could be read.
+ */
+public class EndOfInputException extends Exception {
+	/**
+	 * Creates the exception with a caller supplied detail message.
+	 *
+	 * @param s the detail message
+	 */
+	public EndOfInputException(String s) {
+		super(s);
+	}
+
+	/**
+	 * Creates the exception with the standard detail message.
+	 */
+	public EndOfInputException() {
+		this("The input string or input byte array ended prematurely");
+	}
+}
--- a/app/src/main/java/org/unicode/scsu/EndOfOutputException.java
+++ b/app/src/main/java/org/unicode/scsu/EndOfOutputException.java
@ -0,0 +1,46 @@
+package org.unicode.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+ * The output array has no room left for the data still to be written.
+ * The compressor throws this when its room check on the output array fails.
+ */
+public class EndOfOutputException
+		extends java.lang.Exception
+
+{
+	/**
+	 * Creates the exception with the standard detail message.
+	 */
+	public EndOfOutputException() {
+		super("The output array has no room left for further output");
+	}
+
+	/**
+	 * Creates the exception with a caller supplied detail message.
+	 *
+	 * @param s the detail message
+	 */
+	public EndOfOutputException(String s) {
+		super(s);
+	}
+}
--- a/app/src/main/java/org/unicode/scsu/Expand.java
+++ b/app/src/main/java/org/unicode/scsu/Expand.java
@ -0,0 +1,388 @@
+package org.unicode.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *  @version 004 Aug 25 1997
+ *  @version 005 Sep 30 1998  
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+ * Reference decoder for the Standard Compression Scheme for Unicode (SCSU)
+ * <p/>
+ * <H2>Notes on the Java implementation</H2>
+ * <p/>
+ * A limitation of Java is the exclusive use of a signed byte data type.
+ * The following work arounds are required:
+ * <p/>
+ * Copying a byte to an integer variable and adding 256 for 'negative'
+ * bytes gives an integer in the range 0-255.
+ * <p/>
+ * Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
+ * char values is unsigned.
+ * <p/>
+ * Extended characters require an int to store them. The sign is not an
+ * issue because only 1024*1024 + 65536 extended characters exist.
+ */
+public class Expand extends SCSU {
+	/**
+	 * number of chars appended to the string buffer so far; expandSingleByte
+	 * sets it to 0 when it starts and uses it as the final buffer length
+	 */
+	protected int iOut = 0;
+	/**
+	 * input cursor; expandSingleByte stores the index it reached here once
+	 * all input has been used up
+	 */
+	protected int iIn = 0;
+
+	/**
+	 * combine two bytes into one char
+	 * Bytes are signed in Java, so each one is masked to its unsigned value
+	 * 0-255 before the two are put together.
+	 *
+	 * @param hi most significant byte
+	 * @param lo least significant byte
+	 * @return the character with hi in bits 8-15 and lo in bits 0-7
+	 */
+	public static char charFromTwoBytes(byte hi, byte lo) {
+		return (char) (((hi & 0xFF) << 8) | (lo & 0xFF));
+	}
+
+	/**
+	 * (re-)define a dynamic window and make it the active one
+	 * A sliding window cannot start at an arbitrary Unicode value, so the
+	 * command does not carry an absolute offset but an index which selects
+	 * among the possible starting values.
+	 * <p/>
+	 * Most scripts in Unicode start on or near a half-block boundary,
+	 * so by default the index is multiplied by 0x80. Han, Hangul, Surrogates
+	 * and other scripts between 0x3400 and 0xDFFF show very poor locality,
+	 * therefore no sliding window can be set there: for index values at or
+	 * above gapThreshold, gapOffset is added to skip that region, and only
+	 * 167 index values in total are required to select all eligible
+	 * half-blocks.
+	 * <p/>
+	 * A few scripts straddle half-block boundaries. For them a table of
+	 * fixed offsets is used, and the index values from 0xF9 to 0xFF select
+	 * these special offsets.
+	 * <p/>
+	 * Index 0 and the index values from reservedStart up to fixedThreshold
+	 * are reserved and rejected.
+	 * <p/>
+	 * Recall that all windows are of the same length (128 code positions).
+	 *
+	 * @param iWindow - index of the window to be (re-)defined
+	 * @param bOffset - index for the new offset value
+	 * @throws IllegalInputException if the index is a reserved value
+	 */
+	// @005 protected <-- private here and elsewhere
+	protected void defineWindow(int iWindow, byte bOffset)
+			throws IllegalInputException {
+		// the byte as an unsigned value 0..255
+		final int index = bOffset & 0xFF;
+
+		// 0 is a reserved value
+		if (index == 0) {
+			throw new IllegalInputException();
+		}
+
+		final int offset;
+		if (index < gapThreshold) {
+			// below the gap: plain half-block
+			offset = index << 7;
+		} else if (index < reservedStart) {
+			// above the gap
+			offset = (index << 7) + gapOffset;
+		} else if (index < fixedThreshold) {
+			// more reserved values
+			throw new IllegalInputException("iOffset == " + index);
+		} else {
+			// one of the special offsets
+			offset = fixedOffset[index - fixedThreshold];
+		}
+		dynamicOffset[iWindow] = offset;
+
+		// the redefined window becomes the active one
+		selectWindow(iWindow);
+	}
+
+	/**
+	 * (re-)define a window as an extended dynamic window and make it active
+	 * The surrogate area in Unicode allows access to 2**20 codes beyond the
+	 * first 64K codes by combining one of 1024 characters from the High
+	 * Surrogate Area with one of 1024 characters from the Low Surrogate
+	 * Area (see Unicode 2.0 for the details).
+	 * <p/>
+	 * The tags SDX and UDX set the window such that each subsequent byte in
+	 * the range 80 to FF represents a surrogate pair. The following diagram
+	 * shows how the bits in the two bytes following the SDX or UDX, and a
+	 * subsequent data byte, map onto the bits in the resulting surrogate pair.
+	 * <p/>
+	 * hbyte         lbyte          data
+	 * nnnwwwww      zzzzzyyy      1xxxxxxx
+	 * <p/>
+	 * high-surrogate     low-surrogate
+	 * 110110wwwwwzzzzz   110111yyyxxxxxxx
+	 *
+	 * @param chOffset - the two bytes following the tag. The top 3 bits select
+	 *                 the window, which saves command codes; the bottom 13 bits,
+	 *                 shifted left by 7 and added to 0x10000, give the window
+	 *                 offset.
+	 */
+	protected void defineExtendedWindow(char chOffset) {
+		// nnn: the window index
+		final int window = chOffset >>> 13;
+		// wwwwwzzzzzyyy: the position of the window above 0x10000
+		final int position = chOffset & 0x1FFF;
+
+		dynamicOffset[window] = (1 << 16) + (position << 7);
+
+		// the redefined window becomes the active one
+		selectWindow(window);
+	}
+
+	/**
+	 * expand input that is in Unicode mode
+	 * Characters are read as pairs of bytes, most significant byte first,
+	 * until a command that leaves Unicode mode is found (UCn, UDn, UDX) or
+	 * the input is used up. UQU quotes the following pair of bytes.
+	 *
+	 * @param in   input byte array to be expanded
+	 * @param iCur starting index
+	 * @param sb   string buffer to which to append expanded input
+	 * @return the index for the last byte processed, or in.length if the
+	 * input was used up while still in Unicode mode
+	 * @throws EndOfInputException if the input ends in the middle of a
+	 *                             character or of a command
+	 */
+	protected int expandUnicode(byte[] in, int iCur, StringBuffer sb)
+			throws IllegalInputException, EndOfInputException {
+		for (; iCur < in.length - 1; iCur += 2) // step by 2:
+		{
+			byte b = in[iCur];
+
+			if (b >= UC0 && b <= UC7) {
+				selectWindow(b - UC0);
+				return iCur;
+			} else if (b >= UD0 && b <= UD7) {
+				defineWindow(b - UD0, in[iCur + 1]);
+				return iCur + 1;
+			} else if (b == UDX) {
+				if (iCur >= in.length - 2) {
+					break; // buffer error
+				}
+				defineExtendedWindow(charFromTwoBytes(in[iCur + 1], in[iCur + 2]));
+				return iCur + 2;
+			} else if (b == UQU) {
+				if (iCur >= in.length - 2) {
+					break; // error
+				}
+				// Skip command byte and output Unicode character
+				iCur++;
+			}
+
+			// output a Unicode character
+			sb.append(charFromTwoBytes(in[iCur], in[iCur + 1]));
+			iOut++;
+		}
+
+		if (iCur == in.length) {
+			return iCur;
+		}
+
+		// One byte is left over, which the loop above never looks at since it
+		// works on pairs. UCn is a complete command in a single byte, so the
+		// input may properly end with it.
+		if (iCur == in.length - 1 && in[iCur] >= UC0 && in[iCur] <= UC7) {
+			selectWindow(in[iCur] - UC0);
+			return iCur;
+		}
+
+		// Error condition
+		throw new EndOfInputException();
+	}
+
+	/**
+	 * expand input, starting in single byte mode
+	 * Resets the output count <i>iOut</i>, then processes the input byte by
+	 * byte: data bytes are mapped through the static or dynamic window,
+	 * command bytes quote characters, select or define windows, or switch to
+	 * Unicode mode (handled by expandUnicode).
+	 *
+	 * @param in input byte array to be expanded
+	 * @return the expanded string
+	 * @throws IllegalInputException if a reserved command or value is found
+	 * @throws EndOfInputException   if the input ends in the middle of a
+	 *                               command, i.e. before all the bytes that
+	 *                               belong to the command have been seen
+	 */
+	protected String expandSingleByte(byte[] in)
+			throws IllegalInputException, EndOfInputException {
+
+		/* Allocate the output buffer. Because of control codes, generally
+		each byte of input results in fewer than one character of
+		output. Using in.length as an initial allocation length should avoid
+		the need to reallocate in mid-stream. The exception to this rule are
+		surrogates. */
+		StringBuffer sb = new StringBuffer(in.length);
+		iOut = 0;
+
+		// Loop until all input is exhausted or an error occurred.
+		// Every "break Loop" below is taken with iCur still inside the input,
+		// which is what tells an error from normal completion after the loop.
+		int iCur;
+		Loop:
+		for (iCur = 0; iCur < in.length; iCur++) {
+			// DEBUG Debug.out("Expanding: ", iCur);
+
+			// Default behaviour is that ASCII characters are passed through
+			// (staticOffset[0] == 0) and characters with the high bit on are
+			// offset by the current dynamic (or sliding) window (this.iWindow)
+			int iStaticWindow = 0;
+			int iDynamicWindow = getCurrentWindow();
+
+			switch (in[iCur]) {
+				// Quote from a static Window
+				case SQ0:
+				case SQ1:
+				case SQ2:
+				case SQ3:
+				case SQ4:
+				case SQ5:
+				case SQ6:
+				case SQ7:
+					// skip the command byte and check for length
+					if (iCur >= in.length - 1) {
+						break Loop;  // buffer length error
+					}
+					// Select window pair to quote from
+					iDynamicWindow = iStaticWindow = in[iCur] - SQ0;
+					iCur++;
+
+					// FALL THROUGH
+
+				default:
+					// output as character
+					if (in[iCur] >= 0) {
+						// use static window
+						int ch = in[iCur] + staticOffset[iStaticWindow];
+						sb.append((char) ch);
+						iOut++;
+					} else {
+						// use dynamic window
+						int ch = (in[iCur] + 256); // adjust for signed bytes
+						ch -= 0x80;                // reduce to range 00..7F
+						ch += dynamicOffset[iDynamicWindow];
+
+						//DEBUG
+
+						if (ch < 1 << 16) {
+							// in Unicode range, output directly
+							sb.append((char) ch);
+							iOut++;
+						} else {
+							// this is an extension character
+
+							// compute and append the two surrogates:
+							// translate from 10000..10FFFF to 0..FFFFF
+							ch -= 0x10000;
+
+							// high surrogate = top 10 bits added to D800
+							sb.append((char) (0xD800 + (ch >> 10)));
+							iOut++;
+
+							// low surrogate = bottom 10 bits added to DC00
+							sb.append((char) (0xDC00 + (ch & ~0xFC00)));
+							iOut++;
+						}
+					}
+					break;
+
+				// define a dynamic window as extended
+				case SDX:
+					// check for length before moving the cursor
+					if (iCur + 2 >= in.length) {
+						break Loop;  // buffer length error
+					}
+					iCur += 2;
+					defineExtendedWindow(charFromTwoBytes(in[iCur - 1], in[iCur]));
+					break;
+
+				// Position a dynamic Window
+				case SD0:
+				case SD1:
+				case SD2:
+				case SD3:
+				case SD4:
+				case SD5:
+				case SD6:
+				case SD7:
+					// check for length before moving the cursor
+					if (iCur + 1 >= in.length) {
+						break Loop;  // buffer length error
+					}
+					iCur++;
+					defineWindow(in[iCur - 1] - SD0, in[iCur]);
+					break;
+
+				// Select a new dynamic Window
+				case SC0:
+				case SC1:
+				case SC2:
+				case SC3:
+				case SC4:
+				case SC5:
+				case SC6:
+				case SC7:
+					selectWindow(in[iCur] - SC0);
+					break;
+				case SCU:
+					// switch to Unicode mode and continue parsing
+					iCur = expandUnicode(in, iCur + 1, sb);
+					// DEBUG Debug.out("Expanded Unicode range until: ", iCur);
+					break;
+
+				case SQU:
+					// directly extract one Unicode character;
+					// check for length before moving the cursor
+					if (iCur + 2 >= in.length) {
+						break Loop;  // buffer length error
+					}
+					iCur += 2;
+					sb.append(charFromTwoBytes(in[iCur - 1], in[iCur]));
+					iOut++;
+					break;
+
+				case Srs:
+					throw new IllegalInputException();
+					// break;
+			}
+		}
+
+		if (iCur >= in.length) {
+			//SUCCESS: all input used up
+			sb.setLength(iOut);
+			iIn = iCur;
+			return sb.toString();
+		}
+
+		//ERROR: premature end of input
+		throw new EndOfInputException();
+	}
+
+	/**
+	 * expand a byte array containing compressed Unicode
+	 * Expansion starts in single byte mode.
+	 *
+	 * @param in compressed input
+	 * @return the expanded string
+	 */
+	public String expand(byte[] in)
+			throws IllegalInputException, EndOfInputException {
+		return expandSingleByte(in);
+	}
+
+
+	/**
+	 * prepare this instance for new input, so that no new instance has
+	 * to be created: both cursors go back to 0, then the state kept by
+	 * the superclass is reset
+	 */
+	public void reset() {
+		iIn = 0;
+		iOut = 0;
+		super.reset();
+	}
+
+	/**
+	 * returns the number of chars appended to the output so far
+	 */
+	public int charsWritten() {
+		return iOut;
+	}
+
+	/**
+	 * returns the input cursor, as stored by the last expansion that used
+	 * up all of its input
+	 */
+	public int bytesRead() {
+		return iIn;
+	}
+}
--- a/app/src/main/java/org/unicode/scsu/IllegalInputException.java
+++ b/app/src/main/java/org/unicode/scsu/IllegalInputException.java
@ -0,0 +1,45 @@
+package org.unicode.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *  @version 004 Aug 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+ * Checked exception signalling that the input character array or input
+ * byte array contained illegal sequences of bytes or characters.
+ */
+public class IllegalInputException extends Exception {
+	/** Detail message used when the caller does not supply one. */
+	private static final String DEFAULT_MESSAGE =
+			"The input character array or input byte array contained illegal sequences of bytes or characters";
+
+	/**
+	 * Creates the exception with the default detail message.
+	 */
+	public IllegalInputException() {
+		super(DEFAULT_MESSAGE);
+	}
+
+	/**
+	 * Creates the exception with a caller-supplied detail message.
+	 *
+	 * @param s the detail message
+	 */
+	public IllegalInputException(String s) {
+		super(s);
+	}
+}
--- a/app/src/main/java/org/unicode/scsu/SCSU.java
+++ b/app/src/main/java/org/unicode/scsu/SCSU.java
@ -0,0 +1,260 @@
+package org.unicode.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *  @version 004 Aug 25 1997
+ *  @version 005 Sep 30 1998
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+ * Encoding text data in Unicode often requires more storage than using
+ * an existing 8-bit character set and limited to the subset of characters
+ * actually found in the text. The Unicode Compression Algorithm reduces
+ * the necessary storage while retaining the universality of Unicode.
+ * A full description of the algorithm can be found in document
+ * http://www.unicode.org/unicode/reports/tr6.html
+ * <p/>
+ * Summary
+ * <p/>
+ * The goal of the Unicode Compression Algorithm is the ability to
+ * Express all code points in Unicode
+ * Approximate storage size for traditional character sets
+ * Work well for short strings
+ * Provide transparency for Latin-1 data
+ * Support very simple decoders
+ * Support simple as well as sophisticated encoders
+ * <p/>
+ * If needed, further compression can be achieved by layering standard
+ * file or disk-block based compression algorithms on top.
+ * <p/>
+ * <H2>Features</H2>
+ * <p/>
+ * Languages using small alphabets would contain runs of characters that
+ * are coded close together in Unicode. These runs are interrupted only
+ * by punctuation characters, which are themselves coded in proximity to
+ * each other in Unicode (usually in the ASCII range).
+ * <p/>
+ * Two basic mechanisms in the compression algorithm account for these two
+ * cases, sliding windows and static windows. A window is an area of 128
+ * consecutive characters in Unicode. In the compressed data stream, each
+ * character from a sliding window would be represented as a byte between
+ * 0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
+ * TAB) would always mean an ASCII character (or control).
+ * <p/>
+ * <H2>Notes on the Java implementation</H2>
+ * <p/>
+ * A limitation of Java is the exclusive use of a signed byte data type.
+ * The following work arounds are required:
+ * <p/>
+ * Copying a byte to an integer variable and adding 256 for 'negative'
+ * bytes gives an integer in the range 0-255.
+ * <p/>
+ * Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
+ * char values is unsigned.
+ * <p/>
+ * Extended characters require an int to store them. The sign is not an
+ * issue because only 1024*1024 + 65536 extended characters exist.
+ */
+public abstract class SCSU {
+	/** Single Byte mode command values */
+
+	/**
+	 * SQ<i>n</i> Quote from Window . <p>
+	 * If the following byte is less than 0x80, quote from
+	 * static window <i>n</i>, else quote from dynamic window <i>n</i>.
+	 */
+
+	static final byte SQ0 = 0x01; // Quote from window pair 0
+	static final byte SQ1 = 0x02; // Quote from window pair 1
+	static final byte SQ2 = 0x03; // Quote from window pair 2
+	static final byte SQ3 = 0x04; // Quote from window pair 3
+	static final byte SQ4 = 0x05; // Quote from window pair 4
+	static final byte SQ5 = 0x06; // Quote from window pair 5
+	static final byte SQ6 = 0x07; // Quote from window pair 6
+	static final byte SQ7 = 0x08; // Quote from window pair 7
+
+	static final byte SDX = 0x0B; // Define a window as extended
+	static final byte Srs = 0x0C; // reserved
+
+	static final byte SQU = 0x0E; // Quote a single Unicode character
+	static final byte SCU = 0x0F; // Change to Unicode mode
+
+	/**
+	 * SC<i>n</i> Change to Window <i>n</i>. <p>
+	 * If the following bytes are less than 0x80, interpret them
+	 * as command bytes or pass them through, else add the offset
+	 * for dynamic window <i>n</i>.
+	 */
+	static final byte SC0 = 0x10; // Select window 0
+	static final byte SC1 = 0x11; // Select window 1
+	static final byte SC2 = 0x12; // Select window 2
+	static final byte SC3 = 0x13; // Select window 3
+	static final byte SC4 = 0x14; // Select window 4
+	static final byte SC5 = 0x15; // Select window 5
+	static final byte SC6 = 0x16; // Select window 6
+	static final byte SC7 = 0x17; // Select window 7
+	static final byte SD0 = 0x18; // Define and select window 0
+	static final byte SD1 = 0x19; // Define and select window 1
+	static final byte SD2 = 0x1A; // Define and select window 2
+	static final byte SD3 = 0x1B; // Define and select window 3
+	static final byte SD4 = 0x1C; // Define and select window 4
+	static final byte SD5 = 0x1D; // Define and select window 5
+	static final byte SD6 = 0x1E; // Define and select window 6
+	static final byte SD7 = 0x1F; // Define and select window 7
+
+	static final byte UC0 = (byte) 0xE0; // Select window 0
+	static final byte UC1 = (byte) 0xE1; // Select window 1
+	static final byte UC2 = (byte) 0xE2; // Select window 2
+	static final byte UC3 = (byte) 0xE3; // Select window 3
+	static final byte UC4 = (byte) 0xE4; // Select window 4
+	static final byte UC5 = (byte) 0xE5; // Select window 5
+	static final byte UC6 = (byte) 0xE6; // Select window 6
+	static final byte UC7 = (byte) 0xE7; // Select window 7
+	static final byte UD0 = (byte) 0xE8; // Define and select window 0
+	static final byte UD1 = (byte) 0xE9; // Define and select window 1
+	static final byte UD2 = (byte) 0xEA; // Define and select window 2
+	static final byte UD3 = (byte) 0xEB; // Define and select window 3
+	static final byte UD4 = (byte) 0xEC; // Define and select window 4
+	static final byte UD5 = (byte) 0xED; // Define and select window 5
+	static final byte UD6 = (byte) 0xEE; // Define and select window 6
+	static final byte UD7 = (byte) 0xEF; // Define and select window 7
+
+	static final byte UQU = (byte) 0xF0; // Quote a single Unicode character
+	static final byte UDX = (byte) 0xF1; // Define a Window as extended
+	static final byte Urs = (byte) 0xF2; // reserved
+
+	/**
+	 * constant offsets for the 8 static windows
+	 */
+	static final int staticOffset[] =
+			{
+					0x0000, // ASCII for quoted tags
+					0x0080, // Latin - 1 Supplement (for access to punctuation)
+					0x0100, // Latin Extended-A
+					0x0300, // Combining Diacritical Marks
+					0x2000, // General Punctuation
+					0x2080, // Currency Symbols
+					0x2100, // Letterlike Symbols and Number Forms
+					0x3000  // CJK Symbols and punctuation
+			};
+
+	/**
+	 * initial offsets for the 8 dynamic (sliding) windows
+	 */
+	static final int initialDynamicOffset[] =
+			{
+					0x0080, // Latin-1
+					0x00C0, // Latin Extended A   //@005 fixed from 0x0100
+					0x0400, // Cyrillic
+					0x0600, // Arabic
+					0x0900, // Devanagari
+					0x3040, // Hiragana
+					0x30A0, // Katakana
+					0xFF00  // Fullwidth ASCII
+			};
+
+	/**
+	 * dynamic window offsets, intitialize to default values.
+	 */
+	int dynamicOffset[] =
+			{
+					initialDynamicOffset[0],
+					initialDynamicOffset[1],
+					initialDynamicOffset[2],
+					initialDynamicOffset[3],
+					initialDynamicOffset[4],
+					initialDynamicOffset[5],
+					initialDynamicOffset[6],
+					initialDynamicOffset[7]
+			};
+
+	// The following method is common to encoder and decoder
+	/**
+	 * Unicode code points from 3400 to E000 are not adressible by
+	 * dynamic window, since in these areas no short run alphabets are
+	 * found. Therefore add gapOffset to all values from gapThreshold
+	 */
+	static final int gapThreshold = 0x68;
+	static final int gapOffset = 0xAC00;
+	/* values between reservedStart and fixedThreshold are reserved */
+	static final int reservedStart = 0xA8;
+
+	/**
+	 * These values are used in defineWindow
+	 */
+	/* use table of predefined fixed offsets for values from fixedThreshold */
+	static final int fixedThreshold = 0xF9;
+	/**
+	 * Table of fixed predefined Offsets, and byte values that index into  *
+	 */
+	static final int fixedOffset[] =
+			{
+        /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
+        /* 0xFA */ 0x0250, // IPA extensions
+        /* 0xFB */ 0x0370, // Greek
+        /* 0xFC */ 0x0530, // Armenian
+        /* 0xFD */ 0x3040, // Hiragana
+        /* 0xFE */ 0x30A0, // Katakana
+        /* 0xFF */ 0xFF60  // Halfwidth Katakana
+			};
+	private int iWindow = 0;    // current active window
+
+	/**
+	 * whether a character is compressible
+	 */
+	public static boolean isCompressible(char ch) {
+		return (ch < 0x3400 || ch >= 0xE000);
+	}
+
+	/**
+	 * select the active dynamic window *
+	 */
+	protected void selectWindow(int iWindow) {
+		this.iWindow = iWindow;
+	}
+
+	/**
+	 * select the active dynamic window *
+	 */
+	protected int getCurrentWindow() {
+		return this.iWindow;
+	}
+
+	/**
+	 * reset is only needed to bail out after an exception and
+	 * restart with new input
+	 */
+	protected void reset() {
+
+		// reset the dynamic windows
+		for (int i = 0; i < dynamicOffset.length; i++) {
+			dynamicOffset[i] = initialDynamicOffset[i];
+		}
+		this.iWindow = 0;
+	}
+}