add Unicode compressor

This commit is contained in:
Harald Hoyer 2014-07-31 13:27:40 +02:00
parent 1bb906f660
commit f2af959037
7 changed files with 1384 additions and 0 deletions

View file

@ -0,0 +1,44 @@
package org.unicode.scsu;
/**
* This sample software accompanies Unicode Technical Report #6 and
* distributed as is by Unicode, Inc., subject to the following:
*
* Copyright (c) 1996-1997 Unicode, Inc.. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* without fee is hereby granted provided that this copyright notice
* appears in all copies.
*
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
* @author Asmus Freytag
*
* @version 001 Dec 25 1996
* @version 002 Jun 25 1997
* @version 003 Jul 25 1997
* @version 004 Aug 25 1997
*
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
* and are registered in some jurisdictions.
**/
/**
 * Unchecked exception raised when an internal consistency check of the
 * SCSU sample code does not hold.
 */
public class Assert extends RuntimeException {
    /**
     * Creates an assertion failure without a detail message.
     */
    public Assert() {
    }

    /**
     * Creates an assertion failure that names the violated condition.
     *
     * @param assertion description of the condition that failed; it is
     *                  appended to the prefix {@code "Assertion failed: "}
     */
    public Assert(String assertion) {
        super("Assertion failed: " + assertion);
    }
}

View file

@ -0,0 +1,556 @@
package org.unicode.scsu;
/**
* This sample software accompanies Unicode Technical Report #6 and
* distributed as is by Unicode, Inc., subject to the following:
*
* Copyright (c) 1996-1997 Unicode, Inc.. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* without fee is hereby granted provided that this copyright notice
* appears in all copies.
*
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
* @author Asmus Freytag
*
* @version 001 Dec 25 1996
* @version 002 Jun 25 1997
* @version 003 Jul 25 1997
* @version 004 Aug 25 1997
*
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
* and are registered in some jurisdictions.
**/
/**
* This class implements a simple compression algorithm
*/
/*
Note on exception handling
This compressor is designed so that it can be restarted after
an exception. All operations advancing input and/or output cursor
(iIn and iOut) either complete an action, or set a state (fUnicodeMode)
before updating the cursors.
*/
public class Compress extends SCSU {
/**
* running counter used by positionWindow to choose the dynamic window it
* redefines next (the counter is taken modulo 8 there). Being static, it is
* shared by all Compress instances.
*/
static int iNextWindow = 3;
/**
* index into the input array of the next character to be read
*/
private int iIn;
/**
* index into the output array of the next byte to be written
*/
private int iOut;
/**
* index in the output array of the SCU command that opened a pending
* Unicode run, or -1 if no such run is pending
*/
private int iSCU = -1;
/**
* true if the next command byte is of the Uxx family
*/
private boolean fUnicodeMode = false;
/**
 * Look for a window in an offset table whose 128 code positions contain
 * a character. The current window is preferred; otherwise the first
 * matching table entry is selected as the new current window.
 *
 * @param ch          - character to be located
 * @param offsetTable - table of window start offsets to search
 * @return true if a window containing the character was found
 */
private boolean locateWindow(int ch, int[] offsetTable) {
    // the active window gets first refusal so that no switch is needed
    int current = getCurrentWindow();
    if (current != -1) {
        int base = offsetTable[current];
        if (base <= ch && ch < base + 0x80) {
            return true;
        }
    }
    // otherwise scan the table front to back and select the first hit
    int candidate = 0;
    while (candidate < offsetTable.length) {
        int base = offsetTable[candidate];
        if (base <= ch && ch < base + 0x80) {
            selectWindow(candidate);
            return true;
        }
        candidate++;
    }
    // no window in the table covers the character
    return false;
}
/**
 * Tells whether a character is passed through unchanged in single byte
 * mode: the range 0x20 to 0x7F, or one of TAB, LF and CR.
 *
 * @param ch - character to test
 * @return true for 0x20..0x7F, 0x09, 0x0A and 0x0D
 */
private boolean isAsciiCrLfOrTab(int ch) {
    switch (ch) {
        case 0x09: // TAB
        case 0x0A: // LF
        case 0x0D: // CR
            return true;
        default:
            return 0x20 <= ch && ch <= 0x7F;
    }
}
/**
* output a run of characters in single byte mode
* In single byte mode pass through characters in the ASCII range, but
* quote characters overlapping with compression command codes. Runs
* of characters fitting the current window are output as runs of bytes
* in the range 0x80-0xFF. Checks for and validates Surrogate Pairs.
* Uses and updates the current input and output cursors stored in
* the instance variables <i>iIn</i> and <i>iOut</i>.
*
* @param in - input character array
* @param out - output byte array
* @return the next character to be processed, i.e. the first one that could
* not be output in single byte mode with the current window. This may be an
* extended character assembled from a surrogate pair. Returns 0 when all
* input has been consumed.
* @throws EndOfOutputException if the output array has too little room left
* @throws EndOfInputException if the input ends with a high surrogate
* @throws IllegalInputException if a surrogate is not part of a valid pair
*/
public int outputSingleByteRun(char[] in, byte[] out)
throws EndOfOutputException, EndOfInputException, IllegalInputException {
// the current window is read once; this method never changes it
int iWin = getCurrentWindow();
loop:
while (iIn < in.length) {
// outlen stays 0 if the character cannot be output in this mode
int outlen = 0;
byte byte1 = 0;
byte byte2 = 0;
// get the input character
int ch = in[iIn];
// number of input chars consumed by this character (2 for a pair)
int inlen = 1;
// Check input for Surrogate pair
if ((ch & 0xF800) == 0xD800) {
if ((ch & 0xFC00) == 0xDC00) {
// low surrogate out of order
throw new IllegalInputException("Unpaired low surrogate: " + iIn);
} else {
// have high surrogate, now get low surrogate
if (iIn >= in.length - 1) {
// premature end of input
throw new EndOfInputException();
}
// get the char
int ch2 = in[iIn + 1];
// make sure it's a low surrogate
if ((ch2 & 0xFC00) != 0xDC00) {
// a low surrogate was required
throw new IllegalInputException("Unpaired high surrogate: " + (iIn + 1));
}
// combine the two values into a code point of 0x10000 or above
ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
// ch = ch<<10 + ch2 - 0x36F0000;
inlen = 2;
}
}
// ASCII Letter, NUL, CR, LF and TAB are always passed through
if (isAsciiCrLfOrTab(ch) || ch == 0) {
// pass through directly
byte2 = (byte) (ch & 0x7F);
outlen = 1;
}
// All other control codes must be quoted
else if (ch < 0x20) {
byte1 = SQ0;
byte2 = (byte) (ch);
outlen = 2;
}
// Letters that fit the current dynamic window
else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) {
ch -= dynamicOffset[iWin];
byte2 = (byte) (ch | 0x80);
outlen = 1;
}
// check for room in the output array
// (note: this test also fires when the bytes would exactly fill the
// array, and when outlen is 0 and the array is already full)
if (iOut + outlen >= out.length) {
throw new EndOfOutputException();
}
switch (outlen) {
default:
// need to use some other compression mode for this
// character so we terminate this loop
return ch; // input not finished
// output the characters
case 2:
out[iOut++] = byte1;
// fall through
case 1:
out[iOut++] = byte2;
break;
}
// advance input pointer only after the output has been written
iIn += inlen;
}
return 0; // input all used up
}
/**
 * Emit one character as a quoted ('non-locking shift') sequence in single
 * byte mode: an SQn command for the current window followed by one data
 * byte. A character inside the window's dynamic range is written as
 * 0x80-0xFF; a character inside the window's static range is written as
 * 0x00-0x7F. Advances the input cursor by one on success.
 *
 * @param ch  - character to be quoted
 * @param out - output byte array
 * @throws EndOfOutputException if the output array has too little room left
 */
private void quoteSingleByte(int ch, byte[] out)
        throws EndOfOutputException {
    final int window = getCurrentWindow();
    // make sure the command byte and the data byte both fit
    if (out.length - 2 <= iOut) {
        throw new EndOfOutputException();
    }
    // command byte first ...
    out[iOut++] = (byte) (SQ0 + window);
    // ... then the data byte, relative to whichever range holds ch
    final int dynamicBase = dynamicOffset[window];
    if (dynamicBase <= ch && ch < dynamicBase + 0x80) {
        out[iOut++] = (byte) ((ch - dynamicBase) | 0x80);
    } else {
        final int staticBase = staticOffset[window];
        if (ch < staticBase || ch >= staticBase + 0x80) {
            throw new Assert("ch = " + ch + " not valid in quoteSingleByte. Internal Compressor Error");
        }
        out[iOut++] = (byte) (ch - staticBase);
    }
    // the quoted character has been consumed
    iIn++;
}
/**
 * Output a run of characters in Unicode mode, two bytes per character in
 * MSB, LSB order. The run continues over non-compressible characters and
 * over isolated compressible ones; it stops in front of the first
 * compressible character that is directly followed by another
 * compressible character. Compressible characters in the range
 * 0xE000-0xF2FF are preceded by a UQU quote so they cannot be mistaken for
 * Unicode mode command codes.
 * Uses and updates the cursors <i>iIn</i> and <i>iOut</i>.
 * NOTE: surrogates are passed through unchecked, unlike in single byte
 * mode.
 *
 * @param in  - input character array
 * @param out - output byte array
 * @return the character at which the run stopped, or the last character
 * written if the input was used up (0 if there was no input left at all)
 * @throws EndOfOutputException if the output array has too little room left
 */
public char outputUnicodeRun(char[] in, byte[] out)
        throws EndOfOutputException {
    char current = 0;
    for (; iIn < in.length; iIn++) {
        current = in[iIn];
        boolean needsQuote = false;
        if (isCompressible(current)) {
            // two compressible characters in a row end the run
            boolean hasNext = iIn < in.length - 1;
            if (hasNext && isCompressible(in[iIn + 1])) {
                break;
            }
            // a lone compressible character stays in the run, but it has
            // to be escaped if it collides with the command code range
            needsQuote = 0xE000 <= current && current <= 0xF2FF;
        }
        final int needed = needsQuote ? 3 : 2;
        if (iOut >= out.length - needed) {
            // ran out of space in the output array
            throw new EndOfOutputException();
        }
        if (needsQuote) {
            out[iOut++] = (byte) UQU;
        }
        out[iOut++] = (byte) (current >>> 8);
        out[iOut++] = (byte) (current & 0xFF);
    }
    return current;
}
/**
 * Redefine a dynamic window so that it surrounds a given character, emit
 * the matching window definition command and select the window.
 * The window to be redefined is chosen round-robin from the static
 * counter <i>iNextWindow</i> (modulo 8).
 * <p/>
 * No state is changed unless the complete command fits into the output
 * array, so the compressor can be restarted with a larger array after an
 * EndOfOutputException. Characters above 0xFFFF always produce the
 * three-byte extended definition (SDX/UDX), whatever the numeric value of
 * the encoded position.
 *
 * @param ch           - character around which the window is positioned
 * @param out          - output byte array
 * @param fUnicodeMode - true to emit the Unicode mode command (UDn/UDX),
 *                     false to emit the single byte mode command (SDn/SDX)
 * @return true if a window was defined, false if ch lies in the range
 * 0x3400-0xDFFF where no window can be placed
 * @throws EndOfOutputException if the command does not fit in the output array
 */
private boolean positionWindow(int ch, byte[] out, boolean fUnicodeMode)
        throws IllegalInputException, EndOfOutputException {
    // ASCII never needs a window; reaching this is an internal error
    if (ch < 0x80) {
        throw new Assert("ch < 0x80");
    }
    final int iWin = iNextWindow % 8; // simple round-robin replacement
    // extended characters need the SDX/UDX form of the command
    final boolean fExtended = ch > 0xFFFF;
    int iPosition = 0;
    int newOffset;

    // Check the fixed offsets.
    // NOTE: a hit on table index 0 is indistinguishable from "no hit"
    // here and is served by the generic half-block case below; this
    // long-standing behaviour is kept as is.
    for (int i = 0; i < fixedOffset.length; i++) {
        if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80) {
            iPosition = i;
            break;
        }
    }
    if (iPosition != 0) {
        // ch fits in a fixed offset window position
        newOffset = fixedOffset[iPosition];
        iPosition += 0xF9;
    } else if (ch < 0x3400) {
        // window starts on the half-block boundary below ch
        iPosition = ch >>> 7;
        newOffset = ch & 0xFF80;
    } else if (ch < 0xE000) {
        // attempt to place a window where none can go
        return false;
    } else if (!fExtended) {
        // account for the gap in position values
        iPosition = (ch - gapOffset) >>> 7;
        newOffset = ch & 0xFF80;
    } else {
        // extended range: 13 bits of position, window index in the
        // top three bits of the 16 bit argument
        iPosition = ((ch - 0x10000) >>> 7) | (iWin << 13);
        newOffset = ch & 0x1FFF80;
    }

    // Check for room BEFORE touching any state: if the window offset were
    // changed first and the exception thrown afterwards, a restarted
    // compressor would find the window already in place and select it
    // without the decoder ever having seen its definition.
    final int needed = fExtended ? 3 : 2;
    if (iOut > out.length - needed) {
        throw new EndOfOutputException();
    }

    dynamicOffset[iWin] = newOffset;
    if (fExtended) {
        // extended window definition command
        out[iOut++] = (byte) (fUnicodeMode ? UDX : SDX);
        out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF);
        out[iOut++] = (byte) (iPosition & 0xFF);
    } else {
        // window definition command for the general cases
        out[iOut++] = (byte) ((fUnicodeMode ? UD0 : SD0) + iWin);
        out[iOut++] = (byte) (iPosition & 0xFF);
    }
    selectWindow(iWin);
    iNextWindow++;
    return true;
}
/**
* compress a Unicode character array with some simplifying assumptions
* Alternates between single byte runs and Unicode runs, selecting, quoting
* from or (re-)defining windows in between as needed. The mode state
* (fUnicodeMode, iSCU) is kept in instance variables, so the method can be
* called again with the cursor values reported by charsRead() and
* bytesWritten() after an EndOfOutputException.
*
* @param in - input character array
* @param iStartIn - index of the first input character to compress
* @param out - output byte array
* @param iStartOut - index at which the first output byte is written
* @return the number of bytes written by this call (iOut - iStartOut)
* @throws IllegalInputException if the input contains an unpaired surrogate
* @throws EndOfInputException if the input ends with a high surrogate
* @throws EndOfOutputException if the output array has too little room left
*/
public int simpleCompress(char[] in, int iStartIn, byte[] out, int iStartOut)
throws IllegalInputException, EndOfInputException, EndOfOutputException {
iIn = iStartIn;
iOut = iStartOut;
while (iIn < in.length) {
int ch;
// previously we switched to a Unicode run
if (iSCU != -1) {
// output characters as Unicode
ch = outputUnicodeRun(in, out);
// for single character Unicode runs (3 bytes) use quote
if (iOut - iSCU == 3) {
// go back and fix up the SCU to an SQU instead
out[iSCU] = (byte) SQU;
iSCU = -1;
continue;
} else {
// a real Unicode run: from here on Uxx commands are required
iSCU = -1;
fUnicodeMode = true;
}
}
// next, try to output characters as single byte run
else {
ch = outputSingleByteRun(in, out);
}
// check whether we still have input
if (iIn == in.length) {
break; // no more input
}
// if we get here, we have a consistent value for ch, whether or
// not it is a regular or extended character. Locate or define a
// Window for the current character
// Check that we have enough room to output the command byte
if (iOut >= out.length - 1) {
throw new EndOfOutputException();
}
// In order to switch away from Unicode mode, it is necessary
// to select (or define) a window. If the characters that follow
// the Unicode range are ASCII characters, we can't use them
// to decide which window to select, since ASCII characters don't
// influence window settings. This loop looks ahead until it finds
// one compressible character that isn't in the ASCII range.
for (int ich = iIn; ch < 0x80; ich++) {
if (ich == in.length || !isCompressible(in[ich])) {
// if there are only ASCII characters left,
// fall back to the character at the input cursor
ch = in[iIn];
break;
}
ch = in[ich]; // lookahead for next non-ASCII char
}
// The character value contained in ch here will only be used to select
// output modes. Actual output of characters starts with in[iIn] and
// only takes place near the top of the loop.
// remember the window so that a quote can restore it afterwards
int iprevWindow = getCurrentWindow();
// try to locate a dynamic window
if (ch < 0x80 || locateWindow(ch, dynamicOffset)) {
// lookahead to use SQn instead of SCn for single
// character interruptions of runs in current window
if (!fUnicodeMode && iIn < in.length - 1) {
char ch2 = in[iIn + 1];
if (ch2 >= dynamicOffset[iprevWindow] &&
ch2 < dynamicOffset[iprevWindow] + 0x80) {
quoteSingleByte(ch, out);
selectWindow(iprevWindow);
continue;
}
}
// emit the (locking) window select command for the current mode
out[iOut++] = (byte) ((fUnicodeMode ? UC0 : SC0) + getCurrentWindow());
fUnicodeMode = false;
}
// try to locate a static window
else if (!fUnicodeMode && locateWindow(ch, staticOffset)) {
// static windows are not accessible from Unicode mode
quoteSingleByte(ch, out);
selectWindow(iprevWindow); // restore current Window settings
continue;
}
// try to define a window around ch
else if (positionWindow(ch, out, fUnicodeMode)) {
fUnicodeMode = false;
}
// If all else fails, start a Unicode run
else {
// remember where the SCU went so it can be turned into an SQU
iSCU = iOut;
out[iOut++] = (byte) SCU;
continue;
}
}
return iOut - iStartOut;
}
/**
 * Compress a string into a newly allocated byte array of exactly the
 * required length. The output buffer is grown and compression resumed
 * whenever it turns out to be too small.
 * <p/>
 * Every call starts from a clean state, so the same instance can be used
 * for any number of strings.
 *
 * @param inStr - string to be compressed
 * @return the compressed bytes
 * @throws IllegalInputException if the string contains an unpaired surrogate
 * @throws EndOfInputException if the string ends with a high surrogate
 */
public byte[] compress(String inStr)
        throws IllegalInputException, EndOfInputException {
    // Running out of room for output can cause non-optimal
    // compression. In order to not slow down compression too
    // much, not all intermediate state is constantly saved.
    char[] in = inStr.toCharArray();
    byte[] out = new byte[inStr.length() * 2];
    reset();
    // Start at the beginning of both arrays. Without this the cursors
    // left behind by an earlier call would be passed to simpleCompress
    // below, skipping input and leaving garbage at the start of the output.
    iIn = 0;
    iOut = 0;
    while (true) {
        try {
            simpleCompress(in, charsRead(), out, bytesWritten());
            // if we get here things went fine.
            break;
        } catch (EndOfOutputException e) {
            // create a larger output buffer and continue where we stopped;
            // the lower bound keeps a zero length buffer from never growing
            byte[] largerOut = new byte[Math.max(2, out.length * 2)];
            System.arraycopy(out, 0, largerOut, 0, out.length);
            out = largerOut;
        }
    }
    // hand back only the bytes that were actually written
    byte[] trimmedOut = new byte[bytesWritten()];
    System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length);
    return trimmedOut;
}
/**
 * Return the compressor to its initial state: window settings (via the
 * superclass), mode flags and both cursors. Needed to bail out after an
 * exception and to restart with new input.
 */
public void reset() {
    super.reset();
    fUnicodeMode = false;
    iSCU = -1;
    // the cursors are private to this class, so nobody else can clear
    // them; stale values would make charsRead()/bytesWritten() report the
    // positions of the previous input
    iIn = 0;
    iOut = 0;
}
/**
* returns the output cursor, i.e. the index in the output array of the next
* byte to be written; this equals the number of bytes written when output
* started at index 0
*/
public int bytesWritten() {
return iOut;
}
/**
* returns the input cursor, i.e. the index in the input array of the next
* character to be read; this equals the number of chars read when input
* started at index 0
*/
public int charsRead() {
return iIn;
}
}

View file

@ -0,0 +1,45 @@
package org.unicode.scsu;
/**
* This sample software accompanies Unicode Technical Report #6 and
* distributed as is by Unicode, Inc., subject to the following:
*
* Copyright (c) 1996-1997 Unicode, Inc.. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* without fee is hereby granted provided that this copyright notice
* appears in all copies.
*
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
* @author Asmus Freytag
*
* @version 001 Dec 25 1996
* @version 002 Jun 25 1997
* @version 003 Jul 25 1997
* @version 004 Aug 25 1997
*
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
* and are registered in some jurisdictions.
**/
/**
 * Checked exception signalling that the input string or input byte array
 * ran out before a complete unit could be read.
 */
public class EndOfInputException extends Exception {
    /**
     * Creates the exception with a caller supplied detail message.
     *
     * @param s the detail message
     */
    public EndOfInputException(String s) {
        super(s);
    }

    /**
     * Creates the exception with the standard detail message.
     */
    public EndOfInputException() {
        this("The input string or input byte array ended prematurely");
    }
}

View file

@ -0,0 +1,46 @@
package org.unicode.scsu;
/**
* This sample software accompanies Unicode Technical Report #6 and
* distributed as is by Unicode, Inc., subject to the following:
*
* Copyright (c) 1996-1997 Unicode, Inc.. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* without fee is hereby granted provided that this copyright notice
* appears in all copies.
*
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
* @author Asmus Freytag
*
* @version 001 Dec 25 1996
* @version 002 Jun 25 1997
* @version 003 Jul 25 1997
*
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
* and are registered in some jurisdictions.
**/
/**
 * The output byte array has no room left for the data still to be written.
 * The compressor throws this exception so that the caller can supply a
 * larger array and continue.
 */
public class EndOfOutputException
        extends java.lang.Exception {
    /**
     * Creates the exception with the standard detail message. (The message
     * used to be a copy of the one in EndOfInputException and wrongly
     * blamed the input.)
     */
    public EndOfOutputException() {
        super("The output byte array is too small to hold the remaining output");
    }

    /**
     * Creates the exception with a caller supplied detail message.
     *
     * @param s the detail message
     */
    public EndOfOutputException(String s) {
        super(s);
    }
}

View file

@ -0,0 +1,388 @@
package org.unicode.scsu;
/**
* This sample software accompanies Unicode Technical Report #6 and
* distributed as is by Unicode, Inc., subject to the following:
*
* Copyright (c) 1996-1998 Unicode, Inc.. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* without fee is hereby granted provided that this copyright notice
* appears in all copies.
*
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
* @author Asmus Freytag
*
* @version 001 Dec 25 1996
* @version 002 Jun 25 1997
* @version 003 Jul 25 1997
* @version 004 Aug 25 1997
* @version 005 Sep 30 1998
*
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
* and are registered in some jurisdictions.
**/
/**
* Reference decoder for the Standard Compression Scheme for Unicode (SCSU)
* <p/>
* <H2>Notes on the Java implementation</H2>
* <p/>
* A limitation of Java is the exclusive use of a signed byte data type.
* The following work arounds are required:
* <p/>
* Copying a byte to an integer variable and adding 256 for 'negative'
* bytes gives an integer in the range 0-255.
* <p/>
* Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
* char values is unsigned.
* <p/>
* Extended characters require an int to store them. The sign is not an
* issue because only 1024*1024 + 65536 extended characters exist.
*/
public class Expand extends SCSU {
/**
* number of chars appended to the output string buffer so far by the
* following functions; reported by charsWritten()
*/
protected int iOut = 0;
/**
* input cursor; set by expandSingleByte once the whole input array has
* been expanded and reported by bytesRead()
*/
protected int iIn = 0;
/**
 * Combine two bytes into one char.
 * Java bytes are signed while chars are unsigned, so each byte is first
 * masked to its unsigned value 0-255.
 *
 * @param hi most significant byte
 * @param lo least significant byte
 * @return the char with hi in bits 8-15 and lo in bits 0-7
 */
public static char charFromTwoBytes(byte hi, byte lo) {
    final int upper = hi & 0xFF;
    final int lower = lo & 0xFF;
    return (char) ((upper << 8) | lower);
}
/**
 * (Re-)define a dynamic window from a one-byte offset index and select it.
 * <p/>
 * The index is interpreted as an unsigned value:
 * 0 is reserved and rejected; values below gapThreshold give the offset
 * index * 0x80; values below reservedStart give index * 0x80 + gapOffset,
 * which skips the region where no window may be placed; values below
 * fixedThreshold are reserved and rejected; all remaining values pick an
 * entry of the fixedOffset table, used for scripts that straddle
 * half-block boundaries.
 * <p/>
 * All windows have the same length of 128 code positions.
 *
 * @param iWindow - index of the window to be (re-)defined
 * @param bOffset - index for the new offset value
 * @throws IllegalInputException if bOffset is one of the reserved values
 */
// @005 protected <-- private here and elsewhere
protected void defineWindow(int iWindow, byte bOffset)
        throws IllegalInputException {
    // undo the sign of the Java byte
    final int index = bOffset & 0xFF;
    if (index == 0) {
        // 0 is a reserved value
        throw new IllegalInputException();
    }
    final int offset;
    if (index < gapThreshold) {
        offset = index << 7;
    } else if (index < reservedStart) {
        offset = (index << 7) + gapOffset;
    } else if (index < fixedThreshold) {
        // more reserved values
        throw new IllegalInputException("iOffset == " + index);
    } else {
        offset = fixedOffset[index - fixedThreshold];
    }
    dynamicOffset[iWindow] = offset;
    // make the redefined window the active one
    selectWindow(iWindow);
}
/**
 * (Re-)define a window as an extended dynamic window, i.e. one located
 * above 0xFFFF, and select it.
 * <p/>
 * The two bytes following an SDX or UDX tag, together with a subsequent
 * data byte, map onto a surrogate pair as follows:
 * <p/>
 * hbyte lbyte data
 * nnnwwwww zzzzzyyy 1xxxxxxx
 * <p/>
 * high-surrogate low-surrogate
 * 110110wwwwwzzzzz 110111yyyxxxxxxx
 *
 * @param chOffset - the two bytes combined into one char. Its top three
 * bits (nnn) are the index of the window; its bottom 13 bits, multiplied
 * by 0x80 and added to 0x10000, give the start of the window.
 */
protected void defineExtendedWindow(char chOffset) {
    // top three bits: which window to redefine
    final int windowIndex = chOffset >>> 13;
    // bottom 13 bits: which 128 character block above 0xFFFF
    final int blockIndex = chOffset & 0x1FFF;
    dynamicOffset[windowIndex] = 0x10000 + (blockIndex << 7);
    // make the redefined window the active one
    selectWindow(windowIndex);
}
/**
 * Expand input that is in Unicode mode, up to and including the command
 * that switches back to single byte mode (UCn, UDn or UDX), or up to the
 * end of the input.
 * <p/>
 * A UCn command is a single byte and is therefore accepted even when it is
 * the very last byte of the input.
 *
 * @param in   input byte array to be expanded
 * @param iCur starting index
 * @param sb   string buffer to which to append expanded input
 * @return the index of the last byte processed, or in.length if the input
 * ended while still in Unicode mode
 * @throws IllegalInputException if a window definition uses a reserved value
 * @throws EndOfInputException if the input ends inside a character or command
 */
protected int expandUnicode(byte[] in, int iCur, StringBuffer sb)
        throws IllegalInputException, EndOfInputException {
    while (iCur < in.length) {
        byte b = in[iCur];
        if (b >= UC0 && b <= UC7) {
            // one-byte command: select a window and leave Unicode mode
            selectWindow(b - UC0);
            return iCur;
        }
        if (iCur >= in.length - 1) {
            break; // everything else needs at least one more byte
        }
        if (b >= UD0 && b <= UD7) {
            defineWindow(b - UD0, in[iCur + 1]);
            return iCur + 1;
        }
        if (b == UDX) {
            if (iCur >= in.length - 2) {
                break; // buffer error
            }
            defineExtendedWindow(charFromTwoBytes(in[iCur + 1], in[iCur + 2]));
            return iCur + 2;
        }
        if (b == UQU) {
            if (iCur >= in.length - 2) {
                break; // error
            }
            // Skip command byte and output Unicode character
            iCur++;
        }
        // output a Unicode character
        sb.append(charFromTwoBytes(in[iCur], in[iCur + 1]));
        iOut++;
        iCur += 2; // step by 2
    }
    if (iCur == in.length) {
        return iCur;
    }
    // Error condition
    throw new EndOfInputException();
}
/**
 * Expand a complete byte array, starting in single byte mode.
 * Unicode mode sections are handed to expandUnicode. On success the
 * number of chars produced is available from charsWritten() and the
 * number of bytes consumed from bytesRead().
 * <p/>
 * A command whose argument bytes are missing at the end of the input is
 * reported with an EndOfInputException.
 *
 * @param in input byte array to be expanded
 * @return the expanded string
 * @throws IllegalInputException if the input contains a reserved value
 * @throws EndOfInputException if the input ends in the middle of a command
 */
protected String expandSingleByte(byte[] in)
        throws IllegalInputException, EndOfInputException {
    /* Allocate the output buffer. Because of control codes, generally
       each byte of input results in fewer than one character of
       output. Using in.length as an initial allocation length should avoid
       the need to reallocate in mid-stream. The exception to this rule are
       surrogates. */
    StringBuffer sb = new StringBuffer(in.length);
    iOut = 0;
    // Loop until all input is exhausted or an error occurred
    int iCur;
    for (iCur = 0; iCur < in.length; iCur++) {
        // Default behaviour is that ASCII characters are passed through
        // (staticOffset[0] == 0) and characters with the high bit on are
        // offset by the current dynamic (or sliding) window
        int iStaticWindow = 0;
        int iDynamicWindow = getCurrentWindow();
        switch (in[iCur]) {
            // Quote from a static Window
            case SQ0:
            case SQ1:
            case SQ2:
            case SQ3:
            case SQ4:
            case SQ5:
            case SQ6:
            case SQ7:
                // the quoted data byte must be present
                if (iCur >= in.length - 1) {
                    throw new EndOfInputException();
                }
                // Select window pair to quote from
                iDynamicWindow = iStaticWindow = in[iCur] - SQ0;
                iCur++;
                // FALL THROUGH
            default:
                // output as character
                if (in[iCur] >= 0) {
                    // use static window
                    int ch = in[iCur] + staticOffset[iStaticWindow];
                    sb.append((char) ch);
                    iOut++;
                } else {
                    // use dynamic window
                    int ch = (in[iCur] + 256); // adjust for signed bytes
                    ch -= 0x80; // reduce to range 00..7F
                    ch += dynamicOffset[iDynamicWindow];
                    if (ch < 1 << 16) {
                        // in Unicode range, output directly
                        sb.append((char) ch);
                        iOut++;
                    } else {
                        // this is an extension character
                        // compute and append the two surrogates:
                        // translate from 10000..10FFFF to 0..FFFFF
                        ch -= 0x10000;
                        // high surrogate = top 10 bits added to D800
                        sb.append((char) (0xD800 + (ch >> 10)));
                        iOut++;
                        // low surrogate = bottom 10 bits added to DC00
                        sb.append((char) (0xDC00 + (ch & 0x3FF)));
                        iOut++;
                    }
                }
                break;
            // define a dynamic window as extended
            case SDX:
                iCur += 2;
                if (iCur >= in.length) {
                    // the two argument bytes are missing
                    throw new EndOfInputException();
                }
                defineExtendedWindow(charFromTwoBytes(in[iCur - 1], in[iCur]));
                break;
            // Position a dynamic Window
            case SD0:
            case SD1:
            case SD2:
            case SD3:
            case SD4:
            case SD5:
            case SD6:
            case SD7:
                iCur++;
                if (iCur >= in.length) {
                    // the offset byte is missing
                    throw new EndOfInputException();
                }
                defineWindow(in[iCur - 1] - SD0, in[iCur]);
                break;
            // Select a new dynamic Window
            case SC0:
            case SC1:
            case SC2:
            case SC3:
            case SC4:
            case SC5:
            case SC6:
            case SC7:
                selectWindow(in[iCur] - SC0);
                break;
            case SCU:
                // switch to Unicode mode and continue parsing
                iCur = expandUnicode(in, iCur + 1, sb);
                break;
            case SQU:
                // directly extract one Unicode character
                iCur += 2;
                if (iCur >= in.length) {
                    // the two bytes of the character are missing
                    throw new EndOfInputException();
                } else {
                    char ch = charFromTwoBytes(in[iCur - 1], in[iCur]);
                    sb.append((char) ch);
                    iOut++;
                }
                break;
            case Srs:
                // reserved command value
                throw new IllegalInputException();
        }
    }
    // SUCCESS: the loop only ends once all input is used up; every
    // truncated command has already been reported above
    sb.setLength(iOut);
    // iCur can be one past in.length after a trailing Unicode mode run
    iIn = Math.min(iCur, in.length);
    return sb.toString();
}
/**
 * Expand a byte array containing compressed Unicode. Expansion starts in
 * single byte mode and is delegated to expandSingleByte.
 *
 * @param in the compressed bytes
 * @return the expanded string
 */
public String expand(byte[] in)
        throws IllegalInputException, EndOfInputException {
    return expandSingleByte(in);
}
/**
* reset is called to start with new input, w/o creating a new
* instance. It clears both cursors and then resets the superclass state.
*/
public void reset() {
iOut = 0;
iIn = 0;
super.reset();
}
/**
* returns the number of chars appended to the output so far (iOut)
*/
public int charsWritten() {
return iOut;
}
/**
* returns the input cursor (iIn), which expandSingleByte sets once it has
* expanded its whole input array
*/
public int bytesRead() {
return iIn;
}
}

View file

@ -0,0 +1,45 @@
package org.unicode.scsu;
/**
* This sample software accompanies Unicode Technical Report #6 and
* distributed as is by Unicode, Inc., subject to the following:
*
* Copyright (c) 1996-1997 Unicode, Inc.. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* without fee is hereby granted provided that this copyright notice
* appears in all copies.
*
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
* @author Asmus Freytag
*
* @version 001 Dec 25 1996
* @version 002 Jun 25 1997
* @version 003 Jul 25 1997
* @version 004 Aug 25 1997
*
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
* and are registered in some jurisdictions.
**/
/**
 * Checked exception signalling that the input character array or input
 * byte array contained illegal sequences of bytes or characters.
 */
public class IllegalInputException extends Exception {
    /**
     * Creates the exception with a caller supplied detail message.
     *
     * @param s the detail message
     */
    public IllegalInputException(String s) {
        super(s);
    }

    /**
     * Creates the exception with the standard detail message.
     */
    public IllegalInputException() {
        this("The input character array or input byte array contained illegal sequences of bytes or characters");
    }
}

View file

@ -0,0 +1,260 @@
package org.unicode.scsu;
/**
* This sample software accompanies Unicode Technical Report #6 and
* distributed as is by Unicode, Inc., subject to the following:
*
* Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* without fee is hereby granted provided that this copyright notice
* appears in all copies.
*
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
* @author Asmus Freytag
*
* @version 001 Dec 25 1996
* @version 002 Jun 25 1997
* @version 003 Jul 25 1997
* @version 004 Aug 25 1997
* @version 005 Sep 30 1998
*
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
* and are registered in some jurisdictions.
**/
/**
* Encoding text data in Unicode often requires more storage than using
* an existing 8-bit character set and limited to the subset of characters
* actually found in the text. The Unicode Compression Algorithm reduces
* the necessary storage while retaining the universality of Unicode.
* A full description of the algorithm can be found in document
* http://www.unicode.org/unicode/reports/tr6.html
* <p/>
* Summary
* <p/>
* The goal of the Unicode Compression Algorithm is the ability to
* Express all code points in Unicode
* Approximate storage size for traditional character sets
* Work well for short strings
* Provide transparency for Latin-1 data
* Support very simple decoders
* Support simple as well as sophisticated encoders
* <p/>
* If needed, further compression can be achieved by layering standard
* file or disk-block based compression algorithms on top.
* <p/>
* <H2>Features</H2>
* <p/>
* Languages using small alphabets would contain runs of characters that
* are coded close together in Unicode. These runs are interrupted only
* by punctuation characters, which are themselves coded in proximity to
* each other in Unicode (usually in the ASCII range).
* <p/>
* Two basic mechanisms in the compression algorithm account for these two
* cases, sliding windows and static windows. A window is an area of 128
* consecutive characters in Unicode. In the compressed data stream, each
* character from a sliding window would be represented as a byte between
* 0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
* TAB) would always mean an ASCII character (or control).
* <p/>
* <H2>Notes on the Java implementation</H2>
* <p/>
* A limitation of Java is the exclusive use of a signed byte data type.
* The following work arounds are required:
* <p/>
* Copying a byte to an integer variable and adding 256 for 'negative'
* bytes gives an integer in the range 0-255.
* <p/>
* Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
* char values is unsigned.
* <p/>
* Extended characters require an int to store them. The sign is not an
* issue because only 1024*1024 + 65536 extended characters exist.
*/
public abstract class SCSU {

    /* ------------------------------------------------------------------ */
    /* Single-byte mode command values                                    */
    /* ------------------------------------------------------------------ */

    /**
     * SQ<i>n</i> -- quote from window pair <i>n</i>.
     * <p>
     * When the byte that follows is below 0x80 it is quoted from static
     * window <i>n</i>; otherwise it is quoted from dynamic window <i>n</i>.
     */
    static final byte SQ0 = 0x01; // quote from window pair 0
    static final byte SQ1 = 0x02; // quote from window pair 1
    static final byte SQ2 = 0x03; // quote from window pair 2
    static final byte SQ3 = 0x04; // quote from window pair 3
    static final byte SQ4 = 0x05; // quote from window pair 4
    static final byte SQ5 = 0x06; // quote from window pair 5
    static final byte SQ6 = 0x07; // quote from window pair 6
    static final byte SQ7 = 0x08; // quote from window pair 7

    static final byte SDX = 0x0B; // define a window as extended
    static final byte Srs = 0x0C; // reserved
    static final byte SQU = 0x0E; // quote a single Unicode character
    static final byte SCU = 0x0F; // change to Unicode mode

    /**
     * SC<i>n</i> -- change to window <i>n</i>.
     * <p>
     * Subsequent bytes below 0x80 are interpreted as command bytes or
     * passed through; for all other bytes the offset of dynamic window
     * <i>n</i> is added.
     */
    static final byte SC0 = 0x10; // select window 0
    static final byte SC1 = 0x11; // select window 1
    static final byte SC2 = 0x12; // select window 2
    static final byte SC3 = 0x13; // select window 3
    static final byte SC4 = 0x14; // select window 4
    static final byte SC5 = 0x15; // select window 5
    static final byte SC6 = 0x16; // select window 6
    static final byte SC7 = 0x17; // select window 7

    static final byte SD0 = 0x18; // define and select window 0
    static final byte SD1 = 0x19; // define and select window 1
    static final byte SD2 = 0x1A; // define and select window 2
    static final byte SD3 = 0x1B; // define and select window 3
    static final byte SD4 = 0x1C; // define and select window 4
    static final byte SD5 = 0x1D; // define and select window 5
    static final byte SD6 = 0x1E; // define and select window 6
    static final byte SD7 = 0x1F; // define and select window 7

    /* ------------------------------------------------------------------ */
    /* U-prefixed command values. They lie above 0x7F, hence the explicit */
    /* narrowing casts to Java's signed byte type.                        */
    /* ------------------------------------------------------------------ */

    static final byte UC0 = (byte) 0xE0; // select window 0
    static final byte UC1 = (byte) 0xE1; // select window 1
    static final byte UC2 = (byte) 0xE2; // select window 2
    static final byte UC3 = (byte) 0xE3; // select window 3
    static final byte UC4 = (byte) 0xE4; // select window 4
    static final byte UC5 = (byte) 0xE5; // select window 5
    static final byte UC6 = (byte) 0xE6; // select window 6
    static final byte UC7 = (byte) 0xE7; // select window 7

    static final byte UD0 = (byte) 0xE8; // define and select window 0
    static final byte UD1 = (byte) 0xE9; // define and select window 1
    static final byte UD2 = (byte) 0xEA; // define and select window 2
    static final byte UD3 = (byte) 0xEB; // define and select window 3
    static final byte UD4 = (byte) 0xEC; // define and select window 4
    static final byte UD5 = (byte) 0xED; // define and select window 5
    static final byte UD6 = (byte) 0xEE; // define and select window 6
    static final byte UD7 = (byte) 0xEF; // define and select window 7

    static final byte UQU = (byte) 0xF0; // quote a single Unicode character
    static final byte UDX = (byte) 0xF1; // define a window as extended
    static final byte Urs = (byte) 0xF2; // reserved

    /* ------------------------------------------------------------------ */
    /* Window offset tables                                               */
    /* ------------------------------------------------------------------ */

    /**
     * Constant start offsets of the 8 static windows.
     */
    static final int[] staticOffset = {
        0x0000, // ASCII for quoted tags
        0x0080, // Latin-1 Supplement (for access to punctuation)
        0x0100, // Latin Extended-A
        0x0300, // Combining Diacritical Marks
        0x2000, // General Punctuation
        0x2080, // Currency Symbols
        0x2100, // Letterlike Symbols and Number Forms
        0x3000  // CJK Symbols and Punctuation
    };

    /**
     * Start offsets the 8 dynamic (sliding) windows have initially.
     */
    static final int[] initialDynamicOffset = {
        0x0080, // Latin-1
        0x00C0, // Latin Extended A   (@005: fixed from 0x0100)
        0x0400, // Cyrillic
        0x0600, // Arabic
        0x0900, // Devanagari
        0x3040, // Hiragana
        0x30A0, // Katakana
        0xFF00  // Fullwidth ASCII
    };

    /**
     * Current dynamic window offsets of this instance. Starts out as a
     * private copy of {@link #initialDynamicOffset}, so changing an entry
     * here never touches the shared table of defaults.
     */
    int[] dynamicOffset = initialDynamicOffset.clone();

    /* ------------------------------------------------------------------ */
    /* Values shared by encoder and decoder when (re)defining windows     */
    /* ------------------------------------------------------------------ */

    /**
     * Unicode code points from 3400 to E000 are not addressable by a
     * dynamic window, because no short-run alphabets are found in these
     * areas. Therefore {@link #gapOffset} is added to all values from
     * {@code gapThreshold} upwards.
     */
    static final int gapThreshold = 0x68;
    static final int gapOffset = 0xAC00;

    /** Values between {@code reservedStart} and {@link #fixedThreshold} are reserved. */
    static final int reservedStart = 0xA8;

    /**
     * Values from {@code fixedThreshold} upwards are looked up in the
     * table of predefined offsets, {@link #fixedOffset} (used when
     * defining a window).
     */
    static final int fixedThreshold = 0xF9;

    /**
     * Table of fixed, predefined window offsets. The leading comment on
     * each entry is the byte value that indexes it.
     */
    static final int[] fixedOffset = {
        /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
        /* 0xFA */ 0x0250, // IPA Extensions
        /* 0xFB */ 0x0370, // Greek
        /* 0xFC */ 0x0530, // Armenian
        /* 0xFD */ 0x3040, // Hiragana
        /* 0xFE */ 0x30A0, // Katakana
        /* 0xFF */ 0xFF60  // Halfwidth Katakana
    };

    /** Index of the currently active dynamic window. */
    private int iWindow = 0;

    /**
     * Tells whether a character is compressible, i.e. whether it lies
     * outside the range 0x3400 (inclusive) to 0xE000 (exclusive).
     *
     * @param ch the character to classify
     * @return {@code true} if {@code ch} is below 0x3400 or at/above 0xE000
     */
    public static boolean isCompressible(char ch) {
        boolean insideGap = ch >= 0x3400 && ch < 0xE000;
        return !insideGap;
    }

    /**
     * Selects the active dynamic window.
     *
     * @param iWindow index of the window to make active (stored as given)
     */
    protected void selectWindow(int iWindow) {
        this.iWindow = iWindow;
    }

    /**
     * Returns the index of the active dynamic window.
     *
     * @return the value last stored by {@link #selectWindow(int)} or
     *         {@link #reset()}; 0 for a fresh instance
     */
    protected int getCurrentWindow() {
        return iWindow;
    }

    /**
     * Restores the initial state: every dynamic window offset returns to
     * its default and window 0 becomes active again. Only needed to bail
     * out after an exception and restart with new input.
     */
    protected void reset() {
        final int windowCount = dynamicOffset.length;
        for (int w = 0; w < windowCount; w++) {
            dynamicOffset[w] = initialDynamicOffset[w];
        }
        iWindow = 0;
    }
}