add Unicode compressor
This commit is contained in:
parent
1bb906f660
commit
f2af959037
44
app/src/main/java/org/unicode/scsu/Assert.java
Normal file
44
app/src/main/java/org/unicode/scsu/Assert.java
Normal file
|
@ -0,0 +1,44 @@
|
|||
package org.unicode.scsu;
|
||||
|
||||
/**
|
||||
* This sample software accompanies Unicode Technical Report #6 and
|
||||
* distributed as is by Unicode, Inc., subject to the following:
|
||||
*
|
||||
* Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software
|
||||
* without fee is hereby granted provided that this copyright notice
|
||||
* appears in all copies.
|
||||
*
|
||||
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
|
||||
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
|
||||
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
|
||||
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
|
||||
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
|
||||
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
|
||||
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
|
||||
*
|
||||
* @author Asmus Freytag
|
||||
*
|
||||
* @version 001 Dec 25 1996
|
||||
* @version 002 Jun 25 1997
|
||||
* @version 003 Jul 25 1997
|
||||
* @version 004 Aug 25 1997
|
||||
*
|
||||
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
|
||||
* and are registered in some jurisdictions.
|
||||
**/
|
||||
|
||||
/**
|
||||
* The assertion failed
|
||||
*/
|
||||
public class Assert extends RuntimeException {

    /**
     * Creates an assertion failure that carries no detail message.
     */
    public Assert() {
        super();
    }

    /**
     * Creates an assertion failure describing the condition that did not hold.
     *
     * @param assertion text of the violated condition; it is appended to the
     *                  fixed prefix {@code "Assertion failed: "}
     */
    public Assert(String assertion) {
        super("Assertion failed: " + assertion);
    }
}
|
556
app/src/main/java/org/unicode/scsu/Compress.java
Normal file
556
app/src/main/java/org/unicode/scsu/Compress.java
Normal file
|
@ -0,0 +1,556 @@
|
|||
package org.unicode.scsu;
|
||||
|
||||
/**
|
||||
* This sample software accompanies Unicode Technical Report #6 and
|
||||
* distributed as is by Unicode, Inc., subject to the following:
|
||||
*
|
||||
* Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software
|
||||
* without fee is hereby granted provided that this copyright notice
|
||||
* appears in all copies.
|
||||
*
|
||||
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
|
||||
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
|
||||
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
|
||||
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
|
||||
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
|
||||
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
|
||||
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
|
||||
*
|
||||
* @author Asmus Freytag
|
||||
*
|
||||
* @version 001 Dec 25 1996
|
||||
* @version 002 Jun 25 1997
|
||||
* @version 003 Jul 25 1997
|
||||
* @version 004 Aug 25 1997
|
||||
*
|
||||
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
|
||||
* and are registered in some jurisdictions.
|
||||
**/
|
||||
|
||||
/**
|
||||
* This class implements a simple compression algorithm
|
||||
*/
|
||||
/*
|
||||
Note on exception handling
|
||||
This compressor is designed so that it can be restarted after
|
||||
an exception. All operations advancing input and/or output cursor
|
||||
(iIn and iOut) either complete an action, or set a state (fUnicodeMode)
|
||||
before updating the cursors.
|
||||
*/
|
||||
public class Compress extends SCSU {
|
||||
|
||||
// Round-robin counter read by positionWindow(): the window it redefines is
// iNextWindow % 8, and the counter is incremented after each successful
// definition. Declared static, so the counter is shared by all Compress instances.
static int iNextWindow = 3;
|
||||
/**
|
||||
* Index into the input array of the next character to be read.
|
||||
*/
|
||||
private int iIn;
|
||||
/**
|
||||
* Index into the output array of the next byte to be written.
|
||||
*/
|
||||
private int iOut;
|
||||
/**
|
||||
* Output index of the most recently written SCU byte while simpleCompress has not yet finalized that Unicode run (it may still patch the byte to SQU); -1 otherwise.
|
||||
*/
|
||||
private int iSCU = -1;
|
||||
/**
|
||||
* True while the output is in Unicode mode, i.e. the next command byte must be of the Uxx family.
|
||||
*/
|
||||
private boolean fUnicodeMode = false;
|
||||
|
||||
/**
|
||||
* locate a window for a character given a table of offsets
|
||||
*
|
||||
* @param ch - character
|
||||
* @param offsetTable - table of window offsets
|
||||
* @return true if the character fits a window from the table of windows
|
||||
*/
|
||||
private boolean locateWindow(int ch, int[] offsetTable) {
|
||||
// always try the current window first
|
||||
int iWin = getCurrentWindow();
|
||||
|
||||
// if the character fits the current window
|
||||
// just use the current window
|
||||
if (iWin != -1 && ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// try all windows in order
|
||||
for (iWin = 0; iWin < offsetTable.length; iWin++) {
|
||||
if (ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) {
|
||||
selectWindow(iWin);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// none found
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* returns true if the character is ASCII, but not a control other than CR, LF and TAB
|
||||
*/
|
||||
private boolean isAsciiCrLfOrTab(int ch) {
|
||||
return (ch >= 0x20 && ch <= 0x7F) // ASCII
|
||||
|| ch == 0x09 || ch == 0x0A || ch == 0x0D; // CR/LF or TAB
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* output a run of characters in single byte mode
|
||||
* In single byte mode pass through characters in the ASCII range, but
|
||||
* quote characters overlapping with compression command codes. Runs
|
||||
* of characters fitting the current window are output as runs of bytes
|
||||
* in the range 0x80-0xFF. Checks for and validates Surrogate Pairs.
|
||||
* Uses and updates the current input and output cursors stored in
|
||||
* the instance variables <i>iIn</i> and <i>iOut</i>.
|
||||
*
|
||||
* @param in - input character array
|
||||
* @param out - output byte array
|
||||
* @return the next character to be processed. This may be an extended character.
|
||||
*/
|
||||
public int outputSingleByteRun(char[] in, byte[] out)
|
||||
throws EndOfOutputException, EndOfInputException, IllegalInputException {
|
||||
int iWin = getCurrentWindow();
|
||||
loop:
|
||||
while (iIn < in.length) {
|
||||
int outlen = 0;
|
||||
byte byte1 = 0;
|
||||
byte byte2 = 0;
|
||||
|
||||
// get the input character
|
||||
int ch = in[iIn];
|
||||
|
||||
int inlen = 1;
|
||||
|
||||
// Check input for Surrogate pair
|
||||
if ((ch & 0xF800) == 0xD800) {
|
||||
if ((ch & 0xFC00) == 0xDC00) {
|
||||
// low surrogate out of order
|
||||
throw new IllegalInputException("Unpaired low surrogate: " + iIn);
|
||||
} else {
|
||||
// have high surrogate now get low surrogate
|
||||
if (iIn >= in.length - 1) {
|
||||
// premature end of input
|
||||
throw new EndOfInputException();
|
||||
}
|
||||
// get the char
|
||||
int ch2 = in[iIn + 1];
|
||||
|
||||
// make sure it's a low surrogate
|
||||
if ((ch2 & 0xFC00) != 0xDC00) {
|
||||
// a low surrogate was required
|
||||
throw new IllegalInputException("Unpaired high surrogate: " + (iIn + 1));
|
||||
}
|
||||
|
||||
// combine the two values
|
||||
ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
|
||||
// ch = ch<<10 + ch2 - 0x36F0000;
|
||||
|
||||
inlen = 2;
|
||||
}
|
||||
}
|
||||
|
||||
// ASCII Letter, NUL, CR, LF and TAB are always passed through
|
||||
if (isAsciiCrLfOrTab(ch) || ch == 0) {
|
||||
// pass through directcly
|
||||
byte2 = (byte) (ch & 0x7F);
|
||||
outlen = 1;
|
||||
}
|
||||
|
||||
// All other control codes must be quoted
|
||||
else if (ch < 0x20) {
|
||||
byte1 = SQ0;
|
||||
byte2 = (byte) (ch);
|
||||
outlen = 2;
|
||||
}
|
||||
|
||||
// Letters that fit the current dynamic window
|
||||
else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) {
|
||||
ch -= dynamicOffset[iWin];
|
||||
byte2 = (byte) (ch | 0x80);
|
||||
outlen = 1;
|
||||
}
|
||||
|
||||
// check for room in the output array
|
||||
if (iOut + outlen >= out.length) {
|
||||
throw new EndOfOutputException();
|
||||
}
|
||||
|
||||
switch (outlen) {
|
||||
default:
|
||||
// need to use some other compression mode for this
|
||||
// character so we terminate this loop
|
||||
|
||||
return ch; // input not finished
|
||||
|
||||
// output the characters
|
||||
case 2:
|
||||
out[iOut++] = byte1;
|
||||
// fall through
|
||||
case 1:
|
||||
out[iOut++] = byte2;
|
||||
break;
|
||||
}
|
||||
// advance input pointer
|
||||
iIn += inlen;
|
||||
}
|
||||
return 0; // input all used up
|
||||
}
|
||||
|
||||
/**
|
||||
* quote a single character in single byte mode
|
||||
* Quoting a character (aka 'non-locking shift') gives efficient access
|
||||
* to characters that occur in isolation--usually punctuation characters.
|
||||
* When quoting a character from a dynamic window use 0x80 - 0xFF, when
|
||||
* quoting a character from a static window use 0x00-0x7f.
|
||||
*
|
||||
* @param ch - character to be quoted
|
||||
* @param out - output byte array
|
||||
*/
|
||||
|
||||
private void quoteSingleByte(int ch, byte[] out)
|
||||
throws EndOfOutputException {
|
||||
int iWin = getCurrentWindow();
|
||||
|
||||
// check for room in the output array
|
||||
if (iOut >= out.length - 2) {
|
||||
throw new EndOfOutputException();
|
||||
}
|
||||
|
||||
// Output command byte followed by
|
||||
out[iOut++] = (byte) (SQ0 + iWin);
|
||||
|
||||
// Letter that fits the current dynamic window
|
||||
if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) {
|
||||
ch -= dynamicOffset[iWin];
|
||||
out[iOut++] = (byte) (ch | 0x80);
|
||||
}
|
||||
|
||||
// Letter that fits the current static window
|
||||
else if (ch >= staticOffset[iWin] && ch < staticOffset[iWin] + 0x80) {
|
||||
ch -= staticOffset[iWin];
|
||||
out[iOut++] = (byte) ch;
|
||||
} else {
|
||||
throw new Assert("ch = " + ch + " not valid in quoteSingleByte. Internal Compressor Error");
|
||||
}
|
||||
// advance input pointer
|
||||
iIn++;
|
||||
}
|
||||
|
||||
/**
|
||||
* output a run of characters in Unicode mode
|
||||
* A run of Unicode mode consists of characters which are all in the
|
||||
* range of non-compressible characters or isolated occurrence
|
||||
* of any other characters. Characters in the range 0xE000-0xF2FF must
|
||||
* be quoted to avoid overlap with the Unicode mode compression command codes.
|
||||
* Uses and updates the current input and output cursors stored in
|
||||
* the instance variables <i>iIn</i> and <i>iOut</i>.
|
||||
* NOTE: Characters from surrogate pairs are passed through and unlike single
|
||||
* byte mode no checks are made for unpaired surrogate characters.
|
||||
*
|
||||
* @param in - input character array
|
||||
* @param out - output byte array
|
||||
* @return the next input character to be processed
|
||||
*/
|
||||
public char outputUnicodeRun(char[] in, byte[] out)
|
||||
throws EndOfOutputException {
|
||||
// current character
|
||||
char ch = 0;
|
||||
|
||||
while (iIn < in.length) {
|
||||
// get current input and set default output length
|
||||
ch = in[iIn];
|
||||
int outlen = 2;
|
||||
|
||||
// Characters in these ranges could potentially be compressed.
|
||||
// We require 2 or more compressible characters to break the run
|
||||
if (isCompressible(ch)) {
|
||||
// check whether we can look ahead
|
||||
if (iIn < in.length - 1) {
|
||||
// DEBUG
|
||||
char ch2 = in[iIn + 1];
|
||||
if (isCompressible(ch2)) {
|
||||
// at least 2 characters are compressible
|
||||
// break the run
|
||||
break;
|
||||
}
|
||||
//DEBUG
|
||||
}
|
||||
// If we get here, the current character is only character
|
||||
// left in the input or it is followed by a non-compressible
|
||||
// character. In neither case do we gain by breaking the
|
||||
// run, so we proceed to output the character.
|
||||
if (ch >= 0xE000 && ch <= 0xF2FF) {
|
||||
// Characters in this range need to be escaped
|
||||
outlen = 3;
|
||||
}
|
||||
|
||||
}
|
||||
// check that there is enough room to output the character
|
||||
if (iOut >= out.length - outlen) {
|
||||
// DEBUG
|
||||
// if we got here, we ran out of space in the output array
|
||||
throw new EndOfOutputException();
|
||||
}
|
||||
|
||||
// output any characters that cannot be compressed,
|
||||
if (outlen == 3) {
|
||||
// output the quote character
|
||||
out[iOut++] = (byte) UQU;
|
||||
}
|
||||
// pass the Unicode character in MSB,LSB order
|
||||
out[iOut++] = (byte) (ch >>> 8);
|
||||
out[iOut++] = (byte) (ch & 0xFF);
|
||||
|
||||
// advance input cursor
|
||||
iIn++;
|
||||
}
|
||||
|
||||
// return the last character
|
||||
return ch;
|
||||
}
|
||||
|
||||
/**
|
||||
* redefine a window so it surrounds a given character value
|
||||
* The window to redefine is chosen round-robin: it is iNextWindow modulo 8,
|
||||
* and iNextWindow is incremented after each successful definition.
|
||||
*
|
||||
* @param ch - character around which window is positioned
|
||||
* @param out - output byte array
|
||||
* @return true if a window was successfully defined
|
||||
*/
|
||||
private boolean positionWindow(int ch, byte[] out, boolean fUnicodeMode)
|
||||
throws IllegalInputException, EndOfOutputException {
|
||||
int iWin = iNextWindow % 8; // simple LRU
|
||||
int iPosition = 0;
|
||||
|
||||
// iPosition 0 is a reserved value
|
||||
if (ch < 0x80) {
|
||||
throw new Assert("ch < 0x80");
|
||||
//return false;
|
||||
}
|
||||
|
||||
// Check the fixed offsets
|
||||
for (int i = 0; i < fixedOffset.length; i++) {
|
||||
if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80) {
|
||||
iPosition = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (iPosition != 0) {
|
||||
// DEBUG
|
||||
|
||||
// ch fits in a fixed offset window position
|
||||
dynamicOffset[iWin] = fixedOffset[iPosition];
|
||||
iPosition += 0xF9;
|
||||
} else if (ch < 0x3400) {
|
||||
// calculate a window position command and set the offset
|
||||
iPosition = ch >>> 7;
|
||||
dynamicOffset[iWin] = ch & 0xFF80;
|
||||
|
||||
} else if (ch < 0xE000) {
|
||||
// attempt to place a window where none can go
|
||||
return false;
|
||||
} else if (ch <= 0xFFFF) {
|
||||
// calculate a window position command, accounting
|
||||
// for the gap in position values, and set the offset
|
||||
iPosition = ((ch - gapOffset) >>> 7);
|
||||
|
||||
dynamicOffset[iWin] = ch & 0xFF80;
|
||||
|
||||
} else {
|
||||
// if we get here, the character is in the extended range.
|
||||
// Always use Window 4 to define an extended window
|
||||
|
||||
iPosition = (ch - 0x10000) >>> 7;
|
||||
// DEBUG
|
||||
|
||||
iPosition |= iWin << 13;
|
||||
dynamicOffset[iWin] = ch & 0x1FFF80;
|
||||
}
|
||||
|
||||
// Outputting window defintion command for the general cases
|
||||
if (iPosition < 0x100 && iOut < out.length - 1) {
|
||||
out[iOut++] = (byte) ((fUnicodeMode ? UD0 : SD0) + iWin);
|
||||
out[iOut++] = (byte) (iPosition & 0xFF);
|
||||
}
|
||||
// Output an extended window definiton command
|
||||
else if (iPosition >= 0x100 && iOut < out.length - 2) {
|
||||
|
||||
out[iOut++] = (byte) (fUnicodeMode ? UDX : SDX);
|
||||
out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF);
|
||||
out[iOut++] = (byte) (iPosition & 0xFF);
|
||||
} else {
|
||||
throw new EndOfOutputException();
|
||||
}
|
||||
selectWindow(iWin);
|
||||
iNextWindow++;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* compress a Unicode character array with some simplifying assumptions
|
||||
*/
|
||||
public int simpleCompress(char[] in, int iStartIn, byte[] out, int iStartOut)
|
||||
throws IllegalInputException, EndOfInputException, EndOfOutputException {
|
||||
iIn = iStartIn;
|
||||
iOut = iStartOut;
|
||||
|
||||
|
||||
while (iIn < in.length) {
|
||||
int ch;
|
||||
|
||||
// previously we switched to a Unicode run
|
||||
if (iSCU != -1) {
|
||||
|
||||
|
||||
// output characters as Unicode
|
||||
ch = outputUnicodeRun(in, out);
|
||||
|
||||
// for single character Unicode runs (3 bytes) use quote
|
||||
if (iOut - iSCU == 3) {
|
||||
// go back and fix up the SCU to an SQU instead
|
||||
out[iSCU] = (byte) SQU;
|
||||
iSCU = -1;
|
||||
continue;
|
||||
} else {
|
||||
iSCU = -1;
|
||||
fUnicodeMode = true;
|
||||
}
|
||||
}
|
||||
// next, try to output characters as single byte run
|
||||
else {
|
||||
ch = outputSingleByteRun(in, out);
|
||||
}
|
||||
|
||||
// check whether we still have input
|
||||
if (iIn == in.length) {
|
||||
break; // no more input
|
||||
}
|
||||
|
||||
// if we get here, we have a consistent value for ch, whether or
|
||||
// not it is an regular or extended character. Locate or define a
|
||||
// Window for the current character
|
||||
|
||||
// Check that we have enough room to output the command byte
|
||||
if (iOut >= out.length - 1) {
|
||||
throw new EndOfOutputException();
|
||||
}
|
||||
|
||||
// In order to switch away from Unicode mode, it is necessary
|
||||
// to select (or define) a window. If the characters that follow
|
||||
// the Unicode range are ASCII characters, we can't use them
|
||||
// to decide which window to select, since ASCII characters don't
|
||||
// influence window settings. This loop looks ahead until it finds
|
||||
// one compressible character that isn't in the ASCII range.
|
||||
for (int ich = iIn; ch < 0x80; ich++) {
|
||||
if (ich == in.length || !isCompressible(in[ich])) {
|
||||
// if there are only ASCII characters left,
|
||||
ch = in[iIn];
|
||||
break;
|
||||
}
|
||||
ch = in[ich]; // lookahead for next non-ASCII char
|
||||
}
|
||||
// The character value contained in ch here will only be used to select
|
||||
// output modes. Actual output of characters starts with in[iIn] and
|
||||
// only takes place near the top of the loop.
|
||||
|
||||
int iprevWindow = getCurrentWindow();
|
||||
|
||||
// try to locate a dynamic window
|
||||
if (ch < 0x80 || locateWindow(ch, dynamicOffset)) {
|
||||
// lookahead to use SQn instead of SCn for single
|
||||
// character interruptions of runs in current window
|
||||
if (!fUnicodeMode && iIn < in.length - 1) {
|
||||
char ch2 = in[iIn + 1];
|
||||
if (ch2 >= dynamicOffset[iprevWindow] &&
|
||||
ch2 < dynamicOffset[iprevWindow] + 0x80) {
|
||||
quoteSingleByte(ch, out);
|
||||
selectWindow(iprevWindow);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
out[iOut++] = (byte) ((fUnicodeMode ? UC0 : SC0) + getCurrentWindow());
|
||||
fUnicodeMode = false;
|
||||
}
|
||||
// try to locate a static window
|
||||
else if (!fUnicodeMode && locateWindow(ch, staticOffset)) {
|
||||
// static windows are not accessible from Unicode mode
|
||||
quoteSingleByte(ch, out);
|
||||
selectWindow(iprevWindow); // restore current Window settings
|
||||
continue;
|
||||
}
|
||||
// try to define a window around ch
|
||||
else if (positionWindow(ch, out, fUnicodeMode)) {
|
||||
fUnicodeMode = false;
|
||||
}
|
||||
// If all else fails, start a Unicode run
|
||||
else {
|
||||
iSCU = iOut;
|
||||
out[iOut++] = (byte) SCU;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
return iOut - iStartOut;
|
||||
}
|
||||
|
||||
public byte[] compress(String inStr)
|
||||
throws IllegalInputException, EndOfInputException {
|
||||
// Running out of room for output can cause non-optimal
|
||||
// compression. In order to not slow down compression too
|
||||
// much, not all intermediate state is constantly saved.
|
||||
|
||||
byte[] out = new byte[inStr.length() * 2];
|
||||
char[] in = inStr.toCharArray();
|
||||
//DEBUG
|
||||
int iLen = 0;
|
||||
reset();
|
||||
while (true) {
|
||||
try {
|
||||
simpleCompress(in, charsRead(), out, bytesWritten());
|
||||
// if we get here things went fine.
|
||||
break;
|
||||
} catch (EndOfOutputException e) {
|
||||
// create a larger output buffer and continue
|
||||
byte[] largerOut = new byte[out.length * 2];
|
||||
System.arraycopy(out, 0, largerOut, 0, out.length);
|
||||
out = largerOut;
|
||||
}
|
||||
}
|
||||
byte[] trimmedOut = new byte[bytesWritten()];
|
||||
System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length);
|
||||
out = trimmedOut;
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
/**
|
||||
* reset is only needed to bail out after an exception and
|
||||
* restart with new input
|
||||
*/
|
||||
public void reset() {
|
||||
super.reset();
|
||||
fUnicodeMode = false;
|
||||
iSCU = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* returns the number of bytes written *
|
||||
*/
|
||||
public int bytesWritten() {
|
||||
return iOut;
|
||||
}
|
||||
|
||||
/**
|
||||
* returns the number of input characters read *
|
||||
*/
|
||||
public int charsRead() {
|
||||
return iIn;
|
||||
}
|
||||
|
||||
}
|
45
app/src/main/java/org/unicode/scsu/EndOfInputException.java
Normal file
45
app/src/main/java/org/unicode/scsu/EndOfInputException.java
Normal file
|
@ -0,0 +1,45 @@
|
|||
package org.unicode.scsu;
|
||||
|
||||
/**
|
||||
* This sample software accompanies Unicode Technical Report #6 and
|
||||
* distributed as is by Unicode, Inc., subject to the following:
|
||||
*
|
||||
* Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software
|
||||
* without fee is hereby granted provided that this copyright notice
|
||||
* appears in all copies.
|
||||
*
|
||||
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
|
||||
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
|
||||
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
|
||||
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
|
||||
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
|
||||
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
|
||||
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
|
||||
*
|
||||
* @author Asmus Freytag
|
||||
*
|
||||
* @version 001 Dec 25 1996
|
||||
* @version 002 Jun 25 1997
|
||||
* @version 003 Jul 25 1997
|
||||
* @version 004 Aug 25 1997
|
||||
*
|
||||
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
|
||||
* and are registered in some jurisdictions.
|
||||
**/
|
||||
|
||||
/**
|
||||
* The input string or input byte array ended prematurely
|
||||
*/
|
||||
public class EndOfInputException extends java.lang.Exception {

    /**
     * Creates the exception with a caller-supplied detail message.
     *
     * @param s the detail message
     */
    public EndOfInputException(String s) {
        super(s);
    }

    /**
     * Creates the exception with the standard "ended prematurely" message.
     */
    public EndOfInputException() {
        this("The input string or input byte array ended prematurely");
    }
}
|
46
app/src/main/java/org/unicode/scsu/EndOfOutputException.java
Normal file
46
app/src/main/java/org/unicode/scsu/EndOfOutputException.java
Normal file
|
@ -0,0 +1,46 @@
|
|||
package org.unicode.scsu;
|
||||
|
||||
/**
|
||||
* This sample software accompanies Unicode Technical Report #6 and
|
||||
* distributed as is by Unicode, Inc., subject to the following:
|
||||
*
|
||||
* Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software
|
||||
* without fee is hereby granted provided that this copyright notice
|
||||
* appears in all copies.
|
||||
*
|
||||
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
|
||||
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
|
||||
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
|
||||
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
|
||||
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
|
||||
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
|
||||
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
|
||||
*
|
||||
* @author Asmus Freytag
|
||||
*
|
||||
* @version 001 Dec 25 1996
|
||||
* @version 002 Jun 25 1997
|
||||
* @version 003 Jul 25 1997
|
||||
*
|
||||
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
|
||||
* and are registered in some jurisdictions.
|
||||
**/
|
||||
|
||||
/**
|
||||
* The output byte array ran out of room before all input was processed
|
||||
*/
|
||||
public class EndOfOutputException extends java.lang.Exception {

    // Signals that the output array is too small for the bytes still to be
    // written. The default message used to be a copy of the one in
    // EndOfInputException and described the wrong condition.

    /**
     * Creates the exception with the standard "output is full" message.
     */
    public EndOfOutputException() {
        super("The output array ran out of room before all input was processed");
    }

    /**
     * Creates the exception with a caller-supplied detail message.
     *
     * @param s the detail message
     */
    public EndOfOutputException(String s) {
        super(s);
    }
}
|
388
app/src/main/java/org/unicode/scsu/Expand.java
Normal file
388
app/src/main/java/org/unicode/scsu/Expand.java
Normal file
|
@ -0,0 +1,388 @@
|
|||
package org.unicode.scsu;
|
||||
|
||||
/**
|
||||
* This sample software accompanies Unicode Technical Report #6 and
|
||||
* distributed as is by Unicode, Inc., subject to the following:
|
||||
*
|
||||
* Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software
|
||||
* without fee is hereby granted provided that this copyright notice
|
||||
* appears in all copies.
|
||||
*
|
||||
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
|
||||
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
|
||||
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
|
||||
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
|
||||
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
|
||||
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
|
||||
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
|
||||
*
|
||||
* @author Asmus Freytag
|
||||
*
|
||||
* @version 001 Dec 25 1996
|
||||
* @version 002 Jun 25 1997
|
||||
* @version 003 Jul 25 1997
|
||||
* @version 004 Aug 25 1997
|
||||
* @version 005 Sep 30 1998
|
||||
*
|
||||
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
|
||||
* and are registered in some jurisdictions.
|
||||
**/
|
||||
|
||||
/**
|
||||
* Reference decoder for the Standard Compression Scheme for Unicode (SCSU)
|
||||
* <p/>
|
||||
* <H2>Notes on the Java implementation</H2>
|
||||
* <p/>
|
||||
* A limitation of Java is the exclusive use of a signed byte data type.
|
||||
* The following work arounds are required:
|
||||
* <p/>
|
||||
* Copying a byte to an integer variable and adding 256 for 'negative'
|
||||
* bytes gives an integer in the range 0-255.
|
||||
* <p/>
|
||||
* Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
|
||||
* char values is unsigned.
|
||||
* <p/>
|
||||
* Extended characters require an int to store them. The sign is not an
|
||||
* issue because only 1024*1024 + 65536 extended characters exist.
|
||||
*/
|
||||
public class Expand extends SCSU {
|
||||
/**
|
||||
* count of chars appended to the output string buffer: expandUnicode increments it once per appended char and expandSingleByte resets it to 0 on entry
|
||||
*/
|
||||
protected int iOut = 0;
|
||||
/**
|
||||
* input cursor; it is not referenced by the methods visible in this part of the file (NOTE(review): confirm how the rest of the class uses it)
|
||||
*/
|
||||
protected int iIn = 0;
|
||||
|
||||
/**
|
||||
* assemble a char from two bytes
|
||||
* In Java bytes are signed quantities, while chars are unsigned
|
||||
*
|
||||
* @param hi most significant byte
|
||||
* @param lo least significant byte
|
||||
* @return the character
|
||||
*/
|
||||
public static char charFromTwoBytes(byte hi, byte lo) {
|
||||
char ch = (char) (lo >= 0 ? lo : 256 + lo);
|
||||
return (char) (ch + (char) ((hi >= 0 ? hi : 256 + hi) << 8));
|
||||
}
|
||||
|
||||
/**
|
||||
* (re-)define (and select) a dynamic window
|
||||
* A sliding window position cannot start at any Unicode value,
|
||||
* so rather than providing an absolute offset, this function takes
|
||||
* an index value which selects among the possible starting values.
|
||||
* <p/>
|
||||
* Most scripts in Unicode start on or near a half-block boundary
|
||||
* so the default behaviour is to multiply the index by 0x80. Han,
|
||||
* Hangul, Surrogates and other scripts between 0x3400 and 0xDFFF
|
||||
* show very poor locality--therefore no sliding window can be set
|
||||
* there. A jumpOffset is added to the index value to skip that region,
|
||||
* and only 167 index values total are required to select all eligible
|
||||
* half-blocks.
|
||||
* <p/>
|
||||
* Finally, a few scripts straddle half block boundaries. For them, a
|
||||
* table of fixed offsets is used, and the index values from 0xF9 to
|
||||
* 0xFF are used to select these special offsets.
|
||||
* <p/>
|
||||
* After (re-)defining a windows location it is selected so it is ready
|
||||
* for use.
|
||||
* <p/>
|
||||
* Recall that all Windows are of the same length (128 code positions).
|
||||
*
|
||||
* @param iWindow - index of the window to be (re-)defined
|
||||
* @param bOffset - index for the new offset value
|
||||
*/
|
||||
// @005 protected <-- private here and elsewhere
|
||||
protected void defineWindow(int iWindow, byte bOffset)
|
||||
throws IllegalInputException {
|
||||
int iOffset = (bOffset < 0 ? bOffset + 256 : bOffset);
|
||||
|
||||
// 0 is a reserved value
|
||||
if (iOffset == 0) {
|
||||
throw new IllegalInputException();
|
||||
} else if (iOffset < gapThreshold) {
|
||||
dynamicOffset[iWindow] = iOffset << 7;
|
||||
} else if (iOffset < reservedStart) {
|
||||
dynamicOffset[iWindow] = (iOffset << 7) + gapOffset;
|
||||
} else if (iOffset < fixedThreshold) {
|
||||
// more reserved values
|
||||
throw new IllegalInputException("iOffset == " + iOffset);
|
||||
} else {
|
||||
dynamicOffset[iWindow] = fixedOffset[iOffset - fixedThreshold];
|
||||
}
|
||||
|
||||
// make the redefined window the active one
|
||||
selectWindow(iWindow);
|
||||
}
|
||||
|
||||
/**
|
||||
* (re-)define (and select) a window as an extended dynamic window
|
||||
* The surrogate area in Unicode allows access to 2**20 codes beyond the
|
||||
* first 64K codes by combining one of 1024 characters from the High
|
||||
* Surrogate Area with one of 1024 characters from the Low Surrogate
|
||||
* Area (see Unicode 2.0 for the details).
|
||||
* <p/>
|
||||
* The tags SDX and UDX set the window such that each subsequent byte in
|
||||
* the range 80 to FF represents a surrogate pair. The following diagram
|
||||
* shows how the bits in the two bytes following the SDX or UDX, and a
|
||||
* subsequent data byte, map onto the bits in the resulting surrogate pair.
|
||||
* <p/>
|
||||
* hbyte lbyte data
|
||||
* nnnwwwww zzzzzyyy 1xxxxxxx
|
||||
* <p/>
|
||||
* high-surrogate low-surrogate
|
||||
* 110110wwwwwzzzzz 110111yyyxxxxxxx
|
||||
*
|
||||
* @param chOffset - Since the three top bits of chOffset are not needed to
|
||||
* set the location of the extended Window, they are used instead
|
||||
* to select the window, thereby reducing the number of needed command codes.
|
||||
* The bottom 13 bits of chOffset are used to calculate the offset relative to
|
||||
* a 7 bit input data byte to yield the 20 bits expressed by each surrogate pair.
|
||||
*/
|
||||
protected void defineExtendedWindow(char chOffset) {
|
||||
// The top 3 bits of iOffsetHi are the window index
|
||||
int iWindow = chOffset >>> 13;
|
||||
|
||||
// Calculate the new offset
|
||||
dynamicOffset[iWindow] = ((chOffset & 0x1FFF) << 7) + (1 << 16);
|
||||
|
||||
// make the redefined window the active one
|
||||
selectWindow(iWindow);
|
||||
}
|
||||
|
||||
/**
|
||||
* expand input that is in Unicode mode
|
||||
*
|
||||
* @param in input byte array to be expanded
|
||||
* @param iCur starting index
|
||||
* @param sb string buffer to which to append expanded input
|
||||
* @return the index for the last byte processed
|
||||
*/
|
||||
protected int expandUnicode(byte[] in, int iCur, StringBuffer sb)
|
||||
throws IllegalInputException, EndOfInputException {
|
||||
for (; iCur < in.length - 1; iCur += 2) // step by 2:
|
||||
{
|
||||
byte b = in[iCur];
|
||||
|
||||
if (b >= UC0 && b <= UC7) {
|
||||
selectWindow(b - UC0);
|
||||
return iCur;
|
||||
} else if (b >= UD0 && b <= UD7) {
|
||||
defineWindow(b - UD0, in[iCur + 1]);
|
||||
return iCur + 1;
|
||||
} else if (b == UDX) {
|
||||
if (iCur >= in.length - 2) {
|
||||
break; // buffer error
|
||||
}
|
||||
defineExtendedWindow(charFromTwoBytes(in[iCur + 1], in[iCur + 2]));
|
||||
return iCur + 2;
|
||||
} else if (b == UQU) {
|
||||
if (iCur >= in.length - 2) {
|
||||
break; // error
|
||||
}
|
||||
// Skip command byte and output Unicode character
|
||||
iCur++;
|
||||
}
|
||||
|
||||
// output a Unicode character
|
||||
char ch = charFromTwoBytes(in[iCur], in[iCur + 1]);
|
||||
sb.append((char) ch);
|
||||
iOut++;
|
||||
}
|
||||
|
||||
if (iCur == in.length) {
|
||||
return iCur;
|
||||
}
|
||||
|
||||
// Error condition
|
||||
throw new EndOfInputException();
|
||||
}
|
||||
|
||||
/**
|
||||
* expand portion of the input that is in single byte mode *
|
||||
*/
|
||||
protected String expandSingleByte(byte[] in)
|
||||
throws IllegalInputException, EndOfInputException {
|
||||
|
||||
/* Allocate the output buffer. Because of control codes, generally
|
||||
each byte of input results in fewer than one character of
|
||||
output. Using in.length as an intial allocation length should avoid
|
||||
the need to reallocate in mid-stream. The exception to this rule are
|
||||
surrogates. */
|
||||
StringBuffer sb = new StringBuffer(in.length);
|
||||
iOut = 0;
|
||||
|
||||
// Loop until all input is exhausted or an error occurred
|
||||
int iCur;
|
||||
Loop:
|
||||
for (iCur = 0; iCur < in.length; iCur++) {
|
||||
// DEBUG Debug.out("Expanding: ", iCur);
|
||||
|
||||
// Default behaviour is that ASCII characters are passed through
|
||||
// (staticOffset[0] == 0) and characters with the high bit on are
|
||||
// offset by the current dynamic (or sliding) window (this.iWindow)
|
||||
int iStaticWindow = 0;
|
||||
int iDynamicWindow = getCurrentWindow();
|
||||
|
||||
switch (in[iCur]) {
|
||||
// Quote from a static Window
|
||||
case SQ0:
|
||||
case SQ1:
|
||||
case SQ2:
|
||||
case SQ3:
|
||||
case SQ4:
|
||||
case SQ5:
|
||||
case SQ6:
|
||||
case SQ7:
|
||||
// skip the command byte and check for length
|
||||
if (iCur >= in.length - 1) {
|
||||
break Loop; // buffer length error
|
||||
}
|
||||
// Select window pair to quote from
|
||||
iDynamicWindow = iStaticWindow = in[iCur] - SQ0;
|
||||
iCur++;
|
||||
|
||||
// FALL THROUGH
|
||||
|
||||
default:
|
||||
// output as character
|
||||
if (in[iCur] >= 0) {
|
||||
// use static window
|
||||
int ch = in[iCur] + staticOffset[iStaticWindow];
|
||||
sb.append((char) ch);
|
||||
iOut++;
|
||||
} else {
|
||||
// use dynamic window
|
||||
int ch = (in[iCur] + 256); // adjust for signed bytes
|
||||
ch -= 0x80; // reduce to range 00..7F
|
||||
ch += dynamicOffset[iDynamicWindow];
|
||||
|
||||
//DEBUG
|
||||
|
||||
if (ch < 1 << 16) {
|
||||
// in Unicode range, output directly
|
||||
sb.append((char) ch);
|
||||
iOut++;
|
||||
} else {
|
||||
// this is an extension character
|
||||
|
||||
// compute and append the two surrogates:
|
||||
// translate from 10000..10FFFF to 0..FFFFF
|
||||
ch -= 0x10000;
|
||||
|
||||
// high surrogate = top 10 bits added to D800
|
||||
sb.append((char) (0xD800 + (ch >> 10)));
|
||||
iOut++;
|
||||
|
||||
// low surrogate = bottom 10 bits added to DC00
|
||||
sb.append((char) (0xDC00 + (ch & ~0xFC00)));
|
||||
iOut++;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
// define a dynamic window as extended
|
||||
case SDX:
|
||||
iCur += 2;
|
||||
if (iCur >= in.length) {
|
||||
break Loop; // buffer length error
|
||||
}
|
||||
defineExtendedWindow(charFromTwoBytes(in[iCur - 1], in[iCur]));
|
||||
break;
|
||||
|
||||
// Position a dynamic Window
|
||||
case SD0:
|
||||
case SD1:
|
||||
case SD2:
|
||||
case SD3:
|
||||
case SD4:
|
||||
case SD5:
|
||||
case SD6:
|
||||
case SD7:
|
||||
iCur++;
|
||||
if (iCur >= in.length) {
|
||||
break Loop; // buffer length error
|
||||
}
|
||||
defineWindow(in[iCur - 1] - SD0, in[iCur]);
|
||||
break;
|
||||
|
||||
// Select a new dynamic Window
|
||||
case SC0:
|
||||
case SC1:
|
||||
case SC2:
|
||||
case SC3:
|
||||
case SC4:
|
||||
case SC5:
|
||||
case SC6:
|
||||
case SC7:
|
||||
selectWindow(in[iCur] - SC0);
|
||||
break;
|
||||
case SCU:
|
||||
// switch to Unicode mode and continue parsing
|
||||
iCur = expandUnicode(in, iCur + 1, sb);
|
||||
// DEBUG Debug.out("Expanded Unicode range until: ", iCur);
|
||||
break;
|
||||
|
||||
case SQU:
|
||||
// directly extract one Unicode character
|
||||
iCur += 2;
|
||||
if (iCur >= in.length) {
|
||||
break Loop; // buffer length error
|
||||
} else {
|
||||
char ch = charFromTwoBytes(in[iCur - 1], in[iCur]);
|
||||
|
||||
sb.append((char) ch);
|
||||
iOut++;
|
||||
}
|
||||
break;
|
||||
|
||||
case Srs:
|
||||
throw new IllegalInputException();
|
||||
// break;
|
||||
}
|
||||
}
|
||||
|
||||
if (iCur >= in.length) {
|
||||
//SUCCESS: all input used up
|
||||
sb.setLength(iOut);
|
||||
iIn = iCur;
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
//ERROR: premature end of input
|
||||
throw new EndOfInputException();
|
||||
}
|
||||
|
||||
/**
|
||||
* expand a byte array containing compressed Unicode
|
||||
*/
|
||||
public String expand(byte[] in)
|
||||
throws IllegalInputException, EndOfInputException {
|
||||
String str = expandSingleByte(in);
|
||||
return str;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* reset is called to start with new input, w/o creating a new
|
||||
* instance
|
||||
*/
|
||||
public void reset() {
|
||||
iOut = 0;
|
||||
iIn = 0;
|
||||
super.reset();
|
||||
}
|
||||
|
||||
public int charsWritten() {
|
||||
return iOut;
|
||||
}
|
||||
|
||||
public int bytesRead() {
|
||||
return iIn;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
package org.unicode.scsu;
|
||||
|
||||
/**
|
||||
* This sample software accompanies Unicode Technical Report #6 and
|
||||
* distributed as is by Unicode, Inc., subject to the following:
|
||||
*
|
||||
* Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software
|
||||
* without fee is hereby granted provided that this copyright notice
|
||||
* appears in all copies.
|
||||
*
|
||||
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
|
||||
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
|
||||
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
|
||||
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
|
||||
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
|
||||
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
|
||||
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
|
||||
*
|
||||
* @author Asmus Freytag
|
||||
*
|
||||
* @version 001 Dec 25 1996
|
||||
* @version 002 Jun 25 1997
|
||||
* @version 003 Jul 25 1997
|
||||
* @version 004 Aug 25 1997
|
||||
*
|
||||
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
|
||||
* and are registered in some jurisdictions.
|
||||
**/
|
||||
|
||||
/**
 * Thrown when the input character array or input byte array contains
 * illegal sequences of bytes or characters.
 */
public class IllegalInputException extends Exception {
    /** Message used when no explicit detail message is supplied. */
    private static final String DEFAULT_MESSAGE =
            "The input character array or input byte array contained illegal sequences of bytes or characters";

    /**
     * Creates the exception with a caller-supplied detail message.
     *
     * @param s the detail message
     */
    public IllegalInputException(String s) {
        super(s);
    }

    /**
     * Creates the exception with the standard detail message.
     */
    public IllegalInputException() {
        this(DEFAULT_MESSAGE);
    }
}
|
260
app/src/main/java/org/unicode/scsu/SCSU.java
Normal file
260
app/src/main/java/org/unicode/scsu/SCSU.java
Normal file
|
@ -0,0 +1,260 @@
|
|||
package org.unicode.scsu;
|
||||
|
||||
/**
|
||||
* This sample software accompanies Unicode Technical Report #6 and
|
||||
* distributed as is by Unicode, Inc., subject to the following:
|
||||
*
|
||||
* Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software
|
||||
* without fee is hereby granted provided that this copyright notice
|
||||
* appears in all copies.
|
||||
*
|
||||
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
|
||||
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
|
||||
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
|
||||
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
|
||||
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
|
||||
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
|
||||
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
|
||||
*
|
||||
* @author Asmus Freytag
|
||||
*
|
||||
* @version 001 Dec 25 1996
|
||||
* @version 002 Jun 25 1997
|
||||
* @version 003 Jul 25 1997
|
||||
* @version 004 Aug 25 1997
|
||||
* @version 005 Sep 30 1998
|
||||
*
|
||||
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
|
||||
* and are registered in some jurisdictions.
|
||||
**/
|
||||
|
||||
/**
|
||||
* Encoding text data in Unicode often requires more storage than using
|
||||
* an existing 8-bit character set and limited to the subset of characters
|
||||
* actually found in the text. The Unicode Compression Algorithm reduces
|
||||
* the necessary storage while retaining the universality of Unicode.
|
||||
* A full description of the algorithm can be found in document
|
||||
* http://www.unicode.org/unicode/reports/tr6.html
|
||||
* <p/>
|
||||
* Summary
|
||||
* <p/>
|
||||
* The goal of the Unicode Compression Algorithm is the ability to
|
||||
* Express all code points in Unicode
|
||||
* Approximate storage size for traditional character sets
|
||||
* Work well for short strings
|
||||
* Provide transparency for Latin-1 data
|
||||
* Support very simple decoders
|
||||
* Support simple as well as sophisticated encoders
|
||||
* <p/>
|
||||
* If needed, further compression can be achieved by layering standard
|
||||
* file or disk-block based compression algorithms on top.
|
||||
* <p/>
|
||||
* <H2>Features</H2>
|
||||
* <p/>
|
||||
* Languages using small alphabets would contain runs of characters that
|
||||
* are coded close together in Unicode. These runs are interrupted only
|
||||
* by punctuation characters, which are themselves coded in proximity to
|
||||
* each other in Unicode (usually in the ASCII range).
|
||||
* <p/>
|
||||
* Two basic mechanisms in the compression algorithm account for these two
|
||||
* cases, sliding windows and static windows. A window is an area of 128
|
||||
* consecutive characters in Unicode. In the compressed data stream, each
|
||||
* character from a sliding window would be represented as a byte between
|
||||
* 0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
|
||||
* TAB) would always mean an ASCII character (or control).
|
||||
* <p/>
|
||||
* <H2>Notes on the Java implementation</H2>
|
||||
* <p/>
|
||||
* A limitation of Java is the exclusive use of a signed byte data type.
|
||||
* The following work arounds are required:
|
||||
* <p/>
|
||||
* Copying a byte to an integer variable and adding 256 for 'negative'
|
||||
* bytes gives an integer in the range 0-255.
|
||||
* <p/>
|
||||
* Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
|
||||
* char values is unsigned.
|
||||
* <p/>
|
||||
* Extended characters require an int to store them. The sign is not an
|
||||
* issue because only 1024*1024 + 65536 extended characters exist.
|
||||
*/
|
||||
public abstract class SCSU {

    // ---------------------------------------------------------------
    // Single-byte mode tag values
    // ---------------------------------------------------------------

    /*
     * SQn - Quote from window pair n.
     * If the following byte is less than 0x80, quote from static
     * window n, else quote from dynamic window n.
     */
    static final byte SQ0 = 0x01; // Quote from window pair 0
    static final byte SQ1 = 0x02; // Quote from window pair 1
    static final byte SQ2 = 0x03; // Quote from window pair 2
    static final byte SQ3 = 0x04; // Quote from window pair 3
    static final byte SQ4 = 0x05; // Quote from window pair 4
    static final byte SQ5 = 0x06; // Quote from window pair 5
    static final byte SQ6 = 0x07; // Quote from window pair 6
    static final byte SQ7 = 0x08; // Quote from window pair 7

    static final byte SDX = 0x0B; // Define a window as extended
    static final byte Srs = 0x0C; // reserved

    static final byte SQU = 0x0E; // Quote a single Unicode character
    static final byte SCU = 0x0F; // Change to Unicode mode

    /*
     * SCn - Change to window n.
     * If the following bytes are less than 0x80, interpret them as
     * command bytes or pass them through, else add the offset for
     * dynamic window n.
     */
    static final byte SC0 = 0x10; // Select window 0
    static final byte SC1 = 0x11; // Select window 1
    static final byte SC2 = 0x12; // Select window 2
    static final byte SC3 = 0x13; // Select window 3
    static final byte SC4 = 0x14; // Select window 4
    static final byte SC5 = 0x15; // Select window 5
    static final byte SC6 = 0x16; // Select window 6
    static final byte SC7 = 0x17; // Select window 7

    /* SDn - Define and select window n. */
    static final byte SD0 = 0x18; // Define and select window 0
    static final byte SD1 = 0x19; // Define and select window 1
    static final byte SD2 = 0x1A; // Define and select window 2
    static final byte SD3 = 0x1B; // Define and select window 3
    static final byte SD4 = 0x1C; // Define and select window 4
    static final byte SD5 = 0x1D; // Define and select window 5
    static final byte SD6 = 0x1E; // Define and select window 6
    static final byte SD7 = 0x1F; // Define and select window 7

    // ---------------------------------------------------------------
    // Unicode mode tag values (negative when held in a signed byte)
    // ---------------------------------------------------------------

    static final byte UC0 = (byte) 0xE0; // Select window 0
    static final byte UC1 = (byte) 0xE1; // Select window 1
    static final byte UC2 = (byte) 0xE2; // Select window 2
    static final byte UC3 = (byte) 0xE3; // Select window 3
    static final byte UC4 = (byte) 0xE4; // Select window 4
    static final byte UC5 = (byte) 0xE5; // Select window 5
    static final byte UC6 = (byte) 0xE6; // Select window 6
    static final byte UC7 = (byte) 0xE7; // Select window 7

    static final byte UD0 = (byte) 0xE8; // Define and select window 0
    static final byte UD1 = (byte) 0xE9; // Define and select window 1
    static final byte UD2 = (byte) 0xEA; // Define and select window 2
    static final byte UD3 = (byte) 0xEB; // Define and select window 3
    static final byte UD4 = (byte) 0xEC; // Define and select window 4
    static final byte UD5 = (byte) 0xED; // Define and select window 5
    static final byte UD6 = (byte) 0xEE; // Define and select window 6
    static final byte UD7 = (byte) 0xEF; // Define and select window 7

    static final byte UQU = (byte) 0xF0; // Quote a single Unicode character
    static final byte UDX = (byte) 0xF1; // Define a window as extended
    static final byte Urs = (byte) 0xF2; // reserved

    // ---------------------------------------------------------------
    // Window offset tables
    // ---------------------------------------------------------------

    /** Constant offsets of the 8 static windows. */
    static final int[] staticOffset = {
        0x0000, // ASCII for quoted tags
        0x0080, // Latin - 1 Supplement (for access to punctuation)
        0x0100, // Latin Extended-A
        0x0300, // Combining Diacritical Marks
        0x2000, // General Punctuation
        0x2080, // Currency Symbols
        0x2100, // Letterlike Symbols and Number Forms
        0x3000  // CJK Symbols and punctuation
    };

    /** Initial offsets of the 8 dynamic (sliding) windows. */
    static final int[] initialDynamicOffset = {
        0x0080, // Latin-1
        0x00C0, // Latin Extended A   //@005 fixed from 0x0100
        0x0400, // Cyrillic
        0x0600, // Arabic
        0x0900, // Devanagari
        0x3040, // Hiragana
        0x30A0, // Katakana
        0xFF00  // Fullwidth ASCII
    };

    /**
     * Current offsets of the 8 dynamic windows; starts out as a private
     * copy of {@link #initialDynamicOffset}.
     */
    int[] dynamicOffset = initialDynamicOffset.clone();

    // ---------------------------------------------------------------
    // Values common to encoder and decoder, used when defining a window
    // ---------------------------------------------------------------

    /**
     * Unicode code points from 3400 to E000 are not addressable by a
     * dynamic window, since no short-run alphabets are found in these
     * areas. Therefore {@link #gapOffset} is added to all values from
     * gapThreshold upwards.
     */
    static final int gapThreshold = 0x68;

    /** Amount added to window positions at or above {@link #gapThreshold}. */
    static final int gapOffset = 0xAC00;

    /** Values between reservedStart and {@link #fixedThreshold} are reserved. */
    static final int reservedStart = 0xA8;

    /** Values from fixedThreshold upwards index into {@link #fixedOffset}. */
    static final int fixedThreshold = 0xF9;

    /** Table of fixed predefined offsets, with the byte values that index into it. */
    static final int[] fixedOffset = {
        /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
        /* 0xFA */ 0x0250, // IPA extensions
        /* 0xFB */ 0x0370, // Greek
        /* 0xFC */ 0x0530, // Armenian
        /* 0xFD */ 0x3040, // Hiragana
        /* 0xFE */ 0x30A0, // Katakana
        /* 0xFF */ 0xFF60  // Halfwidth Katakana
    };

    /** Index of the currently active dynamic window. */
    private int iWindow = 0;

    /**
     * Tells whether a character is compressible, i.e. lies outside the
     * range 0x3400 (inclusive) to 0xE000 (exclusive).
     *
     * @param ch the character to test
     * @return {@code true} if {@code ch} is below 0x3400 or at/above 0xE000
     */
    public static boolean isCompressible(char ch) {
        final boolean insideGap = ch >= 0x3400 && ch < 0xE000;
        return !insideGap;
    }

    /**
     * Selects the active dynamic window.
     *
     * @param iWindow index of the window to make active
     */
    protected void selectWindow(int iWindow) {
        this.iWindow = iWindow;
    }

    /**
     * Returns the index of the active dynamic window.
     *
     * @return the index most recently passed to {@link #selectWindow},
     *         or 0 initially and after {@link #reset()}
     */
    protected int getCurrentWindow() {
        return iWindow;
    }

    /**
     * Restores the initial state: every dynamic window goes back to its
     * initial offset and window 0 becomes active. Only needed to bail out
     * after an exception and restart with new input.
     */
    protected void reset() {
        System.arraycopy(initialDynamicOffset, 0, dynamicOffset, 0, dynamicOffset.length);
        iWindow = 0;
    }
}
|
Loading…
Reference in a new issue