Files
harbour-core/tests/big5_gen.prg
Viktor Szakats cef75de547 2016-01-15 15:20 UTC+0100 Viktor Szakats (vszakats users.noreply.github.com)
* *
    % remove brandings and homepage from copyright header. Pass 3 - manual.
    * project homepage and name is described in README, amongst others
    ; this should make the diff between 3.4 and 3.2 easier to manage
2016-01-15 15:21:27 +01:00

310 lines
9.4 KiB
Plaintext

/*
* code to generate C source with conversion tables between BIG5 and UCS16
* using data defined by Unicode, Inc. in BIG5.TXT
*
* Copyright 2011 Przemyslaw Czerpak <druzus / at / priv.onet.pl>
*
*/
/* ***************************************************************************
these characters are mapped to U+FFFD REPLACEMENT CHARACTER in BIG5.TXT:
0xA15A SPACING UNDERSCORE duplicates A1C4
0xA1C3 SPACING HEAVY OVERSCORE not in Unicode
0xA1C5 SPACING HEAVY UNDERSCORE not in Unicode
0xA1FE LT DIAG UP RIGHT TO LOW LEFT duplicates A2AC
0xA240 LT DIAG UP LEFT TO LOW RIGHT duplicates A2AD
0xA2CC HANGZHOU NUMERAL TEN conflicts with A451 mapping
0xA2CE HANGZHOU NUMERAL THIRTY conflicts with A4CA mapping
duplicated character has the following mapping in BIG5.TXT:
0xA1C4 0xFF3F # FULLWIDTH LOW LINE
0xA2AC 0x2571 # BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT
0xA2AD 0x2572 # BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT
0xA451 0x5341 # <CJK>
0xA4CA 0x5345 # <CJK>
# iconv using the following mapping for them:
0xA15A 0x2574 # BOX DRAWINGS LIGHT LEFT
0xA1C3 0xFFE3 # FULLWIDTH MACRON
0xA1C5 0x02CD # MODIFIER LETTER LOW MACRON
0xA1FE 0xFF0F # FULLWIDTH SOLIDUS
0xA240 0xFF3C # FULLWIDTH REVERSE SOLIDUS
0xA2CC 0x5341 # <CJK>
0xA2CE 0x5345 # <CJK>
# this seems to be closer mapping:
0xA15A 0xFF3F # FULLWIDTH LOW LINE
0xA1C3 0xFFE3 # FULLWIDTH MACRON
0xA1C5 0x02CD # MODIFIER LETTER LOW MACRON
0xA1FE 0x2571 # BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT
0xA240 0x2572 # BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT
0xA2CC 0x5341 # <CJK>
0xA2CE 0x5345 # <CJK>
*************************************************************************** */
//#define DO_START_OPT
proc main()
local cLine, aVal, aVal2, aValU, aValU2, hVal, aInd, ;
n, nn, nBG5, nU16, nMin, nMax, nUMin, nUMax, cResult, nBit
aVal := afill( array( 0x10000 ), 0 )
aValU := afill( array( 0x10000 ), 0 )
nMin := nUMin := 0xFFFF
nMax := nUMax := 0x0000
for each cLine in hb_aTokens( hb_memoRead( "BIG5.TXT" ), hb_eol() )
if cLine = "0x" .and. ( n := hb_at( "0x", cLine, 7 ) ) != 0
nBG5 := hb_hexToNum( substr( cLine, 3, 4 ) )
nU16 := hb_hexToNum( substr( cLine, n + 2, 4 ) )
if nBG5 == 0 .or. nU16 == 0
? "unrecognized line:", cLine:__enumIndex()
return
elseif nBG5 > 0xFFFF .or. nU16 > 0xFFFF
? "wrong character range at line:", cLine:__enumIndex()
return
endif
aVal[ nBG5 ] := nU16
if nBG5 > nMax
nMax := nBG5
endif
if nBG5 < nMin
nMin := nBG5
endif
if nU16 > nUMax
nUMax := nU16
endif
if nU16 < nUMin
nUMin := nU16
endif
endif
next
for n := 1 to len( aVal )
if aVal[ n ] != 0
aValU[ aVal[ n ] ] := n
endif
next
#if 0
cResult += "static const HB_USHORT s_big5uni[ " + hb_ntos( nMax - nMin + 1 ) + " ] =" + hb_eol()
cResult += "{" + hb_eol()
l := 0
for n := nMin to nMax
if ++l > 8
l := 1
cResult += ","
cResult += hb_eol()
cResult += " "
elseif n == nMin
cResult += " "
else
cResult += ", "
endif
cResult += "0x"
cResult += hb_numToHex( aVal[ n ], 4 )
next
cResult += hb_eol()
cResult += "};" + hb_eol()
#endif
? "BIG5->UCS16 tables."
? "raw size:", hb_ntos( ( nMax - nMin + 1 ) * 2 )
#ifndef DO_START_OPT
nMin := min( nMin, 41280 ) // optimal
#endif
n := min_size( aVal, @nMin, nMax, @nBit )
? "minimal size:", hb_ntos( n ), ;
"for", hb_ntos( hb_bitshift( 1, nBit ) ), "byte blocks, (from: "+hb_ntos( nMin ) + ")"
calc_size( aVal, nMin, nMax, nBit, @hVal, @aInd, @nn )
aVal2 := hash_to_array( hVal )
cResult := ;
"/*" + hb_eol() + ;
" * BIG5 <-> UCS16 conversion tables" + hb_eol() + ;
" * code generated automatically by tests/big5_gen.prg" + hb_eol() + ;
" *" + hb_eol() + ;
" * Copyright 2011 Przemyslaw Czerpak <druzus / at / priv.onet.pl>" + hb_eol() + ;
" *" + hb_eol() + ;
" */" + hb_eol() + ;
hb_eol()
cResult += '#include "hbapi.h"' + hb_eol()
cResult += hb_eol()
cResult += "#define HB_BIG5_FIRST 0x" + hb_numToHex( nMin, 4 ) + hb_eol()
cResult += "#define HB_BIG5_LAST 0x" + hb_numToHex( nMax, 4 ) + hb_eol()
cResult += "#define HB_BIG5_BITS " + hb_ntos( nBit ) + hb_eol()
cResult += hb_eol()
cResult += array_to_code( aInd, "s_big5index", nn )
cResult += hb_eol()
cResult += array_to_code( aVal2, "s_big5_ucs16", 2 )
cResult += hb_eol()
cResult += index_func( "s_big5_to_ucs16", "s_big5index", "s_big5_ucs16", ;
"HB_BIG5_FIRST", "HB_BIG5_LAST", "HB_BIG5_BITS" )
check_conv( aVal, aInd, aVal2, nMin, nMax, nBit )
?
? "UCS16->BIG5 tables."
? "raw size:", hb_ntos( ( nUMax - nUMin + 1 ) * 2 )
#ifndef DO_START_OPT
nUMin := min( nUMin, 160 ) // optimal
#endif
n := min_size( aValU, @nUMin, nUMax, @nBit )
? "minimal size:", hb_ntos( n ), ;
"for", hb_ntos( hb_bitshift( 1, nBit ) ), "byte blocks, (from: "+hb_ntos( nUMin ) + ")"
calc_size( aValU, nUMin, nUMax, nBit, @hVal, @aInd, @nn )
aValU2 := hash_to_array( hVal )
cResult += hb_eol()
cResult += "#define HB_U16_FIRST 0x" + hb_numToHex( nUMin, 4 ) + hb_eol()
cResult += "#define HB_U16_LAST 0x" + hb_numToHex( nUMax, 4 ) + hb_eol()
cResult += "#define HB_U16_BITS " + hb_ntos( nBit ) + hb_eol()
cResult += hb_eol()
cResult += array_to_code( aInd, "s_ucs16index", nn )
cResult += hb_eol()
cResult += array_to_code( aValU2, "s_ucs16_big5", 2 )
cResult += hb_eol()
cResult += index_func( "s_ucs16_to_big5", "s_ucs16index", "s_ucs16_big5", ;
"HB_U16_FIRST", "HB_U16_LAST", "HB_U16_BITS" )
check_conv( aValU, aInd, aValU2, nUMin, nUMax, nBit )
hb_memowrit( "big5.c", cResult )
return
static function array_to_code( aVal, cName, nn )
local cResult, l, n
cResult := "static const " + ;
iif( nn == 1, "HB_BYTE", "HB_USHORT" ) + " " + ;
cName + "[ " + hb_ntos( len( aVal ) ) + " ] =" + hb_eol()
cResult += "{" + hb_eol()
l := 0
for n := 1 to len( aVal )
if ++l > iif( nn == 1, 12, 8 )
l := 1
cResult += ","
cResult += hb_eol()
cResult += " "
elseif n == 1
cResult += " "
else
cResult += ", "
endif
cResult += "0x"
cResult += hb_numToHex( aVal[ n ], nn * 2 )
next
cResult += hb_eol()
cResult += "};" + hb_eol()
return cResult;
static function hash_to_array( hVal )
local aVal := {}, cLine, n
for each cLine in hVal
for n := 1 to len( cLine ) step( 2 )
aadd( aVal, bin2w( substr( cLine, n, 2 ) ) )
next
next
return aVal
function min_size( aVal, nMin, nMax, nBit )
local n, nM, nS, nSize, nMinX
nSize := 0xFFFFFF
nMinX := nMin
#ifdef DO_START_OPT
for nM := hb_bitAnd( nMin, 0xFF00 ) to nMin
#else
for nM := nMin to nMin
#endif
for n := 1 to 16
nS := calc_size( aVal, nM, nMax, n )
if nS < nSize
nSize := nS
nBit := n
nMinX := nM
endif
next
next
nMin := nMinX
return nSize
function calc_size( aVal, nMin, nMax, nBit, hVal, aInd, nn )
local nLine, n, cLine, c
nLine := int( 2 ^ nBit )
cLine := ""
hVal := {=>}
aInd := {}
for n := nMin to nMax
cLine += i2bin( iif( n == 0, 0, aVal[ n ] ) )
if len( cLine ) == nLine * 2
hVal[ cLine ] := cLine
aadd( aInd, hb_hpos( hVal, cLine ) - 1 )
cLine := ""
endif
next
if ! cLine == ""
for each c in hVal
if c = cLine
cLine := c
exit
endif
next
hVal[ cLine ] := cLine
aadd( aInd, hb_hpos( hVal, cLine ) - 1 )
endif
nn := iif( len( aInd ) > 256, 2, 1 )
n := len( aInd ) * nn
for each c in hVal
n += len( c )
next
return n
static function index_func( cName, cNameInd, cNameConv, cMin, cMax, cBit )
local cResult
cResult := "static HB_USHORT " + cName + "( int n )" + hb_eol() + ;
"{" + hb_eol() + ;
" n -= " + cMin + ";" + hb_eol() + ;
" if( n >= 0 && n <= ( " + cMax + " - " + cMin + " ) )" + hb_eol() + ;
" {" + hb_eol() + ;
" return " + cNameConv + "[ ( " + cNameInd + ;
"[ n >> " + cBit + " ] << " + cBit + " ) +" + hb_eol() + ;
space( len( cNameConv ) + 15 ) + ;
"( n & ( ( 1 << " + cBit + " ) - 1 ) ) ];" + hb_eol() + ;
" }" + hb_eol() + ;
" return 0;" + hb_eol() + ;
"}" + hb_eol()
return cResult
static function conv_get( n, aInd, aVal2, nMin, nMax, nBit )
local nDiv
if n >= nMin .and. n <= nMax
nDiv := 2 ^ nBit
n -= nMin
return aVal2[ aInd[ n / nDiv + 1 ] * nDiv + n % nDiv + 1 ]
endif
return 0
static function check_conv( aVal, aInd, aVal2, nMin, nMax, nBit )
local n, nVal
for n := 1 to len( aVal )
nVal := conv_get( n, aInd, aVal2, nMin, nMax, nBit )
if aVal[ n ] != nVal
? "Wrong decoding:", n, aVal[ n ], nVal, len( aVal ), nMax, hb_eol()
break
endif
next
return nil