* src/rtl/cdpapi.c
+ added fallback translation table for different variants of Latin
characters. Now, when a translation is made between different encodings
and a variant of a Latin character does not exist in the destination
code page, it is translated to its base Latin form, e.g.:
hb_utf8ToStr( "ĄĆĘŁŃÓŚŹŻąćęłńóśźż", "EN" ) -> "ACELNOSZZacelnószz"
* tests/uc16_gen.prg
; updated comment
478 lines
14 KiB
Plaintext
478 lines
14 KiB
Plaintext
/*
|
|
* code to generate C source with tables UCS16 character flags using data
|
|
* defined by Unicode, Inc. https://unicode.org/Public/UNIDATA/UnicodeData.txt
|
|
* harbour/src/codepage/uc16def.c was generated by this code.
|
|
*
|
|
* Copyright 2011 Przemyslaw Czerpak <druzus / at / priv.onet.pl>
|
|
*
|
|
*/
|
|
|
|
|
|
//#define DO_START_OPT
|
|
|
|
/* character flags */
|
|
#define HB_CDP_DIGIT 0x01
|
|
#define HB_CDP_ALPHA 0x02
|
|
#define HB_CDP_LOWER 0x04
|
|
#define HB_CDP_UPPER 0x08
|
|
|
|
/*
 * Generator entry point.
 *
 * Reads "UnicodeData.txt" from the current directory, collects for every
 * code point in the range 0x0001..0xFFFE:
 *    - the character flags (HB_CDP_DIGIT/ALPHA/LOWER/UPPER),
 *    - the simple uppercase mapping (field 13),
 *    - the simple lowercase mapping (field 14),
 * prints statistics about the best two-level table layout and finally
 * writes the generated C source to "uc16def.c".
 *
 * nWarning >= 2 enables the "lower for non upper" / "upper for non lower"
 * diagnostics; they can only fire when lConvAll is .t., because with
 * lConvAll == .f. such mappings are simply dropped.
 */
proc main()

   local cLine, aLine
   local cGenCat
   local nCode, nUpper, nLower, nFlags
   local nUppers, nLowers
   local nMinCh, nMinUp, nMinLo
   local nMaxCh, nMaxUp, nMaxLo
   local nBitCh, nBitUp, nBitLo
   local aLower, aUpper, aFlags
   local cResult
   local aInd, aVal, hVal
   local n, nn
   local cData

   local nWarning := 2
   local lConvAll := .f.

   aLower := afill( array( 0x10000 ), 0 )
   aUpper := afill( array( 0x10000 ), 0 )
   aFlags := afill( array( 0x10000 ), 0 )
   nMaxCh := nMaxUp := nMaxLo := 0x0000
   nMinCh := nMinUp := nMinLo := 0xFFFF
   nUppers := nLowers := 0

   /* Without input data the table builders below would emit zero-length
    * C arrays, so stop early with a clear message instead.
    */
   cData := hb_memoRead( "UnicodeData.txt" )
   if empty( cData )
      ? "Cannot read UnicodeData.txt"
      ErrorLevel( 1 )
      return
   endif

   /* Split on LF after dropping CR so that the file is parsed the same way
    * regardless of the platform specific hb_eol() value.
    */
   for each cLine in hb_aTokens( strtran( cData, chr( 13 ) ), chr( 10 ) )
      if ! empty( cLine )
         aLine := hb_aTokens( cLine, ";" )
         if len( aLine ) == 15
            nCode := hb_hexToNum( aLine[ 1 ] )
            if nCode > 0 .and. nCode < 0xFFFF
               nUpper := hb_hexToNum( aLine[ 13 ] )
               nLower := hb_hexToNum( aLine[ 14 ] )
               nFlags := 0
               cGenCat := aLine[ 3 ]

               if "Lu" $ cGenCat
                  nFlags := hb_bitOR( nFlags, HB_CDP_ALPHA, HB_CDP_UPPER )
                  if "Lt" $ cGenCat
                     ? "title + upper, line:", cLine:__enumIndex()
                  endif
               elseif nLower != 0
                  /* lowercase mapping defined for a non uppercase letter */
                  if "Lt" $ cGenCat .or. ! lConvAll
                     nLower := 0
                  elseif nWarning >= 2
                     ? "lower for non upper, line:", cLine:__enumIndex()
                  endif
               endif

               if "Ll" $ cGenCat
                  if "Lt" $ cGenCat
                     ? "title + lower, line:", cLine:__enumIndex()
                  endif
                  nFlags := hb_bitOR( nFlags, HB_CDP_ALPHA, HB_CDP_LOWER )
               elseif nUpper != 0
                  /* uppercase mapping defined for a non lowercase letter */
                  if "Lt" $ cGenCat .or. ! lConvAll
                     nUpper := 0
                  elseif nWarning >= 2
                     ? "upper for non lower, line:", cLine:__enumIndex()
                  endif
               endif

               if nCode >= asc( "0" ) .and. nCode <= asc( "9" )
                  nFlags := hb_bitOR( nFlags, HB_CDP_DIGIT )
               endif

               /* the mapping tables hold 16-bit values only */
               if nUpper >= 0xFFFF
                  ? "Upper out of range, line:", cLine:__enumIndex()
               endif
               if nLower >= 0xFFFF
                  ? "Lower out of range, line:", cLine:__enumIndex()
               endif

               if nFlags > 0
                  aFlags[ nCode ] := nFlags
                  if nMaxCh < nCode
                     nMaxCh := nCode
                  endif
                  if nMinCh > nCode
                     nMinCh := nCode
                  endif
               endif

               if nUpper > 0
                  nUppers++
                  aUpper[ nCode ] := nUpper
                  if nMaxUp < nCode
                     nMaxUp := nCode
                  endif
                  if nMinUp > nCode
                     nMinUp := nCode
                  endif
               endif

               if nLower > 0
                  nLowers++
                  aLower[ nCode ] := nLower
                  if nMaxLo < nCode
                     nMaxLo := nCode
                  endif
                  if nMinLo > nCode
                     nMinLo := nCode
                  endif
               endif
            endif
         else
            ? "Wrong line:", cLine:__enumIndex()
         endif
      endif
   next

   /* --- statistics and layout selection for the TOUPPER table --- */
   ? "uppers#:", nUppers
   ? "min upper:", nMinUp
   ? "max upper:", nMaxUp
   ? "TOUPPER tables."
   ? "raw size:", hb_ntos( ( nMaxUp - nMinUp + 1 ) * 2 )
#ifndef DO_START_OPT
   nMinUp := min( nMinUp, 48 ) // optimal
#endif
   n := min_size16( aUpper, @nMinUp, nMaxUp, @nBitUp )
   ? "minimal size:", hb_ntos( n ), ;
     "for", hb_ntos( hb_bitshift( 1, nBitUp ) ), "byte blocks, (from: " + hb_ntos( nMinUp ) + ")"
   ?

   /* --- statistics and layout selection for the TOLOWER table --- */
   ? "lowers#:", nLowers
   ? "min lower:", nMinLo
   ? "max lower:", nMaxLo
   ? "TOLOWER tables."
   ? "raw size:", hb_ntos( ( nMaxLo - nMinLo + 1 ) * 2 )
#ifndef DO_START_OPT
   nMinLo := min( nMinLo, 32 ) // optimal
#endif
   n := min_size16( aLower, @nMinLo, nMaxLo, @nBitLo )
   ? "minimal size:", hb_ntos( n ), ;
     "for", hb_ntos( hb_bitshift( 1, nBitLo ) ), "byte blocks, (from: "+hb_ntos( nMinLo ) + ")"
   ?

   /* --- statistics and layout selection for the flags table --- */
   ? "min char:", nMinCh
   ? "max char:", nMaxCh
   ? "ATTR tables."
   ? "raw size:", hb_ntos( int( ( nMaxCh - nMinCh + 2 ) / 2 ) )
#ifndef DO_START_OPT
   nMinCh := min( nMinCh, 0 ) // optimal
#endif
   n := min_size04( aFlags, @nMinCh, nMaxCh, @nBitCh )
   ? "minimal size:", hb_ntos( n ), ;
     "for", hb_ntos( hb_bitshift( 1, nBitCh ) ), "byte blocks, (from: " + hb_ntos( nMinCh ) + ")"
   ?

   /* --- generated file header --- */
   cResult := ;
      "/*" + hb_eol() + ;
      " * Unicode character tables" + hb_eol() + ;
      " * code generated automatically by tests/uc16_gen.prg" + hb_eol() + ;
      " *" + hb_eol() + ;
      " * Copyright 2012 Przemyslaw Czerpak <druzus / at / priv.onet.pl>" + hb_eol() + ;
      " *" + hb_eol() + ;
      " */" + hb_eol() + ;
      hb_eol()

   cResult += '#include "hbapi.h"' + hb_eol()
   cResult += hb_eol()

   /* --- uppercase conversion: index table, value table, accessor --- */
   cResult += hb_eol()
   cResult += "#define HB_UCUP_FIRST 0x" + hb_numToHex( nMinUp, 4 ) + hb_eol()
   cResult += "#define HB_UCUP_LAST 0x" + hb_numToHex( nMaxUp, 4 ) + hb_eol()
   cResult += "#define HB_UCUP_BITS " + hb_ntos( nBitUp ) + hb_eol()
   cResult += hb_eol()

   calc_size16( aUpper, nMinUp, nMaxUp, nBitUp, @hVal, @aInd, @nn )
   aVal := hash_to_array16( hVal )
   check_conv16( aUpper, aInd, aVal, nMinUp, nMaxUp, nBitUp )

   cResult += array_to_code( aInd, "s_up_idx", nn )
   cResult += hb_eol()
   cResult += array_to_code( aVal, "s_up_val", 2 )
   cResult += hb_eol()
   cResult += index_func16( "s_uc_upper", "s_up_idx", "s_up_val", ;
                            "HB_UCUP_FIRST", "HB_UCUP_LAST", "HB_UCUP_BITS" )

   /* --- lowercase conversion: index table, value table, accessor --- */
   cResult += hb_eol()
   cResult += "#define HB_UCLO_FIRST 0x" + hb_numToHex( nMinLo, 4 ) + hb_eol()
   cResult += "#define HB_UCLO_LAST 0x" + hb_numToHex( nMaxLo, 4 ) + hb_eol()
   cResult += "#define HB_UCLO_BITS " + hb_ntos( nBitLo ) + hb_eol()
   cResult += hb_eol()

   calc_size16( aLower, nMinLo, nMaxLo, nBitLo, @hVal, @aInd, @nn )
   aVal := hash_to_array16( hVal )
   check_conv16( aLower, aInd, aVal, nMinLo, nMaxLo, nBitLo )

   cResult += array_to_code( aInd, "s_lo_idx", nn )
   cResult += hb_eol()
   cResult += array_to_code( aVal, "s_lo_val", 2 )
   cResult += hb_eol()
   cResult += index_func16( "s_uc_lower", "s_lo_idx", "s_lo_val", ;
                            "HB_UCLO_FIRST", "HB_UCLO_LAST", "HB_UCLO_BITS" )

   /* --- character flags: index table, packed value table, accessor --- */
   cResult += hb_eol()
   cResult += "#define HB_UCFL_FIRST 0x" + hb_numToHex( nMinCh, 4 ) + hb_eol()
   cResult += "#define HB_UCFL_LAST 0x" + hb_numToHex( nMaxCh, 4 ) + hb_eol()
   cResult += "#define HB_UCFL_BITS " + hb_ntos( nBitCh ) + hb_eol()
   cResult += hb_eol()

   calc_size04( aFlags, nMinCh, nMaxCh, nBitCh, @hVal, @aInd, @nn )
   aVal := hash_to_array04( hVal )
   check_conv04( aFlags, aInd, aVal, nMinCh, nMaxCh, nBitCh )

   cResult += array_to_code( aInd, "s_ch_idx", nn )
   cResult += hb_eol()
   cResult += array_to_code( aVal, "s_ch_val", 1 )
   cResult += hb_eol()
   cResult += index_func04( "s_uc_flags", "s_ch_idx", "s_ch_val", ;
                            "HB_UCFL_FIRST", "HB_UCFL_LAST", "HB_UCFL_BITS" )

   hb_memowrit( "uc16def.c", cResult )

   return
|
|
|
|
/*
 * Renders aVal as the C definition of a static const array called cName.
 *
 *    aVal  - array of numeric values to emit
 *    cName - name of the generated C array
 *    nn    - element size in bytes: 1 emits HB_BYTE items written with two
 *            hex digits, 12 per row; any other value emits HB_USHORT items
 *            written with nn * 2 hex digits, 8 per row
 *
 * Returns the generated C source as a string.
 */
static function array_to_code( aVal, cName, nn )

   local cResult, nCol, n
   local nPerRow := iif( nn == 1, 12, 8 )

   cResult := "static const " + ;
              iif( nn == 1, "HB_BYTE", "HB_USHORT" ) + " " + ;
              cName + "[ " + hb_ntos( len( aVal ) ) + " ] =" + hb_eol()
   cResult += "{" + hb_eol()

   nCol := 0
   for n := 1 to len( aVal )
      if ++nCol > nPerRow
         /* row is full: close it and start the next one */
         nCol := 1
         cResult += ","
         cResult += hb_eol()
         cResult += " "
      elseif n == 1
         cResult += " "
      else
         cResult += ", "
      endif
      cResult += "0x"
      cResult += hb_numToHex( aVal[ n ], nn * 2 )
   next
   cResult += hb_eol()
   cResult += "};" + hb_eol()

   /* no trailing ';' here: in Harbour it is the line continuation mark */
   return cResult
|
|
|
|
/*
 * Flattens all blocks stored as values of hVal into a single array of
 * 16-bit numbers. Each block is a string which is consumed two bytes at
 * a time and decoded with bin2w(). Blocks are visited in the enumeration
 * order of the hash.
 */
static function hash_to_array16( hVal )

   local aWords := {}
   local cBlock, nPos

   for each cBlock in hVal
      nPos := 1
      do while nPos <= len( cBlock )
         aadd( aWords, bin2w( substr( cBlock, nPos, 2 ) ) )
         nPos += 2
      enddo
   next

   return aWords
|
|
|
|
/*
 * Flattens all blocks stored as values of hVal into a single array of
 * byte values: every character of every block contributes its asc()
 * code. Blocks are visited in the enumeration order of the hash.
 */
static function hash_to_array04( hVal )

   local aBytes := {}
   local cBlock, nPos

   for each cBlock in hVal
      for nPos := 1 to len( cBlock )
         aadd( aBytes, asc( substr( cBlock, nPos, 1 ) ) )
      next
   next

   return aBytes
|
|
|
|
/*
 * Generates the C accessor for a two-level table of 16-bit values.
 *
 *    cName     - name of the generated C function
 *    cNameInd  - name of the C index array
 *    cNameConv - name of the C value array
 *    cMin/cMax - C expressions for the first/last covered code
 *    cBit      - C expression for the block size exponent
 *
 * The emitted function returns 0 for codes outside cMin..cMax.
 * Returns the generated C source as a string.
 */
static function index_func16( cName, cNameInd, cNameConv, cMin, cMax, cBit )

   local cCode := ""
   local cRow
   local aRows := { ;
      "static HB_USHORT " + cName + "( int n )", ;
      "{", ;
      " n -= " + cMin + ";", ;
      " if( n >= 0 && n <= ( " + cMax + " - " + cMin + " ) )", ;
      " {", ;
      " return " + cNameConv + "[ ( " + cNameInd + "[ n >> " + cBit + " ] << " + cBit + " ) +", ;
      space( len( cNameConv ) + 15 ) + "( n & ( ( 1 << " + cBit + " ) - 1 ) ) ];", ;
      " }", ;
      " return 0;", ;
      "}" }

   /* every generated row is terminated with the platform EOL */
   for each cRow in aRows
      cCode += cRow + hb_eol()
   next

   return cCode
|
|
|
|
/*
 * Generates the C accessor for a two-level table of 4-bit values packed
 * two per byte (even offsets in the low nibble, odd ones in the high
 * nibble, as selected by the emitted "n & 1" expression).
 *
 *    cName     - name of the generated C function
 *    cNameInd  - name of the C index array
 *    cNameConv - name of the C packed value array
 *    cMin/cMax - C expressions for the first/last covered code
 *    cBit      - C expression for the block size exponent
 *
 * The emitted function returns 0 for codes outside cMin..cMax.
 * Returns the generated C source as a string.
 */
static function index_func04( cName, cNameInd, cNameConv, cMin, cMax, cBit )

   local cCode := ""
   local cRow
   local aRows := { ;
      "static int " + cName + "( int n )", ;
      "{", ;
      " n -= " + cMin + ";", ;
      " if( n >= 0 && n <= ( " + cMax + " - " + cMin + " ) )", ;
      " {", ;
      " HB_BYTE v;", ;
      " v = " + cNameConv + "[ ( " + cNameInd + "[ n >> " + cBit + " ] << ( " + cBit + " - 1 ) ) +", ;
      space( len( cNameConv ) + 12 ) + "( ( n & ( ( 1 << " + cBit + " ) - 1 ) ) >> 1 ) ];", ;
      " return n & 1 ? v >> 4 : v & 0x0F;", ;
      " }", ;
      " return 0;", ;
      "}" }

   /* every generated row is terminated with the platform EOL */
   for each cRow in aRows
      cCode += cRow + hb_eol()
   next

   return cCode
|
|
|
|
/*
 * Searches for the cheapest two-level layout of a 16-bit table.
 *
 *    aVal - table values
 *    nMin - first covered code; when passed by reference it receives the
 *           starting code of the best layout found
 *    nMax - last covered code
 *    nBit - when passed by reference receives the block size exponent
 *           (1..16) of the best layout found
 *
 * With DO_START_OPT defined every starting code from 0 up to nMin is
 * tried, otherwise only nMin itself. Returns the smallest size reported
 * by calc_size16().
 */
function min_size16( aVal, nMin, nMax, nBit )

   local nFirst, nStart, nExp, nCost
   local nBest := 0xFFFFFF
   local nBestStart := nMin

#ifdef DO_START_OPT
   nFirst := 0
#else
   nFirst := nMin
#endif

   for nStart := nFirst to nMin
      for nExp := 1 to 16
         nCost := calc_size16( aVal, nStart, nMax, nExp )
         if nCost < nBest
            nBest := nCost
            nBit := nExp
            nBestStart := nStart
         endif
      next
   next
   nMin := nBestStart

   return nBest
|
|
|
|
/*
 * Builds the two-level representation of a 16-bit table and returns its
 * total size in bytes.
 *
 *    aVal       - table values; code 0 is always stored as 0
 *    nMin, nMax - first and last covered code
 *    nBit       - block size exponent, a full block is 2 ^ ( nBit + 1 )
 *                 bytes long (two bytes per entry)
 *    hVal       - (by reference) receives a hash of the distinct blocks,
 *                 each block being both the key and the value
 *    aInd       - (by reference) receives for every consecutive block the
 *                 zero based position of that block inside hVal
 *    nn         - (by reference) receives the index item size: 2 when
 *                 aInd has more than 256 items, otherwise 1
 *
 * NOTE(review): the positions stored in aInd stay valid only if adding
 * new keys does not move existing hash items - confirm the hash ordering
 * mode used by the runtime.
 */
function calc_size16( aVal, nMin, nMax, nBit, hVal, aInd, nn )

   local nBlockLen := int( 2 ^ ( nBit + 1 ) )
   local cBlock := ""
   local cKnown, nCode, nTotal

   hVal := {=>}
   aInd := {}

   for nCode := nMin to nMax
      cBlock += i2bin( iif( nCode == 0, 0, aVal[ nCode ] ) )
      if len( cBlock ) == nBlockLen
         hVal[ cBlock ] := cBlock
         aadd( aInd, hb_hpos( hVal, cBlock ) - 1 )
         cBlock := ""
      endif
   next

   /* incomplete trailing block: reuse the first stored block which the
    * '=' comparison accepts for it, otherwise store it as it is
    */
   if len( cBlock ) > 0
      for each cKnown in hVal
         if cKnown = cBlock
            cBlock := cKnown
            exit
         endif
      next
      hVal[ cBlock ] := cBlock
      aadd( aInd, hb_hpos( hVal, cBlock ) - 1 )
   endif

   nn := iif( len( aInd ) > 256, 2, 1 )

   /* index table size plus the size of all distinct blocks */
   nTotal := len( aInd ) * nn
   for each cKnown in hVal
      nTotal += len( cKnown )
   next

   return nTotal
|
|
|
|
/*
 * Searches for the cheapest two-level layout of a 4-bit table.
 *
 *    aVal - table values
 *    nMin - first covered code; when passed by reference it receives the
 *           starting code of the best layout found
 *    nMax - last covered code
 *    nBit - when passed by reference receives the block size exponent
 *           (1..16) of the best layout found
 *
 * With DO_START_OPT defined every starting code from 0 up to nMin is
 * tried, otherwise only nMin itself. Returns the smallest size reported
 * by calc_size04().
 */
function min_size04( aVal, nMin, nMax, nBit )

   local nFirst, nStart, nExp, nCost
   local nBest := 0xFFFFFF
   local nBestStart := nMin

#ifdef DO_START_OPT
   nFirst := 0
#else
   nFirst := nMin
#endif

   for nStart := nFirst to nMin
      for nExp := 1 to 16
         nCost := calc_size04( aVal, nStart, nMax, nExp )
         if nCost < nBest
            nBest := nCost
            nBit := nExp
            nBestStart := nStart
         endif
      next
   next
   nMin := nBestStart

   return nBest
|
|
|
|
/*
 * Builds the two-level representation of a 4-bit table and returns its
 * total size in bytes.
 *
 *    aVal       - table values; code 0 is always stored as 0
 *    nMin, nMax - first and last covered code; codes are taken in pairs
 *                 and packed into one byte, the first one in the low
 *                 nibble and the following one multiplied by 16
 *    nBit       - block size exponent, a full block is 2 ^ ( nBit - 1 )
 *                 bytes long
 *    hVal       - (by reference) receives a hash of the distinct blocks,
 *                 each block being both the key and the value
 *    aInd       - (by reference) receives for every consecutive block the
 *                 zero based position of that block inside hVal
 *    nn         - (by reference) receives the index item size: 2 when
 *                 aInd has more than 256 items, otherwise 1
 *
 * NOTE(review): the positions stored in aInd stay valid only if adding
 * new keys does not move existing hash items - confirm the hash ordering
 * mode used by the runtime.
 */
function calc_size04( aVal, nMin, nMax, nBit, hVal, aInd, nn )

   local nBlockLen := int( 2 ^ ( nBit - 1 ) )
   local cBlock := ""
   local cKnown, nCode, nTotal

   hVal := {=>}
   aInd := {}

   for nCode := nMin to nMax step 2
      cBlock += chr( iif( nCode == 0, 0, aVal[ nCode ] ) + aVal[ nCode + 1 ] * 16 )
      if len( cBlock ) == nBlockLen
         hVal[ cBlock ] := cBlock
         aadd( aInd, hb_hpos( hVal, cBlock ) - 1 )
         cBlock := ""
      endif
   next

   /* incomplete trailing block: reuse the first stored block which the
    * '=' comparison accepts for it, otherwise store it as it is
    */
   if len( cBlock ) > 0
      for each cKnown in hVal
         if cKnown = cBlock
            cBlock := cKnown
            exit
         endif
      next
      hVal[ cBlock ] := cBlock
      aadd( aInd, hb_hpos( hVal, cBlock ) - 1 )
   endif

   nn := iif( len( aInd ) > 256, 2, 1 )

   /* index table size plus the size of all distinct blocks */
   nTotal := len( aInd ) * nn
   for each cKnown in hVal
      nTotal += len( cKnown )
   next

   return nTotal
|
|
|
|
/*
 * Looks up code n in a two-level 16-bit table (aInd = block index,
 * aVal = flattened blocks, 2 ^ nBit entries per block).
 * Returns 0 when n is outside nMin..nMax.
 */
static function conv_get16( n, aInd, aVal, nMin, nMax, nBit )

   local nBlockSize, nOfs

   if n < nMin .or. n > nMax
      return 0
   endif

   nBlockSize := 2 ^ nBit
   nOfs := n - nMin

   /* NOTE(review): nOfs / nBlockSize is usually fractional, the lookup
    * relies on the array subscript ignoring the fraction - confirm
    */
   return aVal[ aInd[ nOfs / nBlockSize + 1 ] * nBlockSize + nOfs % nBlockSize + 1 ]
|
|
|
|
/*
 * Verifies that the two-level table (aInd, aVal) decodes every item of
 * aConv to its original value. The first mismatch is reported and then
 * BREAK is executed.
 */
static function check_conv16( aConv, aInd, aVal, nMin, nMax, nBit )

   local nCode := 0
   local nDecoded

   do while ++nCode <= len( aConv )
      nDecoded := conv_get16( nCode, aInd, aVal, nMin, nMax, nBit )
      if aConv[ nCode ] != nDecoded
         ? "Wrong decoding:", nCode, aConv[ nCode ], nDecoded, len( aConv ), nMax
         break
      endif
   enddo

   return nil
|
|
|
|
/*
 * Looks up code n in a two-level table of 4-bit values packed two per
 * byte (aInd = block index, aVal = flattened packed blocks, 2 ^ nBit
 * entries per block). It mirrors the C accessor:
 *    v = val[ ( idx[ n >> BITS ] << ( BITS - 1 ) ) +
 *             ( ( n & ( ( 1 << BITS ) - 1 ) ) >> 1 ) ];
 *    return n & 1 ? v >> 4 : v & 0x0F;
 * Returns 0 when n is outside nMin..nMax.
 */
static function conv_get04( n, aInd, aVal, nMin, nMax, nBit )

   local nBlockSize, nOfs, nPacked

   if n < nMin .or. n > nMax
      return 0
   endif

   nBlockSize := int( 2 ^ nBit )
   nOfs := n - nMin

   /* NOTE(review): both subscripts may be fractional, the lookup relies
    * on the array subscript ignoring the fraction - confirm
    */
   nPacked := aVal[ aInd[ nOfs / nBlockSize + 1 ] * nBlockSize / 2 + ( nOfs % nBlockSize ) / 2 + 1 ]

   /* odd offsets live in the high nibble, even ones in the low nibble */
   if nOfs % 2 == 1
      return int( nPacked / 16 )
   endif

   return hb_bitAnd( nPacked, 0x0F )
|
|
|
|
/*
 * Verifies that the packed two-level table (aInd, aVal) decodes every
 * item of aConv to its original value. Every mismatch is reported; the
 * scan always continues to the end of aConv.
 */
static function check_conv04( aConv, aInd, aVal, nMin, nMax, nBit )

   local nCode := 0
   local nDecoded

   do while ++nCode <= len( aConv )
      nDecoded := conv_get04( nCode, aInd, aVal, nMin, nMax, nBit )
      if aConv[ nCode ] != nDecoded
         ? "Wrong decoding:", nCode, aConv[ nCode ], nDecoded, len( aConv ), nMax
      endif
   enddo

   return nil
|