Authors: Tom Bishop (tbishop@wenlin.com) and Richard Cook (rscook@wenlin.com).
Updated October, 2009.
This is not a finalized specification. It is still at the "draft proposal" stage and may change.
The following Perl script implements conversion, in both directions, between code points and UTF-X encodings.
Command-line option | Set | Maximum code point | 8-bit | 16-bit | 32-bit |
---|---|---|---|---|---|
-6 (default) | UCS-M | U+10FFFF | UTF-8 | UTF-16 | UTF-32 |
-8 | UCS-G | U+7FFFFFFF | UTF-G-8 | UTF-G-16 | UTF-G-32 |
-16 | UCS-E | U+7FFFFFFFFFFFFFFF | UTF-E-8 | UTF-E-16 | UTF-E-32 |
-32, -64 or -128 | UCS-∞ | (maxNUD = 32, 64, or 128) | UTF-∞-8 | UTF-∞-16 | UTF-∞-32 |
In order to illustrate the conversion algorithm explicitly, this implementation intentionally avoids using Perl's built-in capabilities for processing Unicode. It does not require a 64-bit processor. It checks validity by verifying that the output of round-trip conversion is identical to the input. Some CGI and other details of the web-based version are not included here.
#!/usr/bin/perl -w use strict; use Time::HiRes qw(gettimeofday); my $startTime = gettimeofday; # Command line can have either "U+..." (e.g. "U+123456789"), # or a UTF code in hexadecimal, with spaces between code units # (e.g. "FE 84 A3 91 96 9E 89" for UTF-E-8, # "DD24 DED1 DEB3 DF89" for UTF-E-16, # or "F0000012 E3456789" for UTF-E-32). my $maxNUD = 6; my $utfName = 'UTF'; my $usv = '?'; my $u8 = '?'; my $u16 = '?'; my $u32 = '?'; if ($ARGV[0] eq '-6') { # max U+10FFFF shift; $maxNUD = 6; $utfName = 'UTF'; } elsif ($ARGV[0] eq '-8') { # max U+7FFFFFFF shift; $maxNUD = 8; $utfName = 'UTF-G'; } elsif ($ARGV[0] eq '-16') { # max U+7FFFFFFFFFFFFFFF shift; $maxNUD = 16; $utfName = 'UTF-E'; } elsif ($ARGV[0] eq '-32' || $ARGV[0] eq '-64' || $ARGV[0] eq '-128') { $maxNUD = shift; $maxNUD =~ s/^-//; $utfName = 'UTF-∞'; } if ($ARGV[0] =~ /^U\+/) { $usv = uc($ARGV[0]); unless ($usv =~ /^U\+[0-9A-F]+$/) { $usv = '?'; } } else { my $utf = uc(join(' ', @ARGV)); if ($utf !~ /^[0-9A-F ]+$/) { $usv = '?'; } elsif ($utf =~ /^[0-9A-F]{6}/) { $usv = ConvertUTFX32ToUSV($u32 = $utf, $maxNUD); } elsif ($utf =~ /^[0-9A-F]{4}/) { $usv = ConvertUTFX16ToUSV($u16 = $utf, $maxNUD); } elsif ($utf =~ /^[0-9A-F]{2}/) { $usv = ConvertUTFX8ToUSV($u8 = $utf, $maxNUD); } else { $usv = '?'; } } if ($usv ne '?') { $u8 = ConvertUSVToUTFX8($usv, $maxNUD) if ($u8 eq '?'); $u16 = ConvertUSVToUTFX16($usv, $maxNUD) if ($u16 eq '?'); $u32 = ConvertUSVToUTFX32($usv, $maxNUD) if ($u32 eq '?'); if ($u8 eq '?' || $u16 eq '?' || $u32 eq '?') { $usv = '?'; } } if ($usv eq '?') { print "Invalid input.\n"; } else { print "USV = $usv\n"; print "$utfName-8 = $u8\n"; print "$utfName-16 = $u16\n"; print "$utfName-32 = $u32\n"; } printf("Finished in %.5f seconds.\n", gettimeofday() - $startTime); ###### High-level routines with round-trip checking ###### sub ConvertUSVToUTFX8 { my ($usv, $maxNUD) = @_; my $utf = ConvertUSVToUTFX8LL($usv, $maxNUD); return '?' if ($utf eq '?'); return (ConvertUTFX8ToUSVLL($utf, $maxNUD) eq $usv) ? $utf : '?'; } # ConvertUSVToUTFX8 sub ConvertUSVToUTFX16 { my ($usv, $maxNUD) = @_; my $utf = ConvertUSVToUTFX16LL($usv, $maxNUD); return '?' if ($utf eq '?'); return (ConvertUTFX16ToUSVLL($utf, $maxNUD) eq $usv) ? $utf : '?'; } # ConvertUSVToUTFX16 sub ConvertUSVToUTFX32 { my ($usv, $maxNUD) = @_; my $utf = ConvertUSVToUTFX32LL($usv, $maxNUD); return '?' if ($utf eq '?'); return (ConvertUTFX32ToUSVLL($utf, $maxNUD) eq $usv) ? $utf : '?'; } # ConvertUSVToUTFX32 sub ConvertUTFX8ToUSV { my ($utf, $maxNUD) = @_; my $usv = ConvertUTFX8ToUSVLL($utf, $maxNUD); return '?' if ($usv eq '?'); return (ConvertUSVToUTFX8LL($usv, $maxNUD) eq $utf) ? $usv : '?'; } # ConvertUTFX8ToUSV sub ConvertUTFX16ToUSV { my ($utf, $maxNUD) = @_; my $usv = ConvertUTFX16ToUSVLL($utf, $maxNUD); return '?' if ($usv eq '?'); return (ConvertUSVToUTFX16LL($usv, $maxNUD) eq $utf) ? $usv : '?'; } # ConvertUTFX16ToUSV sub ConvertUTFX32ToUSV { my ($utf, $maxNUD) = @_; my $usv = ConvertUTFX32ToUSVLL($utf, $maxNUD); return '?' if ($usv eq '?'); return (ConvertUSVToUTFX32LL($usv, $maxNUD) eq $utf) ? $usv : '?'; } # ConvertUTFX32ToUSV # Following are low-level ("LL") subroutines. ####################### UTF-X-8 ####################### sub ConvertUSVToUTFX8LL { my ($h, $NUD) = USVPrep(@_); return '?' if ($h eq '?'); if ($NUD < 8 || ($NUD == 8 && $h =~ /^[1-7]/)) { return ConvertUSVToUTF8SixByte(hex($h)); } my $utf = ($NUD > 9) ? 'FF' : 'FE'; if ($NUD < 18 || ($NUD == 18 && $h =~ /^[1-7]/)) { # Make a seven-byte or thirteen-byte code. if ($NUD == 8) { # Left-pad $h with a zero to make nine digits $h = '0' . $h; } elsif ($NUD > 9 && $NUD < 18) { # Left-pad $h with zeros to make eighteen digits $h = ('0' x (18 - $NUD)) . $h; } } else { # Make fourteen-byte or longer code. my $NME = $NUD - 18; my @nmeHexArray = split(//, sprintf('%X', $NME)); $utf .= (' B4' x (@nmeHexArray - 1)); foreach my $nmeHexDigit (@nmeHexArray) { $utf .= ' A' . $nmeHexDigit; } # Left-pad $h with one or two zeros as needed to make multiple of 3 my $m = ($NUD % 3); if ($m == 2) { $h = '0' . $h; } elsif ($m == 1) { $h = '00' . $h; } } return PackThreeUdigitsPerBytePair($utf, $h); } # ConvertUSVToUTFX8LL sub ConvertUTFX8ToUSVLL { my ($utf, $maxNUD) = @_; @_ = split(/ /, $utf); my $init = $_[0]; return ConvertUTF8SixByteToUSV(@_) unless ($init eq 'FE' || $init eq 'FF'); shift; # $init = FE or FF my $USVStorageByteCount = 0; if ($init eq 'FE') { $USVStorageByteCount = 6; # seven-byte code } elsif ($_[0] =~ /^[89]/) { $USVStorageByteCount = 12; # thirteen-byte code } else { # fourteen-byte or longer code my $AxCount = 1; my $lengthByte; while (($lengthByte = shift) eq 'B4') { $AxCount++; } my $NME = 0; for (;;) { my $Ax = hex($lengthByte); return '?' unless (($Ax | 0x0F) == 0xAF); $NME = ($NME << 4) + ($Ax & 0x0F); return '?' if ($NME + 14 > $maxNUD); last if (--$AxCount == 0); $lengthByte = shift; } $USVStorageByteCount = int(($NME + 18 + 2) / 3) * 2; } return '?' unless ($USVStorageByteCount == @_); return UnpackThreeUdigitsPerBytePair($maxNUD, @_); } # ConvertUTFX8ToUSVLL sub PackThreeUdigitsPerBytePair { my ($utf, $usv) = @_; while ($usv =~ s/^(.)(.)(.)//) { my ($x, $y, $z) = (hex($1), hex($2), hex($3)); $utf .= sprintf(' %02X %02X', (0x80|($x<<2)|($y>>2)), (0x80|(($y&3)<<4)|$z)); } return $utf; } # PackThreeUdigitsPerBytePair sub UnpackThreeUdigitsPerBytePair { my $maxNUD = shift; my $usv = ''; my $NUD = 0; while (defined(my $u = shift) && defined(my $v = shift)) { $u = hex($u) & 0x3F; $v = hex($v) & 0x3F; $usv .= sprintf('%X%X%X', ($u>>2), ((($u&3)<<2)|($v>>4)), ($v&0x0F)); $NUD += 3; return '?' if ($NUD - 2 > $maxNUD); # minus 2 since leading zeros don't count } $usv =~ s/^0*//; $NUD = length($usv); return ($NUD > $maxNUD) ? '?' : "U+$usv"; } # UnpackThreeUdigitsPerBytePair # The remainder of the UTF-X-8 code just implements the original # form of UTF-8 up to six bytes for U+7FFFFFFF. sub ConvertUSVToUTF8SixByte { my $x = shift; if ($x <= 0x7F) { return sprintf("%02X", $x); } if ($x <= 0x7FF) { return sprintf("%02X %02X", 0xC0|($x>>6), 0x80|($x&0x3F)); } if ($x <= 0xFFFF) { return sprintf("%02X %02X %02X", 0xE0|($x>>12), 0x80|(($x>>6)&0x3F), 0x80|($x&0x3F)); } if ($x <= 0x1FFFFF) { return sprintf("%02X %02X %02X %02X", 0xF0|($x>>18), 0x80|(($x>>12)&0x3F), 0x80|(($x>>6)&0x3F), 0x80|($x&0x3F)); } if ($x <= 0x3FFFFFF) { return sprintf("%02X %02X %02X %02X %02X", 0xF8|($x>>24), 0x80|(($x>>18)&0x3F), 0x80|(($x>>12)&0x3F), 0x80|(($x>>6)&0x3F), 0x80|($x&0x3F)); } if ($x <= 0x7FFFFFFF) { return sprintf("%02X %02X %02X %02X %02X %02X", 0xFC|($x>>30), 0x80|(($x>>24)&0x3F), 0x80|(($x>>18)&0x3F), 0x80|(($x>>12)&0x3F), 0x80|(($x>>6)&0x3F), 0x80|($x&0x3F)); } return '?'; } # ConvertUSVToUTF8SixByte sub ConvertUTF8SixByteToUSV { my $i = ConvertUTF8SixByteToInteger(@_); if ($i < 0 || ($i >= 0xD800 && $i <= 0xDFFF)) { return '?'; } return sprintf('U+%04X', $i); } # ConvertUTF8SixByteToUSV sub ConvertUTF8SixByteToInteger { return -1 unless defined $_[0]; my $b0 = hex(shift); if ($b0 <= 0x7F) { return $b0; } return -1 unless defined $_[0]; my $b1 = hex(shift) & 0x3F; if ($b0 <= 0xDF) { return (($b0&0x1F)<<6)|$b1; } return -1 unless defined $_[0]; my $b2 = hex(shift) & 0x3F; if ($b0 <= 0xEF) { return (($b0&0xF)<<12)|($b1<<6)|$b2; } return -1 unless defined $_[0]; my $b3 = hex(shift) & 0x3F; if ($b0 <= 0xF7) { return (($b0&7)<<18)|($b1<<12)|($b2<<6)|$b3; } return -1 unless defined $_[0]; my $b4 = hex(shift) & 0x3F; if ($b0 <= 0xFB) { return (($b0&3)<<24)|($b1<<18)|($b2<<12)|($b3<<6)|$b4; } return -1 unless defined $_[0]; my $b5 = hex(shift) & 0x3F; if ($b0 <= 0xFD) { return (($b0&1)<<30)|($b1<<24)|($b2<<18)|($b3<<12)|($b4<<6)|$b5; } return -1; } # ConvertUTF8SixByteToInteger ####################### UTF-X-16 ####################### sub ConvertUSVToUTFX16LL { my ($h, $NUD) = USVPrep(@_); return '?' if ($h eq '?'); if ($NUD <= 4) { # one-unit code (BMP) return ('0' x (4 - $NUD)) . $h; } if ($NUD < 6 || ($NUD == 6 && hex($h) <= 0x10ffff)) { my $n = hex($h); # two-unit code: surrogate pair return sprintf('%04X %04X', (0xd800 | (($n - 0x10000) >> 10)), (0xdc00 | ($n & 0x03ff))); } my $utf = ''; my $nUSVUnits; my @bin = split(//, HexToBin($h)); my $bitsNeeded = @bin; # Note: $bitsNeeded >= 21 if ($bitsNeeded <= 90) { # 3 to 11 units my $nUnits = 3 + int(($bitsNeeded - 19) / 8); my $unitBitsAvail = 11 - $nUnits; my $paddingBits = $unitBitsAvail + 9 * ($nUnits - 1) - $bitsNeeded; while ($paddingBits--) { unshift(@bin, '0'); } my $unitBits = '1101110' . ('1' x ($nUnits - 3)) . '0'; while ($unitBitsAvail--) { $unitBits .= shift @bin; } $utf = BinToHex($unitBits); $nUSVUnits = $nUnits - 1; } else { # 13-or-more-unit code my $NMT = $NUD - 23; my $beforeCount = ($NMT < 256) ? 0 : int(log($NMT)/log(256)); $utf = 'DDFF' . (' DFB4' x $beforeCount); for (my $i = $beforeCount; $i >= 0; $i--) { $utf .= sprintf(' DE%02X', ($NMT >> ($i << 3)) & 0xFF); } $nUSVUnits = int(($bitsNeeded + 8) / 9); my $paddingBits = 9 * $nUSVUnits - $bitsNeeded; # < 9 my $unitBits = '1101111' . ('0' x $paddingBits); for (my $i = 9 - $paddingBits; $i--; ) { $unitBits .= shift @bin; } $utf .= ' ' . BinToHex($unitBits); $nUSVUnits--; } while ($nUSVUnits--) { my $unitBits = '1101111'; for (my $i = 9; $i--; ) { $unitBits .= shift @bin; } $utf .= ' ' . BinToHex($unitBits) } return $utf; } # ConvertUSVToUTFX16LL sub ConvertUTFX16ToUSVLL { my ($utf, $maxNUD) = @_; @_ = split(/ /, $utf); my $unit = shift; my $h = hex($unit); if ($h < 0xDC00 || $h > 0xDFFF) { if ($h < 0xD800 || $h > 0xDFFF) { return 'U+' . $unit; # 1-unit code } # 2-unit code; combine surrogates return sprintf('U+%X', 0x10000 + ((($h - 0xd800)) << 10) + hex(shift) - 0xdc00); } my $usv = ''; my $nybble = 0; my $nybbleBitCount = 0; my $codeBitsAvail; my $nUSVUnits; if ($unit ne 'DDFF') { # 3 to 11-unit code my $nUnits = 3; my $mask = (1 << 8); while (($h & $mask) != 0) { ++$nUnits; $mask >>= 1; } my $unitBitsAvail = 11 - $nUnits; $nUSVUnits = $nUnits - 1; # not including the 1st unit my $unitsRemaining = @_; if ($nUSVUnits != $unitsRemaining) { return '?'; } $codeBitsAvail = $unitBitsAvail + 9 * $nUSVUnits; my $codeBitsMod4 = ($codeBitsAvail % 4); $nybbleBitCount = $codeBitsMod4 ? (4 - $codeBitsMod4) : 0; while ($unitBitsAvail--) { $mask >>= 1; $nybble = (($nybble << 1) | (($h & $mask) ? 1 : 0)); if (++$nybbleBitCount == 4) { $usv .= sprintf('%X', $nybble); $nybbleBitCount = $nybble = 0; } } } else { # 13-or-more-unit code my $beforeCount = 0; while (($unit = shift) eq 'DFB4') { $beforeCount++; } my $NMT = hex($unit) & 0xff; for (my $i = 0; $i < $beforeCount; $i++) { $NMT = ($NMT << 8) | (hex(shift) & 0xff); } my $NUD = $NMT + 23; return '?' if ($NUD > $maxNUD); my $unitsRemaining = @_; $codeBitsAvail = 9 * $unitsRemaining; if ($codeBitsAvail < $NUD * 4 - 3) { return '?'; } my $codeBitsMod4 = ($codeBitsAvail % 4); $nybbleBitCount = $codeBitsMod4 ? (4 - $codeBitsMod4) : 0; $nUSVUnits = $unitsRemaining; } while ($nUSVUnits--) { $h = hex(shift); my $mask = (1 << 8); while ($mask) { $nybble = (($nybble << 1) | (($h & $mask) ? 1 : 0)); $mask >>= 1; if (++$nybbleBitCount == 4) { $usv .= sprintf('%X', $nybble); $nybbleBitCount = $nybble = 0; } } } my $NUD; ($h, $NUD) = USVPrep($usv, $maxNUD); return ($h eq '?') ? '?' : "U+$h"; } # ConvertUTFX16ToUSVLL sub BinToHex { # Input is 16-bit string like '1010101010101010'; output is like 'AAAA'. return sprintf('%04X', unpack('n', pack('B16', shift))); } # BinToHex sub HexToBin { my $s = shift; # Maybe very long hex string; don't convert it all to an integer. $s =~ s/0/0000/g; $s =~ s/1/0001/g; $s =~ s/2/0010/g; $s =~ s/3/0011/g; $s =~ s/4/0100/g; $s =~ s/5/0101/g; $s =~ s/6/0110/g; $s =~ s/7/0111/g; $s =~ s/8/1000/g; $s =~ s/9/1001/g; $s =~ s/A/1010/g; $s =~ s/B/1011/g; $s =~ s/C/1100/g; $s =~ s/D/1101/g; $s =~ s/E/1110/g; $s =~ s/F/1111/g; $s =~ s/^0*//; return $s; } # HexToBin ####################### UTF-X-32 ####################### sub ConvertUSVToUTFX32LL { my ($h, $NUD) = USVPrep(@_); return '?' if ($h eq '?'); if ($NUD < 8 || ($NUD == 8 && $h !~ /^[EF]/)) { # one-unit code return ('0' x (8 - $NUD)) . $h; } if ($NUD < 14 || ($NUD == 14 && $h !~ /^[EF]/)) { # two-unit code $h = ('0' x (14 - $NUD)) . $h; # insert F and E prefix nybbles and insert space between units. $h =~ s/^(.......)/F$1 E/; return $h; } my $unitsNeeded; my $utf = 'FF'; if ($NUD < 20) { # 3-unit code $unitsNeeded = 3; $h = ('0' x (20 - $NUD)) . $h; } else { # 4-unit or longer code. my $NM = $NUD - 20; my $nmHex = ($NM <= 0) ? '0' : sprintf('%X', $NM); my $beforeCount = length($nmHex) - 1; my $nybblesNeeded = 1 + 2 * ($beforeCount + 1) + $NUD; $unitsNeeded = int(($nybblesNeeded + 6) / 7); my $padding = $unitsNeeded * 7 - $nybblesNeeded; $h = ('B' x $beforeCount) . 'A' . $nmHex . ('0' x $padding) . $h; } $h =~ s/^(......)//; $utf .= $1; while (--$unitsNeeded) { $h =~ s/^(.......)//; $utf .= ' E' . $1; } return $utf; } # ConvertUSVToUTFX32LL sub ConvertUTFX32ToUSVLL { my ($utf, $maxNUD) = @_; @_ = split(/ /, $utf); my $unit = shift; # initial unit if ($unit !~ /^[EF]/) { # one unit $unit =~ s/^0*//; # remove leading zeros my $len = length($unit); if ($len < 4) { $unit = ('0' x (4 - $len)) . $unit; } return '?' if ($maxNUD == 8 && hex($unit) > 0x7FFFFFFF); return 'U+' . $unit; } return '?' if ($maxNUD == 8); if ($unit !~ /^FF/) { # two units $unit =~ s/^.//; # remove prefix my $unit2 = shift; # second unit $unit2 =~ s/^.//; # remove prefix my ($h, $NUD) = USVPrep($unit . $unit2, $maxNUD); return ($h eq '?') ? '?' : "U+$h"; } # three or more units $unit =~ s/^..//; # remove prefix my @a = split(//, $unit); while (defined ($unit = shift)) { $unit = $unit; return '?' unless ($unit =~ /^E/); $unit =~ s/^.//; # remove prefix push(@a, split(//, $unit)); } my $NUD; if ($a[0] eq '0') { # 3 units shift(@a); return '?' unless (@a == 19); $NUD = 19; } else { # 4 or more units my $beforeCount = 0; my $lengthNybble; while (($lengthNybble = shift(@a)) eq 'B') { $beforeCount++; } return '?' unless ($lengthNybble eq 'A'); my $nmHex = ''; for (my $i = 0; $i <= $beforeCount; $i++) { $nmHex .= shift(@a); } $NUD = 20 + hex($nmHex); } while ($a[0] eq '0') { shift(@a); --$NUD if ($NUD < 20); } return '?' unless ($NUD <= $maxNUD && $NUD == @a); return 'U+' . join('', @a); } # ConvertUTFX32ToUSVLL sub USVPrep { # Input: $usv, $maxNUD = 6, 8, 16, or larger power of two. # If $usv is out of range, return ('?', 0). # If $usv is OK (in range), return ($h, $NUD), # where $h is $usv without "U+" or leading zeros. my ($h, $maxNUD) = @_; $h =~ s/^U\+//; # remove "U+" (if present) $h =~ s/^0*//; # remove any leading zeros (even if there was no U+) my $NUD = length($h); # count how many udigits if ($NUD > $maxNUD || ($NUD == $maxNUD && (($NUD == 6 && hex($h) > 0x10FFFF) || ($NUD >= 8 && $h !~ /^[1-7]/)))) { return ('?', 0); # out of range } return ($h, $NUD); # in range } # USVPrep __END__