syslinux/codepage/cptable.pl
<<
>>
Prefs
   1#!/usr/bin/perl
   2#
   3# Produce a codepage matching table.  For each 8-bit character, list
   4# a primary and an alternate match (the latter used for case-insensitive
   5# matching.)
   6#
   7# Usage:
   8#       cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp
   9#
  10# Note: for the format of the UnicodeData file, see:
  11# http://www.unicode.org/Public/UNIDATA/UCD.html
  12#
  13
  14($ucd, $cpco, $cpfs, $cpout) = @ARGV;
  15
  16if (!defined($cpout)) {
  17    die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n";
  18}
  19
  20%ucase   = ();
  21%lcase   = ();
  22%tcase   = ();
  23%decomp  = ();
  24
  25open(UCD, '<', $ucd)
  26    or die "$0: could not open unicode data: $ucd: $!\n";
  27while (defined($line = <UCD>)) {
  28    chomp $line;
  29    @f = split(/;/, $line);
  30    $n = hex $f[0];
  31    $ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n;
  32    $lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n;
  33    $tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n;
  34    if ($f[5] =~ /^[0-9A-F\s]+$/) {
  35        # This character has a canonical decomposition.
  36        # The regular expression rejects angle brackets, so other
  37        # decompositions aren't permitted.
  38        $decomp{$n} = [];
  39        foreach my $dch (split(' ', $f[5])) {
  40            push(@{$decomp{$n}}, hex $dch);
  41        }
  42    }
  43}
  44close(UCD);
  45
  46#
  47# Filesystem and console codepages.  The filesystem codepage is used
  48# for FAT shortnames, whereas the console codepage is whatever is used
  49# on the screen and keyboard.
  50#
  51@xtab = (undef) x 256;
  52%tabx = ();
  53open(CPFS, '<', $cpfs)
  54    or die "$0: could not open fs codepage: $cpfs: $!\n";
  55while (defined($line = <CPFS>)) {
  56    $line =~ s/\s*(\#.*|)$//;
  57    @f = split(/\s+/, $line);
  58    next if (scalar @f != 2);
  59    next if (hex $f[0] > 255);
  60    $xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
  61    $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
  62}
  63close(CPFS);
  64
  65@ytab = (undef) x 256;
  66%taby = ();
  67open(CPCO, '<', $cpco)
  68    or die "$0: could not open console codepage: $cpco: $!\n";
  69while (defined($line = <CPCO>)) {
  70    $line =~ s/\s*(\#.*|)$//;
  71    @f = split(/\s+/, $line);
  72    next if (scalar @f != 2);
  73    next if (hex $f[0] > 255);
  74    $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
  75    $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
  76}
  77close(CPCO);
  78
  79open(CPOUT, '>', $cpout)
  80    or die "$0: could not open output file: $cpout: $!\n";
  81#
  82# Magic number, in anticipation of being able to load these
  83# files dynamically...
  84#
  85print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1);
  86
  87# Header fields available for future use...
  88print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0);
  89
  90#
  91# Self (shortname) uppercase table.
  92# This depends both on the console codepage and the filesystem codepage;
  93# the logical transcoding operation is:
  94#
  95# $tabx{$ucase{$ytab[$i]}}
  96#
  97# ... where @ytab is console codepage -> Unicode and
  98# %tabx is Unicode -> filesystem codepage.
  99#
 100@uctab = (undef) x 256;
 101for ($i = 0; $i < 256; $i++) {
 102    $uuc = $ucase{$ytab[$i]};   # Unicode upper case
 103    if (defined($tabx{$uuc})) {
 104        # Straight-forward conversion
 105        $u = $tabx{$uuc};
 106    } elsif (defined($tabx{${$decomp{$uuc}}[0]})) {
 107        # Upper case equivalent stripped of accents
 108        $u = $tabx{${$decomp{$uuc}}[0]};
 109    } else {
 110        # No equivalent at all found.  Assume it is a lower-case-only
 111        # character, like greek alpha in CP437.
 112        $u = $i;
 113    }
 114    $uctab[$i] = $u;
 115    print CPOUT pack("C", $u);
 116}
 117
 118#
 119# Self (shortname) lowercase table.
 120# This depends both on the console codepage and the filesystem codepage;
 121# the logical transcoding operation is:
 122#
 123# $taby{$lcase{$xtab[$i]}}
 124#
 125# ... where @ytab is console codepage -> Unicode and
 126# %tabx is Unicode -> filesystem codepage.
 127#
 128@lctab = (undef) x 256;
 129for ($i = 0; $i < 256; $i++) {
 130    $llc = $lcase{$xtab[$i]};   # Unicode lower case
 131    if (defined($l = $taby{$llc}) && $uctab[$l] == $i) {
 132        # Straight-forward conversion
 133    } elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) {
 134        # Lower case equivalent stripped of accents
 135    } else {
 136        # No equivalent at all found.  Find *anything* that matches the
 137        # bijection criterion...
 138        for ($l = 0; $l < 256; $l++) {
 139            last if ($uctab[$l] == $i);
 140        }
 141        $l = $i if ($l == 256); # If nothing, we're screwed anyway...
 142    }
 143    $lctab[$i] = $l;
 144    print CPOUT pack("C", $l);
 145}
 146
 147#
 148# Unicode (longname) matching table.
 149# This only depends on the console codepage.
 150#
 151$pp0 = '';  $pp1 = '';
 152for ($i = 0; $i < 256; $i++) {
 153    if (!defined($ytab[$i])) {
 154        $p0 = $p1 = 0xffff;
 155    } else {
 156        $p0 = $ytab[$i];
 157        if ($ucase{$p0} != $p0) {
 158            $p1 = $ucase{$p0};
 159        } elsif ($lcase{$p0} != $p0) {
 160            $p1 = $lcase{$p0};
 161        } elsif ($tcase{$p0} != $p0) {
 162            $p1 = $tcase{$p0};
 163        } else {
 164            $p1 = $p0;
 165        }
 166    }
 167    # Only the BMP is supported...
 168    $p0 = 0xffff if ($p0 > 0xffff);
 169    $p1 = 0xffff if ($p1 > 0xffff);
 170    $pp0 .= pack("v", $p0);
 171    $pp1 .= pack("v", $p1);
 172}
 173print CPOUT $pp0, $pp1;
 174close (CPOUT);
 175
 176    
 177