mjs | e8e1299 | 2001-12-14 06:57:51 +0000 | [diff] [blame] | 1 | #!/usr/bin/perl -w |
| 2 | |
darin | a3fcbc1 | 2006-03-29 15:09:35 +0000 | [diff] [blame] | 3 | # Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc. All rights reserved. |
| 4 | # |
| 5 | # Redistribution and use in source and binary forms, with or without |
| 6 | # modification, are permitted provided that the following conditions |
| 7 | # are met: |
| 8 | # |
| 9 | # 1. Redistributions of source code must retain the above copyright |
| 10 | # notice, this list of conditions and the following disclaimer. |
| 11 | # 2. Redistributions in binary form must reproduce the above copyright |
| 12 | # notice, this list of conditions and the following disclaimer in the |
| 13 | # documentation and/or other materials provided with the distribution. |
| 14 | # 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of |
| 15 | # its contributors may be used to endorse or promote products derived |
| 16 | # from this software without specific prior written permission. |
| 17 | # |
| 18 | # THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY |
| 19 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| 20 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| 21 | # DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
| 22 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| 23 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| 24 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| 25 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| 27 | # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 28 | |
mjs | e8e1299 | 2001-12-14 06:57:51 +0000 | [diff] [blame] | 29 | use strict; |
| 30 | |
# Normalized charset name => reference to the sorted list of every name
# (canonical name plus aliases) that the charsets file groups together.
# Filled in by process_iana_charset.
my %aliasesFromCharsetsFile = ();

# Names that have already been written to the table; lets emit_line
# detect a name that would appear twice.
my %namesWritten = ();

# Text of all table entries, accumulated by emit_line and printed at the end.
my $output = q{};

# Set to 1 by error(); a non-zero value stops the table from being printed.
my $error = 0;
| 37 | |
# Report a problem: print the message and a newline on STDERR, then set the
# file-level $error flag, which the program body checks before printing.
# The ($) prototype lets callers write `error "text" if ...` without parens.
sub error ($)
{
    my ($message) = @_;

    print STDERR $message, "\n";
    $error = 1;
}
| 43 | |
# Append one entry to the generated table text in $output.
#
#   $name     - charset name written as the entry's string
#   $prefix   - text placed directly in front of $encoding
#   $encoding - platform encoding name
#   $flags    - accepted from callers but not used when building the entry
#
# A name that was already emitted is reported through error(); the entry is
# still appended.
sub emit_line
{
    my ($name, $prefix, $encoding, $flags) = @_;

    if ($namesWritten{$name}) {
        error "$name shows up twice in output";
    }
    $namesWritten{$name} = 1;

    $output = $output . " { \"$name\", $prefix$encoding },\n";
}
mjs | e8e1299 | 2001-12-14 06:57:51 +0000 | [diff] [blame] | 53 | |
# Read a platform encodings file and emit a table entry (via emit_line) for
# every charset name it associates with a platform encoding.
#
#   $filename       - path of the platform encodings file
#   $PlatformPrefix - passed to emit_line as the prefix for each encoding name
#
# After "#" comments and trailing whitespace are stripped, each non-empty
# line must have one of two forms:
#   PlatformName[, flags]: FirstName, alias, alias, ...
#   PlatformName[, flags]
# The second form only registers the platform name for duplicate detection.
# In the first form FirstName may contain letters, digits, "-" and "_"; the
# remaining names must be lowercase alphanumeric and strictly ascending.
# Every alias recorded in %aliasesFromCharsetsFile for a listed name is
# emitted as well.  Problems in the data are reported with error(); failure
# to open the file is fatal.
sub process_platform_encodings
{
    my ($filename, $PlatformPrefix) = @_;
    my $baseFilename = $filename;
    $baseFilename =~ s|.*/||;

    my %seenPlatformNames;
    my %seenIANANames;

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as an open mode, and a failure names the file and reason.
    open my $platformEncodings, "<", $filename
        or die "cannot open $filename: $!";

    while (<$platformEncodings>) {
        chomp;
        s/\#.*$//;
        s/\s+$//;
        if (my ($PlatformName, undef, $flags, $IANANames) = /^(.+?)(, (.+))?: (.+)$/) {
            my %aliases;

            # Duplicate detection is keyed on the name together with its
            # flags, so the same name with different flags is not a duplicate.
            my $PlatformNameWithFlags = $PlatformName;
            if ($flags) {
                $PlatformNameWithFlags .= ", " . $flags;
            } else {
                $flags = "NoEncodingFlags";
            }
            error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformNameWithFlags};
            $seenPlatformNames{$PlatformNameWithFlags} = 1;

            # Build the aliases list.
            # Also check that no two names are part of the same entry in the charsets file.
            my @IANANames = split ", ", $IANANames;
            my $firstName = "";
            my $canonicalFirstName = "";
            my $prevName = "";
            for my $name (@IANANames) {
                if ($firstName eq "") {
                    if ($name !~ /^[-A-Za-z0-9_]+$/) {
                        error "$name, in $baseFilename, has illegal characters in it";
                        next;
                    }
                    $firstName = $name;
                } else {
                    if ($name !~ /^[a-z0-9]+$/) {
                        error "$name, in $baseFilename, has illegal characters in it (must be all lowercase alphanumeric)";
                        next;
                    }
                    if ($name le $prevName) {
                        error "$name comes after $prevName in $baseFilename, but everything must be in alphabetical order";
                    }
                    $prevName = $name;
                }

                # Normalized form: lowercase with "-" and "_" removed.
                my $canonicalName = lc $name;
                $canonicalName =~ tr/-_//d;

                $canonicalFirstName = $canonicalName if $canonicalFirstName eq "";

                error "$name is mentioned twice in $baseFilename" if $seenIANANames{$canonicalName};
                $seenIANANames{$canonicalName} = 1;

                $aliases{$canonicalName} = 1;
                next if !$aliasesFromCharsetsFile{$canonicalName};
                for my $alias (@{$aliasesFromCharsetsFile{$canonicalName}}) {
                    $aliases{$alias} = 1;
                }
                # NOTE(review): $otherName is used exactly as written in the
                # file, while the keys of %aliasesFromCharsetsFile are stored
                # normalized.  A first name spelled with capitals, "-" or "_"
                # therefore never matches here -- confirm this is intended.
                for my $otherName (@IANANames) {
                    next if $canonicalName eq $otherName;
                    # The "eq" compares the two array references, i.e. asks
                    # whether both names belong to the same charsets-file group.
                    if ($aliasesFromCharsetsFile{$otherName}
                        && $aliasesFromCharsetsFile{$canonicalName} eq $aliasesFromCharsetsFile{$otherName}
                        && $canonicalName le $otherName) {
                        error "$baseFilename lists both $name and $otherName under $PlatformName, but that aliasing is already specified in character-sets.txt";
                    }
                }
            }

            # write out
            emit_line($firstName, $PlatformPrefix, $PlatformName, $flags);
            for my $alias (sort keys %aliases) {
                emit_line($alias, $PlatformPrefix, $PlatformName, $flags) if $alias ne $canonicalFirstName;
            }
        } elsif (/^([a-zA-Z0-9_]+)(, (.+))?$/) {
            my $PlatformName = $1;

            error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformName};
            $seenPlatformNames{$PlatformName} = 1;
        } elsif (/./) {
            # Name the file actually being read rather than a fixed file name.
            error "syntax error in $baseFilename, line $.";
        }
    }

    close $platformEncodings;
}
| 145 | |
# Record one group of names from the charsets file.
#
#   $canonical_name - the group's canonical name
#   @aliases        - the group's other names
#
# Every name in the group becomes a key of %aliasesFromCharsetsFile, and all
# of those keys share one reference to the sorted list of the group's names.
# Nothing is recorded when $canonical_name is false.
sub process_iana_charset
{
    my ($canonical_name, @aliases) = @_;

    unless ($canonical_name) {
        return;
    }

    my @group = sort($canonical_name, @aliases);

    # Hash slice: point each member of the group at the same array.
    @aliasesFromCharsetsFile{@group} = (\@group) x scalar(@group);

    return;
}
| 158 | |
# Read the charsets file and register every charset it describes through
# process_iana_charset.
#
#   $filename - path of the charsets file
#
# A line containing "Name: X" starts a new charset; following lines
# containing "Alias: Y" add aliases to it.  Names and aliases are lowercased
# and stripped of every character other than a-z and 0-9 before use.  An
# alias that normalizes to "none" is ignored.  A name seen twice, or an alias
# already owned by a different charset (other than the listed exceptions),
# is reported with error().  Failure to open the file is fatal.
sub process_iana_charsets
{
    my ($filename) = @_;

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as an open mode, and a failure names the file and reason.
    open my $charsets, "<", $filename
        or die "cannot open $filename: $!";

    # Normalized name or alias => canonical name of the charset that owns it.
    my %seen;

    my $canonical_name;
    my @aliases;

    # Aliases exempt from the "seen twice" error below.
    my %exceptions = ( isoir91 => 1, isoir92 => 1 );

    while (<$charsets>) {
        chomp;
        if ((my $new_canonical_name) = /Name: ([^ \t]*).*/) {
            $new_canonical_name = lc $new_canonical_name;
            $new_canonical_name =~ tr/a-z0-9//cd;

            error "saw $new_canonical_name twice in character-sets.txt" if $seen{$new_canonical_name};
            $seen{$new_canonical_name} = $new_canonical_name;

            # A new "Name:" line closes the previous charset, so flush it.
            process_iana_charset $canonical_name, @aliases;

            $canonical_name = $new_canonical_name;
            @aliases = ();
        } elsif ((my $new_alias) = /Alias: ([^ \t]*).*/) {
            $new_alias = lc $new_alias;
            $new_alias =~ tr/a-z0-9//cd;

            # do this after normalizing the alias, sometimes character-sets.txt
            # has weird escape characters, e.g. \b after None
            next if $new_alias eq "none";

            error "saw $new_alias twice in character-sets.txt $seen{$new_alias}, $canonical_name" if $seen{$new_alias} && $seen{$new_alias} ne $canonical_name && !$exceptions{$new_alias};
            push @aliases, $new_alias if !$seen{$new_alias};
            $seen{$new_alias} = $canonical_name;
        }
    }

    # Flush the last charset in the file.
    process_iana_charset $canonical_name, @aliases;

    close $charsets;
}
| 203 | |
# Program body
#
# Arguments: the charsets file, the platform encodings file, and an optional
# prefix for the platform encoding names.  The generated table is printed on
# standard output.

# Without both file names the run cannot succeed; say so up front instead of
# failing later inside open.
if (@ARGV < 2) {
    die "usage: $0 character-sets-file platform-encodings-file [encoding-prefix]\n";
}

my ($charsetsFile, $platformEncodingsFile, $platformPrefix) = @ARGV;

# An omitted prefix means "no prefix"; make that explicit so building the
# entries does not produce "uninitialized value" warnings under -w.
$platformPrefix = "" if !defined $platformPrefix;

# The charsets file must be read first: processing the platform encodings
# consults the alias groups it records.
process_iana_charsets($charsetsFile);
process_platform_encodings($platformEncodingsFile, $platformPrefix);

# Any reported problem suppresses the output entirely.
exit 1 if $error;

print <<EOF;
// File generated by make-charset-table.pl. Do not edit!

#include "config.h"
#include "CharsetData.h"

namespace WebCore {

const CharsetEntry CharsetTable[] = {
$output
{ 0, 0 }
};

}
EOF