blob: e3b1ad2490d79c81a9069922314885b915045e25 [file] [log] [blame]
mjse8e12992001-12-14 06:57:51 +00001#!/usr/bin/perl -w
2
darina3fcbc12006-03-29 15:09:35 +00003# Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc. All rights reserved.
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions
7# are met:
8#
9# 1. Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11# 2. Redistributions in binary form must reproduce the above copyright
12# notice, this list of conditions and the following disclaimer in the
13# documentation and/or other materials provided with the distribution.
14# 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
15# its contributors may be used to endorse or promote products derived
16# from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
19# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
22# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
mjse8e12992001-12-14 06:57:51 +000029use strict;
30
# Maps every normalized charset name (canonical name or alias) read from the
# IANA charsets file to the shared, sorted list of all names in its entry.
# Filled in by process_iana_charset.
my %aliasesFromCharsetsFile;

# Names that emit_line has already written, used to report duplicates.
my %namesWritten;

# $output accumulates the generated table rows; $error is set once any
# problem has been reported, which makes the script exit with status 1.
my ($output, $error) = ("", 0);
37
# Report a problem on STDERR (followed by a newline) and record that the
# run has failed by setting the file-level $error flag.
# The ($) prototype is kept so that existing paren-less calls such as
#   error "message" if $condition;
# keep parsing exactly as before.
sub error ($)
{
    my ($message) = @_;

    print STDERR $message, "\n";
    $error = 1;
}
43
# Append one row to the generated table, mapping $name to the encoding
# constant spelled "$prefix$encoding". A name that was already written is
# reported through error(), but the row is still appended.
# $flags is accepted because callers pass it; it is not part of the row.
sub emit_line
{
    my ($name, $prefix, $encoding, $flags) = @_;

    if ($namesWritten{$name}) {
        error("$name shows up twice in output");
    }
    $namesWritten{$name} = 1;

    $output .= " { \"$name\", $prefix$encoding },\n";
}
mjse8e12992001-12-14 06:57:51 +000053
# Read the platform encodings file and append a table row (via emit_line)
# for every name that should map to each platform encoding.
#
# Parameters:
#   $filename       - path of the platform encodings file to read.
#   $PlatformPrefix - text placed directly in front of each platform
#                     encoding name in the emitted rows.
#
# Line formats, after "#" comments and trailing whitespace are removed:
#   PlatformName[, flags]: first-name, alias, alias, ...
#   PlatformName[, flags]       (no names; only recorded for duplicate checks)
# Empty lines are ignored; any other line is reported as a syntax error.
#
# Problems in the data are reported through error(); the function dies only
# when the file cannot be opened.
sub process_platform_encodings
{
    my ($filename, $PlatformPrefix) = @_;
    my $baseFilename = $filename;
    $baseFilename =~ s|.*/||;

    my %seenPlatformNames;
    my %seenIANANames;

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as a mode or a pipe, and a failure says what went wrong.
    open my $platformEncodings, '<', $filename
        or die "Cannot open $filename: $!";

    while (<$platformEncodings>) {
        chomp;
        s/\#.*$//;
        s/\s+$//;
        if (my ($PlatformName, undef, $flags, $IANANames) = /^(.+?)(, (.+))?: (.+)$/) {
            my %aliases;

            # Duplicate detection is keyed on the name together with its
            # flags, so the same name with different flags is not a duplicate.
            my $PlatformNameWithFlags = $PlatformName;
            if ($flags) {
                $PlatformNameWithFlags .= ", " . $flags;
            } else {
                $flags = "NoEncodingFlags";
            }
            error("Platform encoding name $PlatformName is mentioned twice in $baseFilename") if $seenPlatformNames{$PlatformNameWithFlags};
            $seenPlatformNames{$PlatformNameWithFlags} = 1;

            # Build the aliases list.
            # Also check that no two names are part of the same entry in the charsets file.
            my @IANANames = split ", ", $IANANames;
            my $firstName = "";
            my $canonicalFirstName = "";
            my $prevName = "";
            for my $name (@IANANames) {
                if ($firstName eq "") {
                    # The first accepted name is emitted as written, so it
                    # may contain upper case letters, "-" and "_".
                    if ($name !~ /^[-A-Za-z0-9_]+$/) {
                        error("$name, in $baseFilename, has illegal characters in it");
                        next;
                    }
                    $firstName = $name;
                } else {
                    # Every later name must already be in normalized form
                    # and the names must be listed in ascending order.
                    if ($name !~ /^[a-z0-9]+$/) {
                        error("$name, in $baseFilename, has illegal characters in it (must be all lowercase alphanumeric)");
                        next;
                    }
                    if ($name le $prevName) {
                        error("$name comes after $prevName in $baseFilename, but everything must be in alphabetical order");
                    }
                    $prevName = $name;
                }

                # Normalized form: lower case with "-" and "_" removed.
                my $canonicalName = lc $name;
                $canonicalName =~ tr/-_//d;

                $canonicalFirstName = $canonicalName if $canonicalFirstName eq "";

                error("$name is mentioned twice in $baseFilename") if $seenIANANames{$canonicalName};
                $seenIANANames{$canonicalName} = 1;

                $aliases{$canonicalName} = 1;
                next if !$aliasesFromCharsetsFile{$canonicalName};

                # Pull in every name the charsets file lists for this entry.
                for my $alias (@{$aliasesFromCharsetsFile{$canonicalName}}) {
                    $aliases{$alias} = 1;
                }

                # Two listed names that resolve to the same charsets-file
                # entry (same array reference) are redundant. The "le" test
                # reports each such pair only once.
                # NOTE(review): $otherName is looked up as written, not in
                # normalized form, so a first name containing upper case
                # letters, "-" or "_" is never found here - confirm that
                # this is intended.
                for my $otherName (@IANANames) {
                    next if $canonicalName eq $otherName;
                    if ($aliasesFromCharsetsFile{$otherName}
                        && $aliasesFromCharsetsFile{$canonicalName} eq $aliasesFromCharsetsFile{$otherName}
                        && $canonicalName le $otherName) {
                        error("$baseFilename lists both $name and $otherName under $PlatformName, but that aliasing is already specified in character-sets.txt");
                    }
                }
            }

            # write out
            # The first name goes out as written; the remaining names go out
            # in sorted order, skipping the normalized form of the first one.
            emit_line($firstName, $PlatformPrefix, $PlatformName, $flags);
            for my $alias (sort keys %aliases) {
                emit_line($alias, $PlatformPrefix, $PlatformName, $flags) if $alias ne $canonicalFirstName;
            }
        } elsif (/^([a-zA-Z0-9_]+)(, (.+))?$/) {
            # A platform encoding with no names: nothing is emitted, the
            # name is only remembered (without flags) for duplicate checks.
            my $PlatformName = $1;

            error("Platform encoding name $PlatformName is mentioned twice in $baseFilename") if $seenPlatformNames{$PlatformName};
            $seenPlatformNames{$PlatformName} = 1;
        } elsif (/./) {
            # Name the file that was actually read, as the other messages do.
            error("syntax error in $baseFilename, line $.");
        }
    }

    close $platformEncodings;
}
145
# Register one entry from the IANA charsets file: every name in the entry
# (the canonical name plus its aliases) is mapped to the same sorted list
# of all the entry's names. Does nothing when $canonical_name is false,
# which is the case before the first entry has been read.
sub process_iana_charset
{
    my ($canonical_name, @aliases) = @_;

    return unless $canonical_name;

    my @names = sort { $a cmp $b } ($canonical_name, @aliases);

    # A single shared reference is stored under every name; the platform
    # encodings pass compares these references to tell whether two names
    # belong to the same entry.
    @aliasesFromCharsetsFile{@names} = (\@names) x scalar(@names);

    return;
}
158
# Read the IANA charsets file and register each entry through
# process_iana_charset.
#
# Parameters:
#   $filename - path of the charsets file to read.
#
# Only lines containing "Name: <name>" or "Alias: <name>" are used. Names
# are normalized to lower case with everything except a-z and 0-9 removed.
# A "Name:" line starts a new entry; "Alias:" lines add to the current one.
#
# Duplicate names are reported through error(); the function dies only when
# the file cannot be opened.
sub process_iana_charsets
{
    my ($filename) = @_;

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as a mode or a pipe, and a failure says what went wrong.
    open my $charsets, '<', $filename
        or die "Cannot open $filename: $!";

    # Maps each normalized name seen so far to the canonical name of the
    # entry it was (most recently) seen in.
    my %seen;

    my $canonical_name;
    my @aliases;

    # Aliases that may appear under more than one entry without an error.
    my %exceptions = ( isoir91 => 1, isoir92 => 1 );

    while (<$charsets>) {
        chomp;
        if ((my $new_canonical_name) = /Name: ([^ \t]*).*/) {
            $new_canonical_name = lc $new_canonical_name;
            $new_canonical_name =~ tr/a-z0-9//cd;

            error("saw $new_canonical_name twice in character-sets.txt") if $seen{$new_canonical_name};
            $seen{$new_canonical_name} = $new_canonical_name;

            # A new entry begins, so flush the one collected so far (this is
            # a no-op before the first entry).
            process_iana_charset($canonical_name, @aliases);

            $canonical_name = $new_canonical_name;
            @aliases = ();
        } elsif ((my $new_alias) = /Alias: ([^ \t]*).*/) {
            $new_alias = lc $new_alias;
            $new_alias =~ tr/a-z0-9//cd;

            # do this after normalizing the alias, sometimes character-sets.txt
            # has weird escape characters, e.g. \b after None
            next if $new_alias eq "none";

            error("saw $new_alias twice in character-sets.txt $seen{$new_alias}, $canonical_name") if $seen{$new_alias} && $seen{$new_alias} ne $canonical_name && !$exceptions{$new_alias};

            # A name that was already seen is not added to this entry again.
            push @aliases, $new_alias if !$seen{$new_alias};
            $seen{$new_alias} = $canonical_name;
        }
    }

    # Flush the final entry.
    process_iana_charset($canonical_name, @aliases);

    close $charsets;
}
203
# Program body
#
# Arguments: <IANA charsets file> <platform encodings file> <platform prefix>

my ($charsetsFile, $platformEncodingsFile, $platformPrefix) = @ARGV;

# The charsets file is read first because the platform encodings pass looks
# names up in the alias table that it builds.
process_iana_charsets($charsetsFile);
process_platform_encodings($platformEncodingsFile, $platformPrefix);

# Any reported problem suppresses the output and fails the run.
if ($error) {
    exit 1;
}

print <<EOF;
// File generated by make-charset-table.pl. Do not edit!

#include "config.h"
#include "CharsetData.h"

namespace WebCore {

 const CharsetEntry CharsetTable[] = {
$output
 { 0, 0 }
 };

}
EOF