WebCore/platform/make-charset-table.pl - WebKit - Git at Google

 #!/usr/bin/perl -w

 # Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 #
 # 1.  Redistributions of source code must retain the above copyright
 #     notice, this list of conditions and the following disclaimer.
 # 2.  Redistributions in binary form must reproduce the above copyright
 #     notice, this list of conditions and the following disclaimer in the
 #     documentation and/or other materials provided with the distribution.
 # 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
 #     its contributors may be used to endorse or promote products derived
 #     from this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 use strict;

 my %aliasesFromCharsetsFile;
 my %namesWritten;

 my $output = "";

 my $error = 0;

 sub error ($)
 {
     print STDERR @_, "\n";
     $error = 1;
 }

 sub emit_line
 {
     my ($name, $prefix, $encoding, $flags) = @_;

     error "$name shows up twice in output" if $namesWritten{$name};
     $namesWritten{$name} = 1;

     $output .= "        { \"$name\", $prefix$encoding },\n";
 }

 sub process_platform_encodings
 {
     my ($filename, $PlatformPrefix) = @_;
     my $baseFilename = $filename;
     $baseFilename =~ s|.*/||;

     my %seenPlatformNames;
     my %seenIANANames;

     open PLATFORM_ENCODINGS, $filename or die;

     while (<PLATFORM_ENCODINGS>) {
         chomp;
         s/\#.*$//;
         s/\s+$//;
 	if (my ($PlatformName, undef, $flags, $IANANames) = /^(.+?)(, (.+))?: (.+)$/) {
             my %aliases;

             my $PlatformNameWithFlags = $PlatformName;
             if ($flags) {
                 $PlatformNameWithFlags .= ", " . $flags;
             } else {
                 $flags = "NoEncodingFlags";
             }
             error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformNameWithFlags};
             $seenPlatformNames{$PlatformNameWithFlags} = 1;

             # Build the aliases list.
             # Also check that no two names are part of the same entry in the charsets file.
 	    my @IANANames = split ", ", $IANANames;
             my $firstName = "";
             my $canonicalFirstName = "";
             my $prevName = "";
             for my $name (@IANANames) {
                 if ($firstName eq "") {
                     if ($name !~ /^[-A-Za-z0-9_]+$/) {
                         error "$name, in $baseFilename, has illegal characters in it";
                         next;
                     }
                     $firstName = $name;
                 } else {
                     if ($name !~ /^[a-z0-9]+$/) {
                         error "$name, in $baseFilename, has illegal characters in it (must be all lowercase alphanumeric)";
                         next;
                     }
                     if ($name le $prevName) {
                         error "$name comes after $prevName in $baseFilename, but everything must be in alphabetical order";
                     }
                     $prevName = $name;
                 }

                 my $canonicalName = lc $name;
                 $canonicalName =~ tr/-_//d;

                 $canonicalFirstName = $canonicalName if $canonicalFirstName eq "";

                 error "$name is mentioned twice in $baseFilename" if $seenIANANames{$canonicalName};
                 $seenIANANames{$canonicalName} = 1;

                 $aliases{$canonicalName} = 1;
                 next if !$aliasesFromCharsetsFile{$canonicalName};
                 for my $alias (@{$aliasesFromCharsetsFile{$canonicalName}}) {
                     $aliases{$alias} = 1;
                 }
                 for my $otherName (@IANANames) {
                     next if $canonicalName eq $otherName;
                     if ($aliasesFromCharsetsFile{$otherName}
                         && $aliasesFromCharsetsFile{$canonicalName} eq $aliasesFromCharsetsFile{$otherName}
                         && $canonicalName le $otherName) {
                         error "$baseFilename lists both $name and $otherName under $PlatformName, but that aliasing is already specified in character-sets.txt";
                     }
                 }
             }

             # write out
             emit_line($firstName, $PlatformPrefix, $PlatformName, $flags);
             for my $alias (sort keys %aliases) {
                 emit_line($alias, $PlatformPrefix, $PlatformName, $flags) if $alias ne $canonicalFirstName;
             }
 	} elsif (/^([a-zA-Z0-9_]+)(, (.+))?$/) {
             my $PlatformName = $1;

             error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformName};
             $seenPlatformNames{$PlatformName} = 1;
         } elsif (/./) {
             error "syntax error in platform-encodings.txt, line $.";
         }
     }

     close PLATFORM_ENCODINGS;
 }

 sub process_iana_charset
 {
     my ($canonical_name, @aliases) = @_;

     return if !$canonical_name;

     my @names = sort $canonical_name, @aliases;

     for my $name (@names) {
         $aliasesFromCharsetsFile{$name} = \@names;
     }
 }

 sub process_iana_charsets
 {
     my ($filename) = @_;

     open CHARSETS, $filename or die;

     my %seen;

     my $canonical_name;
     my @aliases;

     my %exceptions = ( isoir91 => 1, isoir92 => 1 );

     while (<CHARSETS>) {
         chomp;
         if ((my $new_canonical_name) = /Name: ([^ \t]*).*/) {
             $new_canonical_name = lc $new_canonical_name;
             $new_canonical_name =~ tr/a-z0-9//cd;

             error "saw $new_canonical_name twice in character-sets.txt", if $seen{$new_canonical_name};
             $seen{$new_canonical_name} = $new_canonical_name;

             process_iana_charset $canonical_name, @aliases;

 	          $canonical_name = $new_canonical_name;
 	          @aliases = ();
         } elsif ((my $new_alias) = /Alias: ([^ \t]*).*/) {
             $new_alias = lc $new_alias;
             $new_alias =~ tr/a-z0-9//cd;

             # do this after normalizing the alias, sometimes character-sets.txt
             # has weird escape characters, e.g. \b after None
             next if $new_alias eq "none";

             error "saw $new_alias twice in character-sets.txt $seen{$new_alias}, $canonical_name", if $seen{$new_alias} && $seen{$new_alias} ne $canonical_name && !$exceptions{$new_alias};
             push @aliases, $new_alias if !$seen{$new_alias};
             $seen{$new_alias} = $canonical_name;
         }
     }

     process_iana_charset $canonical_name, @aliases;

     close CHARSETS;
 }

 # Program body

 process_iana_charsets($ARGV[0]);
 process_platform_encodings($ARGV[1], $ARGV[2]);

 exit 1 if $error;

 print <<EOF
 // File generated by make-charset-table.pl. Do not edit!

 #include "config.h"
 #include "CharsetData.h"

 namespace WebCore {

     const CharsetEntry CharsetTable[] = {
 $output
         { 0, 0 }
     };

 }
 EOF
	#!/usr/bin/perl -w

	# Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc. All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions
	# are met:
	#
	# 1. Redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer.
	# 2. Redistributions in binary form must reproduce the above copyright
	# notice, this list of conditions and the following disclaimer in the
	# documentation and/or other materials provided with the distribution.
	# 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
	# its contributors may be used to endorse or promote products derived
	# from this software without specific prior written permission.
	#
	# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
	# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
	# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	use strict;

	my %aliasesFromCharsetsFile;
	my %namesWritten;

	my $output = "";

	my $error = 0;

	sub error ($)
	{
	print STDERR @_, "\n";
	$error = 1;
	}

	sub emit_line
	{
	my ($name, $prefix, $encoding, $flags) = @_;

	error "$name shows up twice in output" if $namesWritten{$name};
	$namesWritten{$name} = 1;

	$output .= " { \"$name\", $prefix$encoding },\n";
	}

	sub process_platform_encodings
	{
	my ($filename, $PlatformPrefix) = @_;
	my $baseFilename = $filename;
	$baseFilename =~ s\|.*/\|\|;

	my %seenPlatformNames;
	my %seenIANANames;

	open PLATFORM_ENCODINGS, $filename or die;

	while (<PLATFORM_ENCODINGS>) {
	chomp;
	s/\#.*$//;
	s/\s+$//;
	if (my ($PlatformName, undef, $flags, $IANANames) = /^(.+?)(, (.+))?: (.+)$/) {
	my %aliases;

	my $PlatformNameWithFlags = $PlatformName;
	if ($flags) {
	$PlatformNameWithFlags .= ", " . $flags;
	} else {
	$flags = "NoEncodingFlags";
	}
	error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformNameWithFlags};
	$seenPlatformNames{$PlatformNameWithFlags} = 1;

	# Build the aliases list.
	# Also check that no two names are part of the same entry in the charsets file.
	my @IANANames = split ", ", $IANANames;
	my $firstName = "";
	my $canonicalFirstName = "";
	my $prevName = "";
	for my $name (@IANANames) {
	if ($firstName eq "") {
	if ($name !~ /^[-A-Za-z0-9_]+$/) {
	error "$name, in $baseFilename, has illegal characters in it";
	next;
	}
	$firstName = $name;
	} else {
	if ($name !~ /^[a-z0-9]+$/) {
	error "$name, in $baseFilename, has illegal characters in it (must be all lowercase alphanumeric)";
	next;
	}
	if ($name le $prevName) {
	error "$name comes after $prevName in $baseFilename, but everything must be in alphabetical order";
	}
	$prevName = $name;
	}

	my $canonicalName = lc $name;
	$canonicalName =~ tr/-_//d;

	$canonicalFirstName = $canonicalName if $canonicalFirstName eq "";

	error "$name is mentioned twice in $baseFilename" if $seenIANANames{$canonicalName};
	$seenIANANames{$canonicalName} = 1;

	$aliases{$canonicalName} = 1;
	next if !$aliasesFromCharsetsFile{$canonicalName};
	for my $alias (@{$aliasesFromCharsetsFile{$canonicalName}}) {
	$aliases{$alias} = 1;
	}
	for my $otherName (@IANANames) {
	next if $canonicalName eq $otherName;
	if ($aliasesFromCharsetsFile{$otherName}
	&& $aliasesFromCharsetsFile{$canonicalName} eq $aliasesFromCharsetsFile{$otherName}
	&& $canonicalName le $otherName) {
	error "$baseFilename lists both $name and $otherName under $PlatformName, but that aliasing is already specified in character-sets.txt";
	}
	}
	}

	# write out
	emit_line($firstName, $PlatformPrefix, $PlatformName, $flags);
	for my $alias (sort keys %aliases) {
	emit_line($alias, $PlatformPrefix, $PlatformName, $flags) if $alias ne $canonicalFirstName;
	}
	} elsif (/^([a-zA-Z0-9_]+)(, (.+))?$/) {
	my $PlatformName = $1;

	error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformName};
	$seenPlatformNames{$PlatformName} = 1;
	} elsif (/./) {
	error "syntax error in platform-encodings.txt, line $.";
	}
	}

	close PLATFORM_ENCODINGS;
	}

	sub process_iana_charset
	{
	my ($canonical_name, @aliases) = @_;

	return if !$canonical_name;

	my @names = sort $canonical_name, @aliases;

	for my $name (@names) {
	$aliasesFromCharsetsFile{$name} = \@names;
	}
	}

	sub process_iana_charsets
	{
	my ($filename) = @_;

	open CHARSETS, $filename or die;

	my %seen;

	my $canonical_name;
	my @aliases;

	my %exceptions = ( isoir91 => 1, isoir92 => 1 );

	while (<CHARSETS>) {
	chomp;
	if ((my $new_canonical_name) = /Name: ([^ \t])./) {
	$new_canonical_name = lc $new_canonical_name;
	$new_canonical_name =~ tr/a-z0-9//cd;

	error "saw $new_canonical_name twice in character-sets.txt", if $seen{$new_canonical_name};
	$seen{$new_canonical_name} = $new_canonical_name;

	process_iana_charset $canonical_name, @aliases;

	$canonical_name = $new_canonical_name;
	@aliases = ();
	} elsif ((my $new_alias) = /Alias: ([^ \t])./) {
	$new_alias = lc $new_alias;
	$new_alias =~ tr/a-z0-9//cd;

	# do this after normalizing the alias, sometimes character-sets.txt
	# has weird escape characters, e.g. \b after None
	next if $new_alias eq "none";

	error "saw $new_alias twice in character-sets.txt $seen{$new_alias}, $canonical_name", if $seen{$new_alias} && $seen{$new_alias} ne $canonical_name && !$exceptions{$new_alias};
	push @aliases, $new_alias if !$seen{$new_alias};
	$seen{$new_alias} = $canonical_name;
	}
	}

	process_iana_charset $canonical_name, @aliases;

	close CHARSETS;
	}

	# Program body

	process_iana_charsets($ARGV[0]);
	process_platform_encodings($ARGV[1], $ARGV[2]);

	exit 1 if $error;

	print <<EOF
	// File generated by make-charset-table.pl. Do not edit!

	#include "config.h"
	#include "CharsetData.h"

	namespace WebCore {

	const CharsetEntry CharsetTable[] = {
	$output
	{ 0, 0 }
	};

	}
	EOF