created directory with DM's ISO 639-3 parsing Perl script

git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@232 07627401-56e2-0310-80f4-f8cd0041bdcd
author: Chris Little <chrislit@crosswire.org> 2009-11-10 07:46:41 +0000
committer: Chris Little <chrislit@crosswire.org> 2009-11-10 07:46:41 +0000
commit: f3e7b8d748d47e62d0a043022b46e084be8f8824 (patch)
tree: 3e5df0827036c94e0c702c9400afe460ce1f6bfc /locales/makeCodeList.pl
parent: eaae68469d37ebdd404b571dbfa159a45574ed27 (diff)
download: sword-tools-f3e7b8d748d47e62d0a043022b46e084be8f8824.tar.gz
1 files changed, 80 insertions, 0 deletions
diff --git a/locales/makeCodeList.pl b/locales/makeCodeList.pl
new file mode 100644
index 0000000..f02751e
--- /dev/null
+++ b/locales/makeCodeList.pl
@@ -0,0 +1,80 @@
+#!/usr/bin/perl
+# This script creates a Java properties file from SIL's ISO 639-3 files.
+# Those files change frequently in both content and layout.
+# Adjust this program as needed.
+#
+# The files are currently downloaded from:
+#       http://www.sil.org/iso639-3/iso-639-3_20090210.tab
+#       http://www.sil.org/iso639-3/iso-639-3_Name_Index_20090210.tab
+#       http://www.sil.org/iso639-3/iso-639-3_Retirements_20090126.tab
+#
+# Run the program as:
+#       makeCodeList.pl > iso639.txt
+#
+# Sort the file if desired with:
+#       makeCodeList.pl | sort -t = -k 2 > iso639.txt
+#
+# Convert it from UTF-8 to Java's ASCII representation with:
+#       native2ascii -encoding utf-8 iso639.txt > iso639.properties
+
+use strict;
+use warnings;
+use Unicode::Normalize;
+binmode(STDOUT, ":utf8");
+
+my $nameIndex = "iso-639-3_Name_Index_20090210.tab";
+my $langCodes = "iso-639-3_20090210.tab";
+my $deadCodes = "iso-639-3_Retirements_20090126.tab";
+my %names = ();
+# Pass 1: index column 2 of the name table by the pair (column 0, column 1).
+# A failed open aborts the run instead of silently producing no output.
+open(my $nameIndexFile, "<:encoding(UTF-8)", $nameIndex)
+        or die "Cannot open $nameIndex: $!";
+my $firstLine = <$nameIndexFile>;       # skip the header line
+while (<$nameIndexFile>)
+{
+        s/\r//;                         # drop the CR of an ms-dos line ending
+        chomp();
+        next if (/^$/);                 # skip blank lines
+        $_ = NFC($_);                   # ensure it is normalized to NFC
+        my @line = split(/\t/, $_);
+        # The comma subscript joins both fields with $; into one hash key.
+        $names{$line[0],$line[1]} = $line[2];
+}
+
+# Pass 2: print code=name for every language that is not marked extinct.
+open(my $langFile, "<:encoding(UTF-8)", $langCodes)
+        or die "Cannot open $langCodes: $!";
+$firstLine = <$langFile>;               # skip the header line
+while (<$langFile>)
+{
+        s/\r//;                         # drop the CR of an ms-dos line ending
+        chomp();
+        next if (/^$/);                 # skip blank lines
+        $_ = NFC($_);                   # ensure it is normalized to NFC
+        my @line = split(/\t/, $_);
+        # exclude extinct languages
+        next if ($line[5] eq 'E');
+        # Look the name up by (column 0, column 6), matching the pass-1 key.
+        my $name = $names{$line[0],$line[6]};
+        # Column 3, when non-empty, is emitted as an additional key.
+        print "$line[3]=$name\n" if ($line[3]);
+        print "$line[0]=$name\n";
+}
+
+# Pass 3: append the codes listed in the retirements table.
+# The dead codes file is iso-8859-1. This may change at some date.
+open(my $deadFile, "<:encoding(iso-8859-1)", $deadCodes)
+        or die "Cannot open $deadCodes: $!";
+$firstLine = <$deadFile>;               # skip the header line
+while (<$deadFile>)
+{
+        s/\r//;                         # drop the CR of an ms-dos line ending
+        chomp();
+        next if (/^$/);                 # skip blank lines
+        $_ = NFC($_);                   # ensure it is normalized to NFC
+        my @line = split(/\t/, $_);
+        # Column 0 becomes the key and column 1 the value.
+        print "$line[0]=$line[1]\n";
+}
+
author	Chris Little <chrislit@crosswire.org>	2009-11-10 07:46:41 +0000
committer	Chris Little <chrislit@crosswire.org>	2009-11-10 07:46:41 +0000
commit	f3e7b8d748d47e62d0a043022b46e084be8f8824 (patch)
tree	3e5df0827036c94e0c702c9400afe460ce1f6bfc /locales/makeCodeList.pl
parent	eaae68469d37ebdd404b571dbfa159a45574ed27 (diff)
download	sword-tools-f3e7b8d748d47e62d0a043022b46e084be8f8824.tar.gz