#!/usr/bin/perl ## Licensed under the standard BSD license: # Copyright (c) 2002-2011 CrossWire Bible Society # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of the CrossWire Bible Society nor the names of # its contributors may be used to endorse or promote products # derived from this software without specific prior written # permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A # PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ## For general inquiries, comments, suggestions, bug reports, etc. email: ## sword-support@crosswire.org ######################################################################### use strict; use Unicode::UCD 'charinfo'; binmode (STDOUT,":utf8"); ## Obtain arguments if (scalar(@ARGV) < 1) { print "\nusfm_charmap.pl [-o outputfile]\n\n"; print "- prints a list of characters in text parts of an USFM file, ignoring tags etc\n"; print "- Arguments in braces < > are required. Arguments in brackets [ ] are optional.\n"; print "- If no -o option is specified the output goes to .\n"; exit (-1); } my $file = @ARGV[0]; my $outputFilename; my %list; if (@ARGV[1] eq "-o") { $outputFilename = "@ARGV[2]"; open (OUTF , ">:utf8", "$outputFilename") or die "Could not open file @ARGV[2] for writing."; select(OUTF); } ## Initialise USFM file open INF, "<:utf8",$file; my @doc = ; close INF; my $doc=join("\n",@doc); # count out the characters in text nodes only $doc =~ s/\\id (.+)?\n//g; $doc =~ s/\\rem (.+)?\n//g; $doc =~ s/\\c [0-9]+\s*\n//g; $doc =~ s/\\v [0-9]+(-[0-9]+)?\s+//g; $doc =~ s/\\[a-z]+\s*\n//g; $doc =~ s/\\[a-z]+([0-9])?\ //g; $doc =~ s/\\[a-z]+([0-9])?\*//g; my @complete = split(//,$doc); foreach (@complete) { my $char=$_; $list{$char}++; } # print results foreach my $key(sort keys %list) { my $c; if ($key =~ /\p{Cc}/) { $c = " "} else { $c = $key } my $ci = charinfo(ord($key)); print "\t".$c."\tU+".$ci->{'code'}."\t".$list{$key}."\t".$ci->{'script'}."\t".$ci->{'name'}."\n"; }