#!/usr/bin/perl ## Licensed under the standard BSD license: # Copyright (c) 2002-2011 CrossWire Bible Society # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # * Neither the name of the CrossWire Bible Society nor the names of # its contributors may be used to endorse or promote products # derived from this software without specific prior written # permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS # IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED # TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A # PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ## For general inquiries, comments, suggestions, bug reports, etc. email: ## sword-support@crosswire.org ######################################################################### use XML::LibXML; use strict; use Unicode::UCD 'charinfo'; binmode (STDOUT,":utf8"); ## Obtain arguments if (scalar(@ARGV) < 1) { print "\ncharmap.pl [-o outputfile]\n\n"; print "- prints a list of characters in text nodes of an OSIS file, ignoring tags etc\n"; print "- Arguments in braces < > are required. Arguments in brackets [ ] are optional.\n"; print "- If no -o option is specified the output goes to .\n"; exit (-1); } my $file = @ARGV[0]; my $outputFilename; my %list; if (@ARGV[1] eq "-o") { $outputFilename = "@ARGV[2]"; open (OUTF , ">:utf8", "$outputFilename") or die "Could not open file @ARGV[2] for writing."; select(OUTF); } ## Initialise OSIS file my $parser = XML::LibXML->new(); my $doc = $parser->parse_file($file); # count out the characters in text nodes only &text_nodes($doc); # print results foreach my $key(sort keys %list) { my $c; if ($key =~ /\p{Cc}/) { $c = " "} else { $c = $key } my $ci = charinfo(ord($key)); print "\t".$c."\tU+".$ci->{'code'}."\t".$list{$key}."\t".$ci->{'script'}."\t".$ci->{'name'}."\n"; } ########################################## sub text_nodes(){ my $node = @_[0]; if ($node->nodeType==XML_TEXT_NODE) { my $text = $node->toString(); &addTextToCounter($text); } else { my @children = $node->childNodes(); foreach (@children) { &text_nodes($_); } } } ########################################### sub addTextToCounter() { my @complete = split(//,@_[0]); foreach (@complete) { my $char=$_; $list{$char}++; } }