#!/usr/bin/perl
## Licensed under the standard BSD license:
# Copyright (c) 2002-2011 CrossWire Bible Society
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of the CrossWire Bible Society nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
## For general inquiries, comments, suggestions, bug reports, etc. email:
## sword-support@crosswire.org
#########################################################################
use XML::LibXML;
use strict;
use Unicode::UCD 'charinfo';
binmode (STDOUT,":utf8");
## Obtain arguments
if (scalar(@ARGV) < 1) {
print "\ncharmap.pl [-o outputfile]\n\n";
print "- prints a list of characters in text nodes of an OSIS file, ignoring tags etc\n";
print "- Arguments in braces < > are required. Arguments in brackets [ ] are optional.\n";
print "- If no -o option is specified the output goes to .\n";
exit (-1);
}
my $file = @ARGV[0];
my $outputFilename;
my %list;
if (@ARGV[1] eq "-o") {
$outputFilename = "@ARGV[2]";
open (OUTF , ">:utf8", "$outputFilename") or die "Could not open file @ARGV[2] for writing.";
select(OUTF);
}
## Initialise OSIS file
my $parser = XML::LibXML->new();
my $doc = $parser->parse_file($file);
# count out the characters in text nodes only
&text_nodes($doc);
# print results
foreach my $key(sort keys %list) {
my $c;
if ($key =~ /\p{Cc}/) { $c = " "}
else { $c = $key }
my $ci = charinfo(ord($key));
print "\t".$c."\tU+".$ci->{'code'}."\t".$list{$key}."\t".$ci->{'script'}."\t".$ci->{'name'}."\n";
}
##########################################
sub text_nodes(){
my $node = @_[0];
if ($node->nodeType==XML_TEXT_NODE) {
my $text = $node->toString();
&addTextToCounter($text);
}
else {
my @children = $node->childNodes();
foreach (@children) {
&text_nodes($_);
}
}
}
###########################################
sub addTextToCounter() {
my @complete = split(//,@_[0]);
foreach (@complete) {
my $char=$_;
$list{$char}++;
}
}