summaryrefslogtreecommitdiffstats
path: root/modules/misc_cleanup/usfm_charmap.pl
blob: 305ce94e296c104d1fa8654a29e78e3be4a17834 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/perl

## Licensed under the standard BSD license:

# Copyright (c) 2002-2011 CrossWire Bible Society <http://www.crosswire.org/>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
#        notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in
#       the documentation and/or other materials provided with the
#       distribution.
#     * Neither the name of the CrossWire Bible Society nor the names of
#       its contributors may be used to endorse or promote products
#       derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

## For general inquiries, comments, suggestions, bug reports, etc. email:
## sword-support@crosswire.org

#########################################################################

use strict;
use Unicode::UCD 'charinfo';
binmode (STDOUT,":utf8");

## Obtain arguments
if (scalar(@ARGV) < 1) {
    print "\nusfm_charmap.pl <usfmfile> [-o outputfile]\n\n"; 
    print "- prints a list of characters in text parts of an USFM file, ignoring tags etc\n";
    print "- Arguments in braces < > are required. Arguments in brackets [ ] are optional.\n";
    print "- If no -o option is specified the output goes to <STDOUT>.\n";
    exit (-1);
}

my $file = @ARGV[0];
my $outputFilename;
my %list;

if (@ARGV[1] eq "-o") {
    $outputFilename = "@ARGV[2]";
    open (OUTF , ">:utf8", "$outputFilename") or die "Could not open file @ARGV[2] for writing.";
    select(OUTF);
}

## Initialise USFM file
open INF, "<:utf8",$file;
my @doc = <INF>;
close INF;
my $doc=join("\n",@doc);
# count out the characters in text nodes only

$doc =~ s/\\id (.+)?\n//g;
$doc =~ s/\\rem (.+)?\n//g;
$doc =~ s/\\c [0-9]+\s*\n//g;
$doc =~ s/\\v [0-9]+(-[0-9]+)?\s+//g;
$doc =~ s/\\[a-z]+\s*\n//g;
$doc =~ s/\\[a-z]+([0-9])?\ //g;
$doc =~ s/\\[a-z]+([0-9])?\*//g;

my @complete = split(//,$doc);

foreach (@complete) {
    my $char=$_;
    $list{$char}++;
}
# print results

foreach my $key(sort keys %list) {
  my $c;
  if ($key =~ /\p{Cc}/) { $c = " "}
  else { $c = $key }
  
  my $ci = charinfo(ord($key)); 
        
  print "\t".$c."\tU+".$ci->{'code'}."\t".$list{$key}."\t".$ci->{'script'}."\t".$ci->{'name'}."\n";
  }