diff options
author | Peter von Kaehne <refdoc@gmx.net> | 2011-07-07 20:44:20 +0000 |
---|---|---|
committer | Peter von Kaehne <refdoc@gmx.net> | 2011-07-07 20:44:20 +0000 |
commit | a886a9646b152c9d4f7de88f19daa02ac0dbb017 (patch) | |
tree | 2661e8d5c3e65a6984c76f39c93e5bdd0944c3b8 /modules/misc_cleanup | |
parent | 90f1e14bb7379c3c0d524a92facca1e84b1da7b9 (diff) | |
download | sword-tools-a886a9646b152c9d4f7de88f19daa02ac0dbb017.tar.gz |
some small utilities to clean up OSIS files
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@331 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'modules/misc_cleanup')
-rw-r--r-- | modules/misc_cleanup/README | 8 | ||||
-rwxr-xr-x | modules/misc_cleanup/charmap.pl | 109 | ||||
-rwxr-xr-x | modules/misc_cleanup/numbers.pl | 86 | ||||
-rwxr-xr-x | modules/misc_cleanup/order.pl | 37 | ||||
-rwxr-xr-x | modules/misc_cleanup/osis_tr.pl | 92 |
5 files changed, 332 insertions, 0 deletions
diff --git a/modules/misc_cleanup/README b/modules/misc_cleanup/README new file mode 100644 index 0000000..aa3eaf0 --- /dev/null +++ b/modules/misc_cleanup/README @@ -0,0 +1,8 @@ +The scripts in this directory are meant to assist with minor clean up jobs +during module creation, mostly from USFM files (Paratext). + +As these scripts have dependencies which are not commonly fulfilled on +normal machines, and also are often not needed in the first place, they +are kept out of usfm2osis.pl + +These scripts are maintained by Peter von Kaehne (refdoc@crosswire.org) diff --git a/modules/misc_cleanup/charmap.pl b/modules/misc_cleanup/charmap.pl new file mode 100755 index 0000000..24fb938 --- /dev/null +++ b/modules/misc_cleanup/charmap.pl @@ -0,0 +1,109 @@ +#!/usr/bin/perl + +## Licensed under the standard BSD license: + +# Copyright (c) 2002-2011 CrossWire Bible Society <http://www.crosswire.org/> +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of the CrossWire Bible Society nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +## For general inquiries, comments, suggestions, bug reports, etc. email: +## sword-support@crosswire.org + +######################################################################### +use XML::LibXML; +use strict; +use Unicode::UCD 'charinfo'; +binmode (STDOUT,":utf8"); + +## Obtain arguments +if (scalar(@ARGV) < 1) { + print "\ncharmap.pl <osisfile> [-o outputfile]\n\n"; + print "- prints a list of characters in text nodes of an OSIS file, ignoring tags etc\n"; + print "- Arguments in braces < > are required. 
Arguments in brackets [ ] are optional.\n"; + print "- If no -o option is specified the output goes to <STDOUT>.\n"; + exit (-1); +} + +my $file = @ARGV[0]; +my $outputFilename; +my %list; + +if (@ARGV[1] eq "-o") { + $outputFilename = "@ARGV[2]"; + open (OUTF , ">:utf8", "$outputFilename") or die "Could not open file @ARGV[2] for writing."; + select(OUTF); +} + +## Initialise OSIS file + +my $parser = XML::LibXML->new(); +my $doc = $parser->parse_file($file); + +# count out the characters in text nodes only + +&text_nodes($doc); + +# print results + +foreach my $key(sort keys %list) { + my $c; + if ($key =~ /\p{Cc}/) { $c = " "} + else { $c = $key } + + my $ci = charinfo(ord($key)); + + print "\t".$c."\tU+".$ci->{'code'}."\t".$list{$key}."\t".$ci->{'script'}."\t".$ci->{'name'}."\n"; + } + +########################################## + +sub text_nodes(){ + my $node = @_[0]; + if ($node->nodeType==XML_TEXT_NODE) { + my $text = $node->toString(); + &addTextToCounter($text); + } + else { + my @children = $node->childNodes(); + foreach (@children) { + &text_nodes($_); + } + } +} + +########################################### + +sub addTextToCounter() { + my @complete = split(//,@_[0]); + + foreach (@complete) { + my $char=$_; + $list{$char}++; + } +} diff --git a/modules/misc_cleanup/numbers.pl b/modules/misc_cleanup/numbers.pl new file mode 100755 index 0000000..567494f --- /dev/null +++ b/modules/misc_cleanup/numbers.pl @@ -0,0 +1,86 @@ +#!/usr/bin/perl + +# numbers.pl translates Western numbers into Arabic-Indic numbers in the textnodes of XML files + +## Licensed under the standard BSD license: + +# Copyright (c) 2002-2011 CrossWire Bible Society <http://www.crosswire.org/> +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of the CrossWire Bible Society nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +## For general inquiries, comments, suggestions, bug reports, etc. email: +## sword-support@crosswire.org + +######################################################################### +use XML::LibXML; +use utf8; +use strict; + +## Obtain arguments +if (scalar(@ARGV) < 1) { + print "\nnumbers.pl <osisfile> [-o outputfile]-- - fix Latin numbers in Arabic script text \n"; + print "- Arguments in braces < > are required. 
Arguments in brackets [ ] are optional.\n"; + print "- If no -o option is specified for the output filename, the default output file is: \n\t<osisfile>.new\n"; + exit (-1); +} + +my $file = @ARGV[0]; +my $nextarg = 1; +my $outputFilename; + +if (@ARGV[$nextarg] eq "-o") { + $outputFilename = "@ARGV[$nextarg+1]"; + open (OUTF, , ">", "$outputFilename") or die "Could not open file @ARGV[2] for writing."; + select(OUTF); +} + +## Initialise OSIS file + +my $parser = XML::LibXML->new(); +my $doc = $parser->parse_file($file); + +&delatinize($doc); + +print $doc->toString(); + +sub delatinize(){ + my $node = @_[0]; + if ($node->nodeType==XML_TEXT_NODE) { + my $text = $node->toString(); + $text =~ tr/[0123456789]/[۰۱۲۳۴۵۶۷۸۹]/; + $node->replaceDataString($node->toString,$text); + } + else { + my @children = $node->childNodes(); + foreach (@children) { + &delatinize($_); + } + } +} +
\ No newline at end of file diff --git a/modules/misc_cleanup/order.pl b/modules/misc_cleanup/order.pl index a650724..5aa296d 100755 --- a/modules/misc_cleanup/order.pl +++ b/modules/misc_cleanup/order.pl @@ -3,6 +3,43 @@ # order of the books of the Bible. You need to edit the script to prepare for different versifications/canons, # You also need to edit line 22 for your particular file naming scheme +## Licensed under the standard BSD license: + +# Copyright (c) 2002-2011 CrossWire Bible Society <http://www.crosswire.org/> +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of the CrossWire Bible Society nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +## For general inquiries, comments, suggestions, bug reports, etc. email: +## sword-support@crosswire.org + +######################################################################### + use strict; use warnings; use File::Copy; diff --git a/modules/misc_cleanup/osis_tr.pl b/modules/misc_cleanup/osis_tr.pl new file mode 100755 index 0000000..742e034 --- /dev/null +++ b/modules/misc_cleanup/osis_tr.pl @@ -0,0 +1,92 @@ +#!/usr/bin/perl + +# osis_tr.pl does on the textnodes of OSIS files what tr does on normal text + +## Licensed under the standard BSD license: + +# Copyright (c) 2002-2011 CrossWire Bible Society <http://www.crosswire.org/> +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of the CrossWire Bible Society nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +## For general inquiries, comments, suggestions, bug reports, etc. email: +## sword-support@crosswire.org + +######################################################################### +use XML::LibXML; +#use utf8; +use strict; + +## Obtain arguments +if (scalar(@ARGV) < 3) { + print "\nosis_tr.pl <osisfile> <in> <out> [-o outputfile]-- - exchange characters with others, only in text nodes \n"; + print "- Arguments in braces < > are required. 
Arguments in brackets [ ] are optional.\n"; + print "- If no -o option is specified, the default output is STDOUT\n"; + exit (-1); +} + +my $file = @ARGV[0]; +my $nextarg = 1; + +my $in = @ARGV[$nextarg]; +$nextarg++; + +my $out = @ARGV[$nextarg]; +$nextarg++; + +my $outputFilename; +if (@ARGV[$nextarg] eq "-o") { + $outputFilename = "@ARGV[$nextarg+1]"; + open (OUTF, , ">", "$outputFilename") or die "Could not open file @ARGV[2] for writing."; + select(OUTF); +} + +## Initialise OSIS file + +my $parser = XML::LibXML->new(); +my $doc = $parser->parse_file($file); + +&translate($doc); + +print $doc->toString(); + +sub translate(){ + my $node = @_[0]; + if ($node->nodeType==XML_TEXT_NODE) { + my $text = $node->toString(); + $text =~ tr/[$in]/[$out]/; + $node->replaceDataString($node->toString,$text); + } + else { + my @children = $node->childNodes(); + foreach (@children) { + &translate($_); + } + } +} +
\ No newline at end of file |