#!/usr/bin/perl
## USFM to OSIS (2.1.1) converter
## Licensed under the standard BSD license:
# Copyright (c) 2002-2012 CrossWire Bible Society <http://www.crosswire.org/>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of the CrossWire Bible Society nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
## For general inquiries, comments, suggestions, bug reports, etc. email:
## sword-support@crosswire.org
#########################################################################
use utf8;
# Stores the script version and date
$version = "1.7.2";
$date = '$Date$';
$rev = '$Rev$';
$date =~ s/^.+?(\d{4}-\d{2}-\d{2}).+/$1/;
$rev =~ s/^.+?(\d+).+/$1/g;
# Sets the version of OSIS used in the OSIS header
$osisVersion = "2.1.1";
# Stores the USFM Version
$usfmVersion = "2.2"; # The USFM reference document can be found at http://www.ubs-icap.org/chm/usfm/2.2/ (as of 2009-10-26)
# This is the hash which maps the conversion of USFM book abbreviations to OSIS book abbreviations. ***I would like to add the ability to access an external file to provide options for other languages. In other words, in preparing a USFM file for conversion, a separate file could be created which could be used to map the conversion of abbreviated book names in other languages to OSIS. This would be especially useful for cross-references, but I haven't figured out how to do it yet.
%OSISbook = (
# Theoretically, these are laid out according to <BooksPresent>, but I can really only guess without a spec ***Need to check
"" => "", "GEN" => "Gen", "EXO" => "Exod", "LEV" => "Lev", "NUM" => "Num",
"DEU" => "Deut", "JOS" => "Josh", "JDG" => "Judg", "RUT" => "Ruth",
"1SA" => "1Sam", "2SA" => "2Sam", "1KI" => "1Kgs", "2KI" => "2Kgs",
"1CH" => "1Chr", "2CH" => "2Chr", "EZR" => "Ezra", "NEH" => "Neh",
"EST" => "Esth", "JOB" => "Job", "PSA" => "Ps", "PRO" => "Prov",
"ECC" => "Eccl", "SNG" => "Song", "ISA" => "Isa", "JER" => "Jer",
"LAM" => "Lam", "EZK" => "Ezek", "DAN" => "Dan", "HOS" => "Hos",
"JOL" => "Joel", "AMO" => "Amos", "OBA" => "Obad", "JON" => "Jonah",
"MIC" => "Mic", "NAM" => "Nah", "HAB" => "Hab", "ZEP" => "Zeph",
"HAG" => "Hag", "ZEC" => "Zech", "MAL" => "Mal", "MAT" => "Matt",
"MRK" => "Mark", "LUK" => "Luke", "JHN" => "John", "ACT" => "Acts",
"ROM" => "Rom", "1CO" => "1Cor", "2CO" => "2Cor", "GAL" => "Gal",
"EPH" => "Eph", "PHP" => "Phil", "COL" => "Col", "1TH" => "1Thess",
"2TH" => "2Thess", "1TI" => "1Tim", "2TI" => "2Tim", "TIT" => "Titus",
"PHM" => "Phlm", "HEB" => "Heb", "JAS" => "Jas", "1PE" => "1Pet",
"2PE" => "2Pet", "1JN" => "1John", "2JN" => "2John", "3JN" => "3John",
"JUD" => "Jude", "REV" => "Rev", "TOB" => "Tob", "JDT" => "Jdt",
"ESG" => "Esth", "WIS" => "Wis", "SIR" => "Sir", "BAR" => "Bar",
"LJE" => "EpJer", "S3Y" => "PrAzar", "SUS" => "Sus", "BEL" => "Bel",
"1MA" => "1Macc", "2MA" => "2Macc", "3MA" => "3Macc", "4MA" => "4Macc",
"1ES" => "1Esd", "2ES" => "2Esd", "MAN" => "PrMan",
# Following this is just an uneducated guess
"PS2" => "Ps151", "ODA" => "Odes", "PSS" => "PssSol", "JSA" => "Josh",
"JSB" => "Josh", "TBS" => "Tob", "SST" => "Sus", "DNT" => "Dan",
"BLT" => "Bel", "ADE" => "AddEsth"
);
# Flag consulted by the paragraph handlers in commentary mode.
# NOTE(review): the value is the *string* "false" (the original used a
# bareword), which Perl evaluates as true in boolean context.
$divOpen = "false";
# Build the human-readable, comma-separated list of every encoding that the
# Encode module knows about; it is shown in the usage text and in the
# "encoding not supported" error.
use Encode;
@encodingList = Encode->encodings(":all");
$encodings = join(", ", @encodingList);
# Syntax instructions
# At least an osisWork name and one USFM file are required; otherwise print
# the usage text and stop.
if (scalar(@ARGV) < 2) {
    print "\nusfm2osis.pl -- USFM $usfmVersion to OSIS $osisVersion converter version $version\nRevision $rev ($date)\nSyntax: usfm2osis.pl <osisWork> [-o OSIS-file] [-e USFM encoding] [-t module type] <USFM filenames|wildcard>\n";
    print "- Arguments in braces < > are required. Arguments in brackets [ ] are optional.\n";
    print "- The osisWork is a short name with no spaces which will identify your module.\n";
    print "- If no -o option is specified for the output filename, the default output file is: \n\tosisWork.osis.xml.\n";
    print "- Supported encodings include:\n\t$encodings\n";
    print "- If the encoding is omitted, utf8 is the default value.\n";
    print "- The script can currently produce valid OSIS for usfm bibles and nearly valid OSIS for usfm commentaries. To set the module type use '-t' with either 'bible' or 'comment'. On ommision 'bible' is assumed.\n";
    print "- USFM filenames with the SFM extension can be accessed using a wildcard: \n\t*.SFM\n";
    print "As an example, if you want to generate the osisWork <bible> and your USFM files are encoded in utf8, located in the /Bible folder relative to this script with the file extension SFM, enter:\n\tperl usfm2osis.pl bible Bible/*.SFM\n\n";
    exit(-1);
}

# First positional argument: the OSIS work identifier.
$osisWork = $ARGV[0];
$nextarg = 1;

# Optional "-o <file>": output filename (defaults to <osisWork>.osis.xml).
# The options are only recognised in the fixed order -o, -e, -t.
if ($ARGV[$nextarg] eq "-o") {
    # Interpolation keeps a missing value as "" rather than undef.
    $outputFilename = "$ARGV[$nextarg+1]";
    $nextarg += 2;
}
else {
    $outputFilename = "$osisWork.osis.xml";
}

# Report the file we actually tried to open, and why it failed.
open(OUTF, ">:utf8", $outputFilename)
    or die "Could not open file $outputFilename for writing: $!";

# Optional "-e <encoding>": encoding of the USFM input (defaults to utf8).
if ($ARGV[$nextarg] eq "-e") {
    $inputEncoding = "$ARGV[$nextarg+1]";
    $nextarg += 2;
}
else {
    $inputEncoding = "utf8";
}

# Optional "-t <type>": module type, compared case-insensitively later on.
if ($ARGV[$nextarg] eq "-t") {
    $moduleType = lc("$ARGV[$nextarg+1]");
    $nextarg += 2;
}
else {
    $moduleType = "bible";
}

# The requested encoding must be one of the names reported by Encode.
$encFound = (grep { $_ eq $inputEncoding } @encodingList) ? 1 : 0;
if ($encFound == 0) {
    die "Encoding $inputEncoding not supported.\nSupported encodings include:\n\t$encodings\n";
}
else {
    print "Encoding \"$inputEncoding\" is supported.\n"
}

# Everything that is left on the command line is a USFM input file.
for (; $nextarg < scalar(@ARGV); $nextarg++) {
    push(@files, $ARGV[$nextarg]);
}
# Queue the XML declaration, the <osis>/<osisText> root elements and a minimal
# <header> as the first output chunk.  The schema location is built from
# $osisVersion and the work name from $osisWork.
# NOTE(review): osisRefWork="Bible" and xml:lang="en" are hard-coded here,
# regardless of the -t module type or the language of the input.
push (@outdata, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<osis xmlns=\"http://www.bibletechnologies.net/2003/OSIS/namespace\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.bibletechnologies.net/2003/OSIS/namespace http://www.bibletechnologies.net/osisCore.$osisVersion.xsd\">\n<osisText osisRefWork=\"Bible\" xml:lang=\"en\" osisIDWork=\"$osisWork\">\n<header>\n<work osisWork=\"$osisWork\"\/>\n<\/header>\n");
# $tagStack is a string of pending closing tags, innermost first; openTag()
# prepends to it and closeTag() consumes from its front.
$tagStack = "<\/osisText><\/osis>";
# Pending end-milestones for the current chapter and verse (empty = none open).
$chapClose = "";
$versClose = "";
# closeTag(PATTERN)
#
# Pops pending closing tags off the front of the global $tagStack, up to and
# including the first entry that matches PATTERN.  PATTERN is interpolated
# into a regular expression, so callers may pass regex syntax.
#
# Returns the popped tags as one string, one tag per line, with any
# bookkeeping attributes removed (e.g. '</div type="book">' becomes '</div>').
# Returns an empty list when nothing on the stack matches, so that
# push(@outdata, closeTag(...)) adds nothing in that case.
sub closeTag {
    my ($pattern) = @_;

    return unless $tagStack =~ /$pattern/;
    # Remove everything up to and including the first match from the stack.
    return unless $tagStack =~ s/^(.*?$pattern)//;
    my $closers = $1;

    # One closing tag per output line.
    $closers =~ s/>/>\n/g;
    # Stack entries carry pseudo-attributes to tell div types apart; strip
    # them so that well-formed closing tags are emitted.
    $closers =~ s/(<\/\w+)\s+[^>]+>/$1>/g;
    return $closers;
}
# openTag(CLOSER)
#
# Records that an element has been opened by pushing its closing tag CLOSER
# onto the front of the global $tagStack (innermost element first).  The
# opening tag itself is emitted by the caller.  Returns nothing.
sub openTag {
    my ($closer) = @_;
    $tagStack = $closer . $tagStack;
    return;
}
foreach $file (@files) {
print "Processing $file.\n";
# Three-argument open so the filename is never interpreted as a mode or a
# pipe, and fail loudly instead of silently converting an unreadable file
# into an empty book.
open(SFM, "<", $file) or die "Could not open file $file for reading: $!";
# Element 0 is a deliberate empty placeholder: real input lines start at
# index 1, and the joining loop below may append to the previous element.
my @filedata = "";
while (<SFM>) {
    my $sfline = decode($inputEncoding, $_);
    push(@filedata, $sfline);
}
close(SFM);
# Drop a leading byte-order mark *before* joining continuation lines.
# Otherwise the first line does not start with a backslash, is mistaken for
# a continuation, and gets merged into the placeholder element.
if (scalar(@filedata) > 1) {
    $filedata[1] =~ s/^\x{FEFF}//;
}
# Join continuation lines: every non-blank line that does not begin with a
# USFM marker (backslash) is appended to the preceding line.  Marker lines
# are stored trimmed; blank lines are left untouched.
for (my $idx = 0; $idx < scalar(@filedata); $idx++) {
    my $text = $filedata[$idx];
    ### Trim leading/trailing whitespace
    $text =~ s/\s*$//;
    $text =~ s/^\s*//;
    next if $text =~ /^\s*$/;
    if ($text !~ /^\\/) {
        $filedata[$idx-1] .= " $text";
        splice(@filedata, $idx, 1);
        # Re-examine the element that slid into this slot.
        $idx--;
    }
    else {
        $filedata[$idx] = $text;
    }
}
# Reset the per-file conversion state.
$ollevel = 0;   # current nesting depth of the introduction outline lists
$vers = 0;      # current verse
$chap = 0;      # current chapter
$book = "";     # current OSIS book abbreviation
# Sets the initial value for the attribute "n" in footnotes.
$nFN = 0;
# Same counter for cross-reference notes.
$xFN = 0;
# Creates array for the attribute "n" in cross-references
# (not referenced anywhere else in the visible part of the script).
@nCR = ('a' .. 'z');
# Sets the initial value for the attribute "n" in cross-references.
$nCR = $nCR[0];
# Delete a BOM, if present, from the first real input line (index 1; index 0
# is a placeholder).  The character is written as an escape so the pattern
# cannot silently degrade into an empty, do-nothing regex.
if (scalar(@filedata) > 1) {
    $filedata[1] =~ s/^\x{FEFF}//;
}
# Pre-processing pass over every input line: escape XML-significant
# characters and park quotation marks in single-character placeholders that
# the "Tagging quotations" step at the end of the script turns into <q>
# milestones.
for my $idx (0 .. $#filedata) {
    my $text = $filedata[$idx];
    ### Basic XML entity encoding
    # Escape every ampersand that does not already start a well-formed
    # entity or character reference (&name; &#123; &#x1F;).
    $text =~ s/&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#x[0-9a-fA-F]+);)/&amp;/g;
    # << and >> become @ and #; the remaining < and > become $ and %.
    $text =~ s/<< ?/\@/g;
    $text =~ s/>>/\#/g;
    $text =~ s/</\$/g;
    $text =~ s/>/\%/g;
    # A "\fr 1/2 \fr*" sequence stands for the one-half sign (U+00BD).
    $text =~ s/\\fr 1\/2 \\fr\*/chr(0xBD)/eg;
    $filedata[$idx] = $text;
}
for ($i = 0; $i < scalar(@filedata); $i++) {
$line = @filedata[$i];
### File Identification--Markers Supported: \id, \h, \ide, \sts, \rem, \toc1, \toc2, \toc3
if ($line =~ /\\v\b\s*(\d+)\,(\d+)/) {
if ($1 + 1 == $2) {
$line =~ s/\\v\b\s*(\d+)\,(\d+)/\\v $1\-$2/;
}
}
$line =~ s/\\v\b\s+(\d+)(\-\d+|\s*\\v\b\s+\d+)\s*\\v\b\s+(\d+)/\\v $1\-$3/;
$line =~ s/\\v\b\s+(\d+)\s*\\v\b\s+(\d+\-)?(\d+)/\\v $1\-$3/;
$line =~ s/^\\(p[is]|mi)\b/\\p/;
$line =~ s/^\\li\b/\\p/; #\li isn't part of USFM, so we'll make it \p
# \id (book marker)
if ($line =~ /^\\id\b\s*([^ ]*)/) {
$book = $OSISbook{$1};
$chap = 0;
if ($versClose =~ /<verse/) {
push (@outdata, $versClose); # close verse
$versClose = "";
}
# push (@outdata, closeTag("<\/div[^>]*?>")); # close section
if ($chapClose =~ /<chapter/) {
push (@outdata, $chapClose); # close chapter
$chapClose = "";
}
push (@outdata, closeTag("<\/div type=\"book\">")); #close book
if ($book eq "") {
$book = "UnknownUSFMBook";
}
push (@outdata, "<div type=\"book\" osisID=\"$book\">\n"); # open current book
openTag("<\/div type=\"book\">");
$line = "";
}
# \h (running header--discard)
if ($line =~ /^\\h\b/) {
$line = "";
}
# \ide Encoding (discard)
if ($line =~ /^\\ide\b/) {
$line = "";
}
# \sts Status (discard)
if ($line =~ /^\\sts\b/) {
$line = "";
}
# \rem Comments from translator (discard)
if ($line =~ /^\\rem\b/) {
$line = "";
}
# \restore Version control comments from translator, not a formal USFM marker but used by paratext (discard)
if ($line =~ /^\\restore\b/) {
$line = "";
}
# \toc1 Table of Contents (discard)
if ($line =~ /^\\toc\d\b/) {
$line = "";
}
### Introduction--Markers Supported: \imt#, \is#, \iot, \io#, \ip \ie
#### Markers Not Yet Supported: \ipi, \im, \imi, \ipq, \imq, \ipr, \iq#, \ib, \ili, \ior...\ior*, \iex, \imte
# \it title (DCO: Commented out because \it is for italics not introduction titles in USFM 2.1)
# if ($line =~ /^\\it\b\s*(.*)/) {
# $line = "<div type=\"introduction\">\n<title>$1<\/title>";
# openTag("<\/div>");
# }
# \imt major title
if ($line =~ /^\\imt\b\s*(.+)/) {
$line = "<div type=\"introduction\">\n<title>$1<\/title>";
openTag("<\/div>");
}
# \is introduction section title
if ($line =~ /^\\is(\d*)\b\s*(.*)/) {
$level = $1;
if ($level eq "") {
$level = "1";
}
$line = "<div type=\"section\"><title>$2<\/title>";
openTag("<\/div>");
}
# \iot introduction outline title
if ($line =~ /^\\iot\b\s*(.*)/) {
$line = "<div type=\"outline\">\n<title>$1<\/title>";
}
# \io\d+ introduction outline item
if ($line =~ /^\\io(\d+)\b\s*(.*)/) {
if ($ollevel == $1) {
$line = "<item>$2<\/item>";
}
elsif ($ollevel > $1) {
$line = "";
while ($ollevel > $1) {
$line .= "<\/list><\/item>\n";
$ollevel--;
}
$line .= "<item>$2<\/item>";
}
elsif ($ollevel < $1) {
$line = "";
if ($ollevel != 0) {
$line .= "<item>";
}
while ($ollevel < $1) {
$line .= "<list>\n";
$ollevel++;
}
$line .= "<item>$2<\/item>\n";
}
if (@filedata[$i+1] !~ /^\\io/) {
while ($ollevel > 0) {
$line .= "\n<\/list>";
if ($ollevel > 1) {$line .= "<\/item>";}
$ollevel--;
}
if ($ollevel == 0) {
$line .= "\n<\/div>";
}
}
}
# \ip introduction paragraph
if ($line =~ /^\\ip\b\s*(.*)/) {
$line = "<p>$1<\/p>";
}
# \im introduction paragraph (flush left -- the flush-left layout is NOT IMPLEMENTED; emitted as an ordinary paragraph)
if ($line =~ /^\\im\b\s*(.*)/) {
$line = "<p>$1<\/p>";
}
# \imq introduction quotation (implemented as ordinary quotation)
if ($line =~ /^\\imq\b\s*(.*)/) {
$line = "<q>$1<\/q>";
}
# \iq line (including \iq#), adapted from \q (see below), needs more clean-up
if ($line =~ /^\\iq/) {
if ($l != 1) {
push (@outdata, "<lg>\n");
$l = 1;
}
if ($line =~ /\\iq(\d*)$/) {
if ($1 eq "") {
$line = "<l>\n";
}
else {
$line = "<l level=\"$1\">\n";
}
@filedata[$i+1] .= "<\/l>";
if (@filedata[$i+2] !~ /\\iq(?!t)/) {
@filedata[$i+1] .= "\n<\/lg>";
$l = 0;
}
}
else {
$line =~ s/\\iq\b\s*(.+)/<l>$1<\/l>/;
$line =~ s/\\iq(\d+)\b\s*(.+)/<l level=\"$1\">$2<\/l>/;
if (@filedata[$i+1] !~ /\\iq(?![ta])/) {
$line .= "\n<\/lg>";
$l = 0;
}
}
}
# \ie introduction end (discard)
if ($line =~ /^\\ie\b/) {
$line = "";
}
### Titles, Headings, and Labels (elsewhere?)--Markers Supported: \d, \ms#, \s#, \mt#, \r, \sp, \rq..\rq*
#### Markers Not Yet Supported: \mte#, \mr, \sr
# \ms majorSection
if ($line =~ /^\\ms\d?\b\s*(.+)/) {
push (@outdata, closeTag("<\/p>"));
push (@outdata, closeTag("<\/div type=\"majorSection\">"));
push (@outdata, "<div type=\"majorSection\">\n");
openTag("<\/div type=\"majorSection\">");
$line =~ s/\\ms\d?\b\s*(.+)/<title>$1<\/title>/;
}
# \d canonical title
if ($line =~ /^\\d\b\s*(.+)?(\\d\*)?/) {
push (@outdata, closeTag("<\/p>"));
$line =~ s/\\d\b\s*(.+)/<title type=\"psalm\">$1<\/title>/;
}
# \s \s1 section (From Chapters and Verses)
if ($line =~ /^\\s1?\b\s*(.+)/) {
push (@outdata, closeTag("<\/p>"));
push (@outdata, closeTag("<\/div type=\"section\">"));
push (@outdata, "<div type=\"section\">\n");
openTag("<\/div type=\"section\">");
$line =~ s/\\s1?\b\s*(.+)/<title>$1<\/title>/;
if ($line =~ /HEBREW TITLE/) {
$line =~ s/<title>/<title type=\"psalm\" canonical=\"true\">/;
}
}
# \ss \s2 subSection (From Chapters and Verses)
if ($line =~ /^\\s[s2]\b\s*(.+)/) {
push (@outdata, closeTag("<\/p>"));
push (@outdata, closeTag("<\/div type=\"subSection\">"));
push (@outdata, "<div type=\"subSection\">\n");
openTag("<\/div type=\"subSection\">");
$line =~ s/\\s[s2]\b\s*(.+)/<title>$1<\/title>/;
}
# \sss \s3 x-subSubSection (From Chapters and Verses)
# This will also handle deeper levels (4+) of subsections: every level beyond
# 2 adds one more "Sub" to the div type (\s4 -> x-subSubSubSection, ...).
if ($line =~ /^\\s(ss|\d+)\b\s*(.+)/) {
    # \sss is the same level as \s3.  Without this mapping "ss" would be
    # treated as the number 0 and produce x-subSection instead.
    my $ssLvl = ($1 eq "ss") ? 3 : $1;
    my $ssType = "";
    while ($ssLvl > 2) {
        $ssLvl--;
        $ssType .= "Sub";
    }
    $ssType = "x-sub" . $ssType . "Section";
    # Close the open paragraph and any open div of this same type, then
    # open the new div and remember its closer on the tag stack.
    push(@outdata, closeTag("<\/p>"));
    push(@outdata, closeTag("<\/div type=\"$ssType\">"));
    push(@outdata, "<div type=\"$ssType\">\n");
    openTag("<\/div type=\"$ssType\">");
    $line =~ s/\\s(ss|\d+)\b\s*(.+)/<title>$2<\/title>/;
}
# Title markers.  Each branch rewrites $line into a <title> element, after
# which no later "^\\..." test can match, so an if/elsif chain is equivalent
# to testing the markers one after another.
if ($line =~ /^\\mt[1234]?\b\s*(.+)/) {
    # \mt, \mt1 .. \mt4: main title
    $line = "<title type=\"main\">$1<\/title>";
}
elsif ($line =~ /^\\mt2\b\s*(.+)/) {
    # \mt2: continued title
    # NOTE(review): never reached -- \mt2 is already consumed by the
    # \mt[1234]? test above and emitted as a main title.
    $line = "<title type=\"continued\">$1<\/title>";
}
elsif ($line =~ /^\\st2?\b\s*(.+)/) {
    # \st, \st2: continued title
    $line = "<title type=\"continued\">$1<\/title>";
}
elsif ($line =~ /^\\st3\b\s*(.+)/) {
    # \st3: sub title
    $line = "<title type=\"sub\">$1<\/title>";
}
# \mr sub title
if ($line =~ /^\\mr\b\s*(.+)/) {
$line = "<title type=\"sub\">$1<\/title>";
}
# \r parallel title
if ($line =~ /^\\r\b\s*(.+)/) {
$line = "<title type=\"parallel\">$1<\/title>";
}
# \sp speaker
if ($line =~ /^\\sp\b\s*(.+)/) {
$line = "<speaker>$1<\/speaker>";
}
# \rq..\rq* inline reference
$line =~ s/\\rq( .*?)\\rq\*/<reference>$1<\/reference>/g;
### Chapters and Verses--Markers Supported: \c, \v, \vp...\vp*, \cl
#### Markers Not Yet Supported: \ca...\ca*, \cp, \cd, \va...\va*
# \c chapter
if ($line =~ /^\\c\b\s*([^ ]*)/) {
if ($1 ne "") {
$chap = $1;
}
else {
$chap++;
}
push (@outdata, $versClose);
$versClose = "";
if ($moduleType eq "bible") {
push (@outdata, closeTag("<\/p>"))
}
if ($chapClose =~ /<chapter/) {
push (@outdata, $chapClose); # close previous chapter
$chapClose = "";
} else {
push (@outdata, closeTag("<\/div>")); # close introduction div
}
push (@outdata, "<chapter sID=\"$book.$chap\" osisID=\"$book.$chap\"\/>\n");
$chapClose = "<chapter eID=\"$book.$chap\"\/>\n";
$line =~ s/\\c\b\s*([^ ]*)//;
}
# \cl chapter label
if ($line =~ /^\\cl\b\s*(.*)/) {
$line = "<title>$1<\/title>";
}
# \v verse
if ($line =~ /^\\v\b\s*(\d[^\\ ]*)?/) {
if ($1 ne "") {
$vers = $1;
}
else {
$vers++;
}
push (@outdata, $versClose);
$divOpen=false;
$versClose = "";
if ($vers =~ /(\d+[^\\\- ]*)\-(\d+[^\\ ]*)/) {
$vF = $1;
$vT = $2;
$vF =~ /^(\d+)/;
$vFn = scalar($1);
$vT =~ /^(\d+)/;
$vTn = scalar($1);
$osisID = "$book.$chap.$vF";
if ($vTn > $vFn && $vFn > 0) {
for ($j = $vFn + 1; $j < $vTn; $j++) {
$osisID .=" $book.$chap.$j";
}
}
$osisID .= " $book.$chap.$vT";
}
else {
$osisID = "$book.$chap.$vers";
}
if ($moduleType eq "bible") {
push (@outdata, "<verse sID=\"$osisID\" osisID=\"$osisID\"\/>\n");
$versClose = "<verse eID=\"$osisID\"\/>\n";
$line =~ s/\\v\b\s*(\d[^\\ ]*)? *//;
}
elsif ($moduleType eq "comment") {
closeTag("<\/p>");
push (@outdata, "<div type=\"section\" annotateType=\"commentary\" annotateRef=\"$osisID\">\n");
$versClose = "<\/p>\n<\/div>\n";
$line =~ s/\\v\b\s*(\d[^\\ ]*)? *//;
$divOpen=true;
}
else { print "usfm2osis.pl supports only the module types \"bible\" and \"comment\" \n";
exit 1;
}
}
# \vp...\vp* published verse numbers
$line =~ s/\\vp\*\s*//g;
$line =~ s/\\vp\b\s*(\d+[a-z]?|[a-z])\s*/<seg type="verseNumber">$1<\/seg>/g;
### Paragraphs--Markers Supported: \p, \b, \m, \nb, \cls
#### Markers Not Yet Supported: \m, \pmo, \pm, \pmc, \pmr, \pi#, \mi, \li#, \pc, \pr, \ph#, \b
# Hack to solve an issue in a module that used <R> for linebreaks in the usfm files--may be commented out (not USFM 2.1)
$line =~ s/\\lb\*/<lb \/>/g;
# \p paragraph (From Chapters and Verses)
if ($line =~ /^\\p\b\s*/) {
if ((($moduleType eq "comment") && ($divOpen))||($moduleType eq "bible")){
push (@outdata, closeTag("<\/p>"));
}
push (@outdata, "<p>\n");
openTag("<\/p>");
$line =~ s/\\p\b\s*//;
}
# \pc paragraph centered (From Chapters and Verses)
if ($line =~ /^\\pc\b\s*/) {
if ((($moduleType eq "comment") && ($divOpen))||($moduleType eq "bible")){
push (@outdata, closeTag("<\/p>"));
}
push (@outdata, "<p type=\"x-center\">\n");
openTag("<\/p>");
$line =~ s/\\pc\b\s*//;
}
# \mi2 paragraph flush left, no indentation
if ($line =~ /^\\mi2\b\s*/) {
if ((($moduleType eq "comment") && ($divOpen))||($moduleType eq "bible")){
push (@outdata, closeTag("<\/p>"));
}
push (@outdata, "<p type=\"x-noindent\">\n");
openTag("<\/p>");
$line =~ s/\\mi2\b\s*//;
}
# \cls paragraph (From Chapters and Verses)
if ($line =~ /^\\cls\b\s*/) {
if ((($moduleType eq "comment") && ($divOpen))||($moduleType eq "bible")){
push (@outdata, closeTag("<\/closer>"));
}
push (@outdata, "<closer>\n");
openTag("<\/closer>");
$line =~ s/\\cls\b\s*//;
}
# \b
$line =~ s/\\b\b//;
# \m
$line =~ s/\\m\b//;
# \nb
$line =~ s/\\nb\b//;
### Poetry--Markers Supported: \q#, \qs...\qs*, \qr, \qc, \qac...\qac*, \qa, \qm#
#### Markers Not Yet Supported: \b
# \qt...\qt*, OT quotation (handle early)
$line =~ s/\\qt\b\s*(.*?)\\qt\*/<seg type="otPassage">$1<\/seg>/g;
# \qa, acrostic heading
$line =~ s/^\\qa\b\s*(.*)/<title type=\"acrostic\">$1<\/title>/g;
# \qac...\qac*, acrostic character style (used within a line)
$line =~ s/\\qac\b\s*(.*?)\\qac\*/<hi type="acrostic">$1<\/hi>/g;
# \q line (including \q#, \qr, \qc, and \qs...\qs*)
if ($line =~ /^\\q/) {
if ($l != 1) {
push (@outdata, "<lg>\n");
$l = 1;
}
if ($line =~ /\\qm?(c|r|\d*)$/) {
if ($1 eq "") {
$line = "<l>\n";
}
elsif ($1 eq "c") {
$line = "<l type=\"x-centered\">";
}
elsif ($1 eq "r") {
$line = "<l type=\"x-right\">";
}
else {
$line = "<l level=\"$1\">\n";
}
@filedata[$i+1] .= "<\/l>";
if (@filedata[$i+2] !~ /\\q(?!t)/) {
@filedata[$i+1] .= "\n<\/lg>";
$l = 0;
}
}
else {
$line =~ s/\\q\b\s*(.+)/<l>$1<\/l>/;
$line =~ s/\\qm?(\d+)\b\s*(.+)/<l level=\"$1\">$2<\/l>/;
$line =~ s/\\qc\b\s*(.+)/<l type=\"x-centered\">$1<\/l>/;
$line =~ s/\\qr\b\s*(.+)/<l type=\"x-right\">$1<\/l>/;
$line =~ s/\\qs\b\s*(.+?)\s*\\qs\*/<l type="selah">$1<\/l>/;
if (@filedata[$i+1] !~ /\\q(?![ta])/) {
$line .= "\n<\/lg>";
$l = 0;
}
}
$line =~ s/\s*\\qs\b\s*(.+?)\s*\\qs\*/<\/l>\n<l type="selah">$1/;
}
$line =~ s/\s*\\qs\b\s*(.+?)\s*\\qs\*\s*/<lg><l type="selah">$1<\/l><\/lg>/;
### Tables--Markers Supported: \tr, \th#, \tc#, \tcr#
####Markers Not Yet Supported: \thr#
# \th table heading
if ($line =~ /^\\t/) {
if ($line =~ /^\\tr\b\s*(\\th.*)/) {
$line = "$1";
if ($table != 1) {
push (@outdata, "<table>\n");
$table = 1;
}
$line =~ s/\\th\d?\b\s*(.+?)\s*(?=(\\th|$))/<cell role=\"label\">$1<\/cell>/g;
$line = "<row>$line<\/row>";
}
if ($line =~ /^\\tr\b\s*(\\tc.*)/) {
$line = $1;
if ($table != 1) {
push (@outdata, "<table>\n");
$table = 1;
}
$line =~ s/\\tcr?\d?\b\s*(.+?)\s*(?=(\\tc|$))/<cell>$1<\/cell>/g;
$line = "<row>$line<\/row>";
if (@filedata[$i+1] !~ /\\tr/) {
$line .= "<\/table>\n";
$table = 0;
}
}
if ($line =~ /^\\th1\b\s*(.*)/) {
if ($table != 1) {
push (@outdata, "<table>\n");
$table = 1;
}
$line = "<row><cell role=\"label\">$1<\/cell>\n";
}
elsif ($line =~ /^\\th\d+\b\s*(.*)/) {
$line = "<cell role=\"label\">$1<\/cell>\n";
}
if ($line =~ /^\\tb1\b\s*(.*)/) {
if ($table != 1) {
push (@outdata, "<table>\n");
$table = 1;
}
else {
push (@outdata, "<\/row>");
}
$line = "<row><cell>$1<\/cell>\n";
if (@filedata[$i+1] !~ /\\tb/) {
$line .= "<\/row><\/table>\n";
$table = 0;
}
}
elsif ($line =~ /^\\tb\d+\b\s*(.*)/) {
$line = "<cell>$1<\/cell>\n";
if (@filedata[$i+1] !~ /\\tb/) {
$line .= "<\/row><\/table>\n";
$table = 0;
}
}
}
# parseRef(REF)
#
# Turns a chapter:verse origin reference (as found after \fr or \xo) into an
# OSIS-style reference inside the current book (global $book): a trailing
# ":" or "." is dropped, colons become periods and the book is prefixed.
# For a verse range whose first verse has at least two characters, the end
# of the range is expanded too ("1:10-12" -> "Gen.1.10-Gen.1.12").
# NOTE(review): ranges starting at a single-digit verse ("1:1-3") are left
# as "Gen.1.1-3" by the patterns below.
# Returns the converted reference string.
sub parseRef {
    my ($ref) = @_;
    $ref =~ s/[:\.]\s*$//;
    $ref =~ s/:/\./g;
    $ref = "$book.$ref";
    $ref =~ s/(\d+)\.(\d[^\,]+)\-(\d+)/$1.$2-$book.$1.$3/;
    $ref =~ s/(\d+)\.(\d[^\-]+)\-+\s*(\d.+)/$1.$2\-$book.$1.$3/;
    return $ref;
}
### Footnotes--Markers Supported: \fk, \fq, \f...\f*, \fv, \ft, \fqa
####Markers Not Yet Supported: \fe...\fe*, \fr, \fl, \fp, \fdc...\fdc*, \fm...\fm*
# footnoteHandler(FOOTNOTE)
#
# Converts one complete USFM footnote ("\f ... \f*") into an OSIS <note>
# element and returns it.  Reads the globals $book and $chap, and increments
# the global footnote counter $nFN once per \fr marker.  The first \fr also
# adds an n="..." attribute to the note.
sub footnoteHandler {
    my ($note) = @_;
    $note = "<note>$note</note>";
    # While markers are being converted, "\fX" is left behind as a sentinel
    # so that later "up to the next \f marker" look-aheads still find a
    # boundary.  The sentinels are removed at the very end.
    # \fk Catch Words
    $note =~ s/\\fk\s(.+?)(\s*)\\fk\*/\\fX<catchWord>$1<\/catchWord>\\fX$2/g;
    $note =~ s/\\fk\s(.+?)(\s*)(?=\\f)/\\fX<catchWord>$1<\/catchWord>$2\\fX/g;
    $note =~ s/\\fk\*/\\fX/g;
    # \fq Quotations in Footnotes
    # CCL--I don't know the difference, aside from length, between catch words and quotations in footnotes. It may vary by document.
    $note =~ s/\\fq\s(.+?)(\s*)\\fq\*/\\fX<catchWord>$1<\/catchWord>\\fX$2/g;
    $note =~ s/\\fq\s(.+?)(\s*)(?=\\f)/\\fX<catchWord>$1<\/catchWord>$2\\fX/g;
    $note =~ s/\\fq\*/\\fX/g;
    # \fqa Alternate translations in Footnotes
    $note =~ s/\\fqa\s(.+?)\\fqa\*/\\fX<rdg type=\"alternate\">$1<\/rdg>\\fX/g;
    $note =~ s/\\fqa\s(.+?)(?=\\f)/\\fX<rdg type=\"alternate\">$1<\/rdg>\\fX/g;
    $note =~ s/\\fqa\*/\\fX/g;
    # \fv Footnote verse number
    $note =~ s/\\fv\s(.+?)\\fv\*/\\fX<reference osisID=\"$book.$chap.$1\">$1<\/reference>\\fX/g;
    $note =~ s/\\fv\s*(\d+)\b\s*(?=\\f)/\\fX<reference osisID=\"$book.$chap.$1\">$1<\/reference>\\fX/g;
    $note =~ s/\\fv\*/\\fX/g;
    # \fr Footnote origin reference (the verse where the fn appears)
    # Only the marker itself is removed; the reference text stays in the
    # note.  The parsed value is stored in $sourceVal but not used in this sub.
    while ($note =~ /\\fr\s*(.+?)\s*(?=\\f)/) {
        $sourceVal = parseRef($1);
        $nFN++;
        $note =~ s/\\fr\s*//;
        $note =~ s/<note>/<note n="$nFN">/;
    }
    # \ft Footnote text
    $note =~ s/\\ft\s//g;
    $note =~ s/\\ft\*//g;
    # \f* Footnote closer
    $note =~ s/\s*\\f\*//;
    # \f Footnote opener (together with the caller character that follows it)
    $note =~ s/\\fe?\b\s*([^\s]\s*)?//;
    # \fX was inserted above to mark former locations of various already-handled markers, which can now be removed
    $note =~ s/\\fX//g;
    return $note;
}
$line =~ s/(\\f\b.+?\\f\*)/footnoteHandler($1)/eg;
### Crossreferences--Markers Supported: \x + \xo...\x*, \xk, \xq, \xt
#### Markers Not Yet Supported: \xdc...\xdc*
# xrefHandler(XREF)
#
# Converts one complete USFM cross-reference ("\x ... \x*") into an OSIS
# <note type="crossReference"> element and returns it.  Increments the
# global counter $xFN once per \xo marker.  The first \xo also adds an
# n="..." attribute to the note.
sub xrefHandler {
    my ($xref) = @_;
    $xref = "<note type=\"crossReference\">$xref</note>";
    # \xk Catch Words
    $xref =~ s/\\xk\s(.+?)(\s*)\\xk\*/<catchWord>$1<\/catchWord>$2/g;
    $xref =~ s/\\xk\s(.+?)(\s*)(?=\\x)/<catchWord>$1<\/catchWord>$2/g;
    $xref =~ s/\\xk\*//g;
    # \xq Quotations in cross-references
    # CCL--I don't know the difference, aside from length, between catch words and quotations in footnotes. It may vary by document.
    $xref =~ s/\\xq\s(.+?)(\s*)\\xq\*/<catchWord>$1<\/catchWord>$2/g;
    $xref =~ s/\\xq\s(.+?)(\s*)(?=\\x)/<catchWord>$1<\/catchWord>$2/g;
    $xref =~ s/\\xq\*//g;
    # \xo Origin reference (the verse where the cross-reference appears)
    # Only the marker itself is removed; the reference text stays in the
    # note.  The parsed value is stored in $sourceVal but not used in this sub.
    while ($xref =~ /\\xo\s*(.+?)\s*(?=\\x)/) {
        $sourceVal = parseRef($1);
        $xFN++;
        $xref =~ s/\\xo\s*//;
        $xref =~ s/<note type=\"crossReference\">/<note type=\"crossReference\" n="$xFN">/;
    }
    # \xt Crossref itself
    $xref =~ s/\\xt\s(.+?)\\xt\*/<reference>$1<\/reference>/g;
    $xref =~ s/\\xt\s(.+?)(?=\\x)/<reference>$1<\/reference>/g;
    $xref =~ s/\\xt\*//g;
    # \x* Cross-reference closer
    $xref =~ s/\\x\*//;
    # \x Cross-reference opener (together with the caller character after it)
    $xref =~ s/\\x\b\s*([^\s]\s*)?//;
    return $xref;
}
$line =~ s/(\\x\b.+?\\x\*)/xrefHandler($1)/eg;
# crossReference osisRef=""
# Fill in empty osisRef attributes from the reference text, then massage the
# attribute value step by step towards Book.Chapter.Verse form.
# NOTE(review): nothing in the visible script emits <reference osisRef=...>,
# so these rules only fire on references that already carry that attribute.
$line =~ s/<reference osisRef="">([^<]+)<\/reference>/<reference osisRef="$1">$1<\/reference>/g;
# Strip whitespace right after the opening quote.  (The replacement used to
# contain "\s", which Perl emits as a literal "s" and corrupted the value.)
$line =~ s/osisRef="\s+/osisRef="/g;
# ... and whitespace right before the closing quote.
$line =~ s/\s">/">/g;
$line =~ s/<reference osisRef="([^\s\"]+)\s/<reference osisRef="$1\./g; # Changes space after book name to a period
$line =~ s/<reference osisRef="([^\"]+):([^\"]+)"/<reference osisRef="$1\.$2"/g; # Gen 1:1
$line =~ s/<reference osisRef="([^\.\"]+)\.(\d+)\.(\d+)-(\d+)"/<reference osisRef="$1\.$2\.$3-$1\.$2\.$4"/g; # Gen 1:1-2
$line =~ s/<reference osisRef="([^\.\"]+).(\d+):(\d+)-(\d+).(\d+)"/<reference osisRef="$1\.$2\.$3-$1\.$4\.$5"/g; # Gen 1:1-2:3
$line =~ s/<reference osisRef="([^\.\"]+)\.(\d+)\.([^\"]+)">([^<]+)<\/reference>; <reference osisRef="(\d+)\.(\d+)"/<reference osisRef="$1\.$2\.$3">$4<\/reference>; <reference osisRef="$1\.$5\.$6"/g; # Gen. 1:1, 2:3
$line =~ s/<reference osisRef="([^\.\"]+)\.(\d+)\.([^\"]+)">([^<]+)<\/reference>, <reference osisRef="(\d+)"/<reference osisRef="$1\.$2\.$3">$4<\/reference>, <reference osisRef="$1\.$2\.$5"/g; # Gen. 1:1, 3
$line =~ s/<reference osisRef="([^\"\.]+)\.(\d+)"/<reference osisRef="$1\.1\.$2"/g; # Jude 1
### Special Text and Character Styles--Markers Supported: \it...\it*, \nd...\nd*, \pn...\pn*, \tl...\tl*, \qt...\qt*, \add...\add*, \pb, \bk...\bk*, \sc..\sc*, \bd...\bd*, \wj...\wj*
#### Markers Not Yet Supported: Special Text: \k...\k*, \lit, \ord...\ord*, \sig...\sig*,; Character Styling: \em...\em*, \bdit...\bdit*, \no...\no*; Spacing and Breaks: !$, //; Special Features: \fig...\fig*, \ndx...\ndx*, \pro...\pro*, \w...\w*, \wg...\wg*, \wh...\wh*
# \dc...\dc*, inserted deuterocanonical text
$line =~ s/\\dc\b\s*(.*?)\\dc\*/<transChange type=\"added\" editions=\"dc\">$1<\/transChange>/g;
# \it...\it*, italic text
$line =~ s/\\it\b\s*(.*?)\\it\*/<hi type=\"italic\">$1<\/hi>/g;
# \bd...\bd*, bold text
$line =~ s/\\bd\b\s*(.*?)\\bd\*/<hi type=\"bold\">$1<\/hi>/g;
# \bk...\bk*, book name in text
$line =~ s/\\bk\b\s*(.*?)\\bk\*/<hi type=\"italic\">$1<\/hi>/g;
# \sc...\sc*, small-caps character style (used within a line)
$line =~ s/\\sc\b\s*(.*?)\\sc\*/<hi type="small-caps">$1<\/hi>/g;
# \nd...\nd*, Divine Name
$line =~ s/\\nd\b\s*(.*?)\\nd\*/<divineName>$1<\/divineName>/g;
# \pn...\pn*, Proper name
$line =~ s/\\pn\b\s*(.*?)\\pn\*/<name>$1<\/name>/g;
# \tl...\tl*, Foreign Language (treated here merely as transliterated text)
$line =~ s/\\tl\b\s*(.*?)\\tl\*/<hi type="italic">$1<\/hi>/g;
# \sls...\sls*, Text from alternative text source
$line =~ s/\\sls\b\s*(.*?)\\sls\*/<hi type="italic">$1<\/hi>/g;
# \add...\add*, text added for translation purposes
$line =~ s/\\add\b\s*(.*?)\\add\*/<transChange type=\"added\">$1<\/transChange>/g;
# \wj...\wj*, Words of Jesus
$line =~ s/\\wj\b\s*(.*?)\\wj\*/<q who="Jesus" marker="">$1<\/q>/g;
# \pb, page break
$line =~ s/\\pb\b/<milestone type=\"pb\"\/>/g;
### Other (probably non-standard) items
### Markers Supported: \zelastic
# \zelastic, elastic height marker for typesetting
$line =~ s/\\zelastic\b//g;
$line =~ s/_/ /g;
### End USFM 2.1 Items
if ($line !~ /^\s*$/) {
push (@outdata, "$line\n");
}
}
}
# All input files are done: emit whatever verse and chapter end-milestones
# are still pending (verse first, then chapter) and clear them.
foreach my $pending ([\$versClose, qr/<verse/], [\$chapClose, qr/<chapter/]) {
    my ($closer, $marker) = @$pending;
    next unless $$closer =~ $marker;
    push(@outdata, $$closer);
    $$closer = "";
}
# Close every element that is still open, down to the document root.
push(@outdata, closeTag("<\/osis>"));
# Write the first draft of the output file, chunk by chunk.
foreach my $chunk (@outdata) {
    # A verse range lists several IDs separated by spaces; inside the first
    # sID/eID attribute of the chunk keep only the first ID.
    $chunk =~ s/([es]ID=\"[^\" ]+) [^\"]*\"/$1\"/;
    # Chunks that are empty or whitespace-only are not written.
    next if $chunk =~ /^\s*$/;
    # Collapse runs of line breaks and make sure the chunk ends in one.
    $chunk =~ s/[\r\n]+/\n/g;
    $chunk =~ s/\n?$/\n/;
    print OUTF $chunk;
}
close(OUTF);
# Second pass: re-read the file that was just written, line by line, and
# repair the ordering of milestone elements relative to container tags.
print "Doing some cleanup.\n";
open(INF, "<:utf8", $outputFilename)
    or die "Could not open file $outputFilename for reading: $!";
@filedata = <INF>;
close(INF);
# Checked as well: this open truncates the file we are about to rewrite.
open(OUTF, ">:utf8", $outputFilename)
    or die "Could not open file $outputFilename for writing: $!";
# bubble chapter down
# A closing tag directly after a <chapter .../> milestone is moved in front
# of it.  Stepping the index back re-tests the moved line against its new
# predecessor, so a closing tag can travel past several milestones.
for ($i = 0; $i < scalar(@filedata); $i++) {
    if ($filedata[$i] =~ /^<\// && $filedata[$i-1] =~ /^<chapter.+\/>/) {
        @filedata[$i-1, $i] = @filedata[$i, $i-1];
        $i -= 2;
    }
}
# bubble verse end up
# CCL--this may require further attention, but works for the present
# A verse end-milestone directly after a title, div or paragraph tag is
# moved in front of that tag.
for ($i = 0; $i < scalar(@filedata); $i++) {
    if ($filedata[$i-1] =~ /^(<title|<\/?div|<\/?p)/ && $filedata[$i] =~ /^<verse eID.+\/>/) {
        @filedata[$i-1, $i] = @filedata[$i, $i-1];
        $i -= 2;
    }
}
# Glue the lines back together for the whole-document substitutions.
$fullfile .= join("", @filedata);
$fullfile =~ s/<\/osisText>\n<\/osis>\n(<chapter eID[^>]+>)/$1\n<\/osisText>\n<\/osis>/mg; #swap the chapter back up one before the osisText closer
$fullfile =~ s/<\/div>\n(<chapter eID[^>]+>)/$1\n<\/div>/mg; #swap the chapter back up one before the book closer
# Turn the quotation placeholders that the pre-processing pass left in the
# text into OSIS <q> milestones: "$" and "%" stand for the original "<" and
# ">" (level 2), "@" and "#" for "<<" and ">>" (level 1).
# NOTE(review): any "$", "%", "@" or "#" that was already present in the
# source text is indistinguishable from a placeholder here.
print "Tagging quotations.\n";
# $q numbers the quotation pairs; it is restarted for each level.
$q = 1;
# Level 2: each "$ ... %" pair becomes a start/end milestone pair that
# shares one ID (q2.N).
$fullfile =~ s/\$([^\%]+?)\%/"<q level=\"2\" sID=\"q2." . $q . "\"\/>" . $1 . "<q level=\"2\" eID=\"q2." . $q++ . "\"\/>"/eg;
# Any "$" left over has no closing partner: emit a continuation-quote
# milestone instead.
$fullfile =~ s/\$/"<milestone type=\"cQuote\" subType=\"x-level-2\"\/>"/eg;
$q = 1;
# Level 1, nested case: an "@" that follows another still-open "@" is paired
# with the next "#" first, provided enough "#" closers follow for the outer
# openers.  Repeat until no such pattern is left.
while ($fullfile =~ /(\@[^\@\#]+?)\@([^\@\#]+?)\#(([^\@\#]+?\@[^\@\#]+?\#)+[^\@\#]+?\#)/) {
$fullfile =~ s/(\@[^\@\#]+?)\@([^\@\#]+?)\#(([^\@\#]+?\@[^\@\#]+?\#)+[^\@\#]+?\#)/$1 . "<q level=\"1\" sID=\"q1." . $q . "\"\/>" . $2 . "<q level=\"1\" eID=\"q1." . $q++ . "\"\/>" . $3/eg;
}
# Same idea for the simpler shape "@ ... @ ... # ... #".
while ($fullfile =~ /(\@[^\@\#]+?)\@([^\@\#]+?)\#([^\@\#]+?\#)/) {
$fullfile =~ s/(\@[^\@\#]+?)\@([^\@\#]+?)\#([^\@\#]+?\#)/$1 . "<q level=\"1\" sID=\"q1." . $q . "\"\/>" . $2 . "<q level=\"1\" eID=\"q1." . $q++ . "\"\/>" . $3/eg;
}
# Remaining plain "@ ... #" pairs.
$fullfile =~ s/\@([^\#]+?)\#/"<q level=\"1\" sID=\"q1." . $q . "\"\/>" . $1 . "<q level=\"1\" eID=\"q1." . $q++ . "\"\/>"/eg;
# Unpaired "@": continuation-quote milestone.
$fullfile =~ s/\@/"<milestone type=\"cQuote\" subType=\"x-level-1\"\/>"/eg;
# NOTE(review): no code in the visible script inserts "^"; presumably a
# hand-placed marker for a quotation end without a start -- TODO confirm.
$fullfile =~ s/\^/"<q level=\"1\" eID=\"q1." . $q++ . ".false\"\/>"/eg;
# Write the final document over the first draft.
print OUTF $fullfile;
close (OUTF);
print "All done! OSIS file: $outputFilename\n";