diff options
author | Peter von Kaehne <refdoc@gmx.net> | 2015-09-04 01:46:46 +0000 |
---|---|---|
committer | Peter von Kaehne <refdoc@gmx.net> | 2015-09-04 01:46:46 +0000 |
commit | b65ed23c2dd5798c0088205d29dd2bbb3a6dfa5a (patch) | |
tree | 9f1773336a825d1d1bd8953dc314aabe9f995fec /modules/conf/confmaker.pl | |
parent | cd6e034126b245124eb4fd50b5fda955338d14a5 (diff) | |
download | sword-tools-b65ed23c2dd5798c0088205d29dd2bbb3a6dfa5a.tar.gz |
added handling of UTF8 stripfilters.
git-svn-id: https://www.crosswire.org/svn/sword-tools/trunk@503 07627401-56e2-0310-80f4-f8cd0041bdcd
Diffstat (limited to 'modules/conf/confmaker.pl')
-rwxr-xr-x | modules/conf/confmaker.pl | 49 |
1 files changed, 45 insertions, 4 deletions
diff --git a/modules/conf/confmaker.pl b/modules/conf/confmaker.pl index 2662ae1..6367ec5 100755 --- a/modules/conf/confmaker.pl +++ b/modules/conf/confmaker.pl @@ -41,8 +41,11 @@ use XML::LibXML; use I18N::LangTags::List; use Unicode::UCD 'charinfo'; -use open ':std', ':encoding(UTF-8)'; - +#use open ':std', ':encoding(UTF-8)'; +#use open qw/:std :utf8/; +use utf8; +use Sword; +use HTML::Strip; ## Obtain arguments if (scalar(@ARGV) < 1) { @@ -90,6 +93,17 @@ if (@ARGV[$nextarg] eq "-l") { my $parser = XML::LibXML->new(); my $doc = $parser->parse_file($file); +my $manager = new Sword::SWMgr(); + +$manager->setGlobalOption("Hebrew Vowel Points", "Off"); +$manager->setGlobalOption("Hebrew Cantillation", "Off"); +$manager->setGlobalOption("Arabic Vowel Points", "Off"); +$manager->setGlobalOption("Greek Accents", "Off"); + +my $hs = HTML::Strip->new(); +my $doc_text = new Sword::SWBuf($hs->parse($doc->toString())); +#my $clean_doc_text = $hs->parse( $doc->toString()); +#$doc_text->append($clean_doc_text); ## obtain name, type and language @@ -100,7 +114,9 @@ my $doc_name = @elements[0]->getAttribute('osisIDWork'); my $doc_type = @elements[0]->getAttribute('osisRefWork'); my $doc_lang = @elements[0]->getAttribute('xml:lang'); my $doc_lang_name=I18N::LangTags::List::name($doc_lang); +; + if ((length($language)==0) && (length($doc_lang)==0)) { print STDERR $language."\n", $doc_lang."\n", $doc_lang_name."\n"; @@ -123,7 +139,7 @@ if ((length($language)>0) && (length($doc_lang)==0)) { my @doc_features = ('title', 'note', 'reference', 'q', 'figure', 'rdg'); my @word_features = ('lemma', 'gloss', 'morph',); - +my @char_features = ('Hebrew Vowel Points', 'Arabic Vowel Points', 'Hebrew Cantillation', 'Greek Accents'); my %doc_filters = ( 'title' => "OSISHeadings", 'note' => "OSISFootnotes", @@ -134,12 +150,20 @@ my %doc_filters = ( 'title' => "OSISHeadings", 'q' => "OSISRedLetterWords", 'rdg' => 'OSISVariants', ); + my %doc_feature = ( 'lemma' => 'StrongsNumbers', 'figure' => 'Images', 'p' => 'NoParagraphs', ); + +my %diacritics = ( 'Hebrew Vowel Points' => "UTF8HebrewPoints", + 'Arabic Vowel Points' => 'UTF8ArabicPoints', + 'Hebrew Cantillation' => 'UTF8Cantillation', + 'Greek Accents' => 'UTF8GreekAccents', + ); + my %doc_has_feature; @@ -165,6 +189,8 @@ foreach my $f(@word_features) { my @paragraphs = $doc->getElementsByTagName('p'); if (@paragraphs==0) {$doc_has_feature{'p'}=true}; + + # Assemble and print out @@ -189,6 +215,21 @@ foreach (@word_features) { print "GlobalOptionFilter=".$doc_filters{$_}."\n" } } + +foreach $filter(@char_features) { + + my $tmp = new Sword::SWBuf($hs->parse($doc->toString())); + + $manager->filterText($filter, $tmp); + + if ($tmp->c_str() ne $doc_text->c_str()) { + print "GlobalOptionFilter=".%diacritics{$filter}."\n"; + + } +} + + + foreach (@doc_features) { if ($doc_has_feature{$_} && exists $doc_feature{$_}) { print "Feature=".$doc_feature{$_}."\n" @@ -206,7 +247,7 @@ if ($doc_has_feature{'p'}) { print "Encoding=UTF-8\n"; print "SourceType=OSIS\n"; print "LCSH=".$doc_type.".".I18N::LangTags::List::name($doc_lang)."\n"; -print "SwordVersionDate=".`date +"%F"`."\n"; +print "SwordVersionDate=".`date +"%F"`; if (@inputFile>0) { foreach(@inputFile) { |