From cd4935a574b160bcfd5487c068d97df8160fb56b Mon Sep 17 00:00:00 2001 From: Matěj Cepl Date: Thu, 24 Nov 2016 19:48:06 +0100 Subject: Cleanup the scans resulting in a way better text. --- README.txt | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 README.txt (limited to 'README.txt') diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..106d111 --- /dev/null +++ b/README.txt @@ -0,0 +1,8 @@ + +Postup jak vygenerovat kvalitní OCR + +pdfimages Co_jest_obect_KHB.pdf cojeimg +pdfinfo Co_jest_obect_KHB.pdf +for img in cojeimg-*.ppm ; do textcleaner -g $img ${img%*.ppm}-clean.ppm ; done +for img in cojeimg-*-clean.ppm ; do tesseract $img ${img%*-clean.ppm} -l ces -c tessedit_create_hocr=1 ; done +cat cojeimg-0*.txt >cojestobec-img.txt -- cgit