From e63c31a8f21596d398be1d011b137ed32944ea83 Mon Sep 17 00:00:00 2001 From: Sandro Mani Date: Tue, 6 Oct 2015 11:30:59 +0200 Subject: [PATCH] Update to version 3.04.00 --- .gitignore | 2 + sources | 5 +- tesseract.spec | 213 +++++++++++++++++++++++++++++++--------- tesseract_datadir.patch | 37 +++++++ 4 files changed, 209 insertions(+), 48 deletions(-) create mode 100644 tesseract_datadir.patch diff --git a/.gitignore b/.gitignore index cd2a650..b5f2981 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ tesseract-2.00.eng.tar.gz /tesseract-ocr-3.02.eng.tar.gz /tesseract-ocr-3.01.osd.tar.gz /tesseract-3.03-rc1.tar.gz +/tesseract-3.04.00.tar.gz +/tessdata-3.04.00.tar.gz diff --git a/sources b/sources index 660475e..b209211 100644 --- a/sources +++ b/sources @@ -1,3 +1,2 @@ -d69ceca9ae70e0b7020d0f92d60b8565 tesseract-3.03-rc1.tar.gz -683486e01f5b87c17f2f5815f770ccb3 tesseract-ocr-3.01.osd.tar.gz -3562250fe6f4e76229a329166b8ae853 tesseract-ocr-3.02.eng.tar.gz +078130b9c7d28c558a0e49d432505864 tesseract-3.04.00.tar.gz +b25e830d203af5c863081af3f684b53a tessdata-3.04.00.tar.gz diff --git a/tesseract.spec b/tesseract.spec index 3162e5d..284a8ba 100644 --- a/tesseract.spec +++ b/tesseract.spec @@ -1,74 +1,196 @@ -%global fullname tesseract-ocr -%global pre rc1 +Name: tesseract +Version: 3.04.00 +Release: 1%{?dist} +Summary: Raw OCR Engine -Name: tesseract -Version: 3.03 -Release: 0.6%{?pre:.%pre}%{?dist} -Summary: Raw OCR Engine +License: ASL 2.0 +URL: https://github.com/tesseract-ocr/%{name} +Source0: https://github.com/tesseract-ocr/tesseract/archive/%{version}.tar.gz#/%{name}-%{version}.tar.gz +Source1: https://github.com/tesseract-ocr/tessdata/archive/%{version}.tar.gz#/tessdata-%{version}.tar.gz -Group: Applications/File -License: ASL 2.0 -URL: http://code.google.com/p/%{fullname}/ -# The downloads are now posted on google-drive which has impossible download URLS... 
-# The url of the drive is -# https://drive.google.com/folderview?id=0B7l10Bj_LprhQnpSRkpGMGV2eE0 -Source0: %{name}-%{version}%{?pre:-%pre}.tar.gz -Source1: http://tesseract-ocr.googlecode.com/files/%{fullname}-3.02.eng.tar.gz -Source2: http://tesseract-ocr.googlecode.com/files/%{fullname}-3.01.osd.tar.gz -BuildRequires: libtiff-devel -BuildRequires: leptonica-devel -BuildRequires: cairo-devel -BuildRequires: libicu-devel -BuildRequires: pango-devel -BuildRequires: automake libtool -Obsoletes: tesseract < 3.02.02 +# Tweak location of tessdata folder +Patch0: tesseract_datadir.patch -%package devel -Summary: Development files for %{fullname} -Group: Development/Libraries -Requires: %{name} = %{version}-%{release} +BuildRequires: libtiff-devel +BuildRequires: leptonica-devel +BuildRequires: cairo-devel +BuildRequires: libicu-devel +BuildRequires: pango-devel +BuildRequires: automake libtool -%package osd -Summary: Orientation & Script Detection Data for %{fullname} -Group: Applications/File -Requires: %{name} = %{version}-%{release} %description A commercial quality OCR engine originally developed at HP between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by UNLV. It was open-sourced by HP and UNLV in 2005. + +%package devel +Summary: Development files for %{name} +Requires: %{name}%{?_isa} = %{version}-%{release} + %description devel The %{name}-devel package contains header file for developing applications that use %{name}. 
+ +%package osd +Summary: Orientation & Script Detection Data for %{name} +Requires: %{name}%{?_isa} = %{version}-%{release} + %description osd -Orientation & Script Detection Data for %{fullname} +Orientation & Script Detection Data for %{name} + +%define lang_subpkg() \ +%package langpack-%{1}\ +Summary: %{2} language data for %{name}\ +BuildArch: noarch\ +Requires: %{name} = %{version}-%{release}\ +\ +%description langpack-%{1}\ +%{2} language data for %{name}.\ +\ +%files langpack-%{1}\ +%{_datadir}/%{name}/tessdata/%{1}.* + +# see https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes +# and https://en.wikipedia.org/wiki/ISO_639_macrolanguage +%lang_subpkg afr Afrikaans +%lang_subpkg amh Amharic +%lang_subpkg ara Arabic +%lang_subpkg asm Assamese +%lang_subpkg aze Azerbaijani +%lang_subpkg aze_cyrl "Azerbaijani (Cyrillic)" +%lang_subpkg bel Belarusian +%lang_subpkg ben Bengali +%lang_subpkg bod "Tibetan (Standard)" +%lang_subpkg bos Bosnian +%lang_subpkg bul Bulgarian +%lang_subpkg cat Catalan +%lang_subpkg ceb Cebuano +%lang_subpkg ces Czech +%lang_subpkg chi_sim "Chinese (Simplified)" +%lang_subpkg chi_tra "Chinese (Traditional)" +%lang_subpkg chr Cherokee +%lang_subpkg cym Welsh +%lang_subpkg dan Danish +%lang_subpkg dan_frak "Danish (Fraktur)" +%lang_subpkg deu German +%lang_subpkg deu_frak "German (Fraktur)" +%lang_subpkg dzo Dzongkha +%lang_subpkg ell Greek +%lang_subpkg enm "Middle English (1100-1500)" +%lang_subpkg epo Esperanto +%lang_subpkg equ "Math / equation" +%lang_subpkg est Estonian +%lang_subpkg eus Basque +%lang_subpkg fas "Persian (Farsi)" +%lang_subpkg fin Finnish +%lang_subpkg fra French +%lang_subpkg frk Frankish +%lang_subpkg frm "Middle French (ca. 
1400-1600)" +%lang_subpkg gle Irish +%lang_subpkg glg Galician +%lang_subpkg grc "Ancient Greek" +%lang_subpkg guj Gujarati +%lang_subpkg hat Haitian +%lang_subpkg heb Hebrew +%lang_subpkg hin Hindi +%lang_subpkg hrv Croatian +%lang_subpkg hun Hungarian +%lang_subpkg iku Inuktitut +%lang_subpkg ind Indonesian +%lang_subpkg isl Icelandic +%lang_subpkg ita Italian +%lang_subpkg ita_old "Italian (Old)" +%lang_subpkg jav Javanese +%lang_subpkg jpn Japanese +%lang_subpkg kan Kannada +%lang_subpkg kat Georgian +%lang_subpkg kat_old "Georgian (Old)" +%lang_subpkg kaz Kazakh +%lang_subpkg khm Khmer +%lang_subpkg kir Kyrgyz +%lang_subpkg kor Korean +%lang_subpkg kur Kurdish +%lang_subpkg lao Lao +%lang_subpkg lat Latin +%lang_subpkg lav Latvian +%lang_subpkg lit Lithuanian +%lang_subpkg mal Malayalam +%lang_subpkg mar Marathi +%lang_subpkg mkd Macedonian +%lang_subpkg mlt Maltese +%lang_subpkg msa Malay +%lang_subpkg mya Burmese +%lang_subpkg nep Nepali +%lang_subpkg nld Dutch +%lang_subpkg nor Norwegian +%lang_subpkg ori Oriya +%lang_subpkg pan Panjabi +%lang_subpkg pol Polish +%lang_subpkg por Portuguese +%lang_subpkg pus Pashto +%lang_subpkg ron Romanian +%lang_subpkg rus Russian +%lang_subpkg san Sanskrit +%lang_subpkg sin Sinhala +%lang_subpkg slk Slovakian +%lang_subpkg slk_frak "Slovakian (Fraktur)" +%lang_subpkg slv Slovenian +%lang_subpkg spa Spanish +%lang_subpkg spa_old "Spanish (Old)" +%lang_subpkg sqi Albanian +%lang_subpkg srp Serbian +%lang_subpkg srp_latn "Serbian (Latin)" +%lang_subpkg swa Swahili +%lang_subpkg swe Swedish +%lang_subpkg syr Syriac +%lang_subpkg tam Tamil +%lang_subpkg tel Telugu +%lang_subpkg tgk Tajik +%lang_subpkg tgl Tagalog +%lang_subpkg tha Thai +%lang_subpkg tir Tigrinya +%lang_subpkg tur Turkish +%lang_subpkg uig Uyghur +%lang_subpkg ukr Ukrainian +%lang_subpkg urd Urdu +%lang_subpkg uzb Uzbek +%lang_subpkg uzb_cyrl "Uzbek (Cyrillic)" +%lang_subpkg vie Vietnamese +%lang_subpkg yid Yiddish + %prep -%setup -q -n %{name}-%{version} -a1 
-a2 +%setup -q -n %{name}-%{version} -a1 +%patch0 -p1 + %build -sed -i 's#-DTESSDATA_PREFIX=@datadir@/#-DTESSDATA_PREFIX=@datadir@/%{name}/##' ccutil/Makefile.* autoreconf -ifv %configure --disable-static + make %{?_smp_mflags} -# Remove compiled files, see https://groups.google.com/forum/#!topic/tesseract-dev/ARKOSV3zpWo -make -C training clean make %{?_smp_mflags} training + %install %make_install %make_install training-install -rm -f %{buildroot}%{_libdir}/*la -mkdir -p %{buildroot}%{_datadir}/%{name} -mv %{buildroot}%{_datadir}/tessdata %{buildroot}%{_datadir}/%{name} -install -m 0644 %{fullname}/tessdata/* %{buildroot}%{_datadir}/%{name}/tessdata + +find %{buildroot}%{_libdir} -type f -name '*.la' -delete + +install -pm 0644 tessdata-%{version}/* %{buildroot}/%{_datadir}/%{name}/tessdata/ + %post -p /sbin/ldconfig %postun -p /sbin/ldconfig + %files +%license COPYING +%doc AUTHORS ChangeLog NEWS README testing/eurotext.tif testing/phototest.tif %{_bindir}/ambiguous_words %{_bindir}/classifier_tester %{_bindir}/combine_tessdata @@ -82,17 +204,14 @@ install -m 0644 %{fullname}/tessdata/* %{buildroot}%{_datadir}/%{name}/tessdata %{_bindir}/wordlist2dawg %dir %{_datadir}/%{name} %dir %{_datadir}/%{name}/tessdata -%{_datadir}/%{name}/tessdata/configs -%{_datadir}/%{name}/tessdata/tessconfigs +%{_datadir}/%{name}/tessdata/configs/ +%{_datadir}/%{name}/tessdata/tessconfigs/ %{_datadir}/%{name}/tessdata/eng.* %{_datadir}/%{name}/tessdata/pdf.ttf -%{_datadir}/%{name}/tessdata/pdf.ttx -%{_libdir}/lib%{name}*.so.* +%{_libdir}/lib%{name}*.so.3* %{_mandir}/man1/* %{_mandir}/man5/* -%doc AUTHORS ChangeLog COPYING eurotext.tif NEWS phototest.tif README - %files devel %{_includedir}/%{name} %{_libdir}/lib%{name}*.so @@ -101,7 +220,11 @@ install -m 0644 %{fullname}/tessdata/* %{buildroot}%{_datadir}/%{name}/tessdata %files osd %{_datadir}/%{name}/tessdata/osd.traineddata + %changelog +* Sat Sep 12 2015 Sandro Mani - 3.04.00-1 +- Update to 3.04.00 + * Fri Jun 19 2015 Fedora 
Release Engineering - 3.03-0.6.rc1 - Rebuilt for https://fedoraproject.org/wiki/Fedora_23_Mass_Rebuild diff --git a/tesseract_datadir.patch b/tesseract_datadir.patch new file mode 100644 index 0000000..49f5509 --- /dev/null +++ b/tesseract_datadir.patch @@ -0,0 +1,37 @@ +diff -rupN tesseract-3.04.00/ccutil/Makefile.am tesseract-3.04.00-new/ccutil/Makefile.am +--- tesseract-3.04.00/ccutil/Makefile.am 2015-07-11 09:53:12.000000000 +0200 ++++ tesseract-3.04.00-new/ccutil/Makefile.am 2015-09-12 19:10:31.983919381 +0200 +@@ -3,7 +3,7 @@ SUBDIRS = + AM_CXXFLAGS = + + if !NO_TESSDATA_PREFIX +-AM_CXXFLAGS += -DTESSDATA_PREFIX=@datadir@/ ++AM_CXXFLAGS += -DTESSDATA_PREFIX=@datadir@/tesseract/ + endif + + if VISIBILITY +diff -rupN tesseract-3.04.00/tessdata/configs/Makefile.am tesseract-3.04.00-new/tessdata/configs/Makefile.am +--- tesseract-3.04.00/tessdata/configs/Makefile.am 2015-07-11 09:53:12.000000000 +0200 ++++ tesseract-3.04.00-new/tessdata/configs/Makefile.am 2015-09-12 19:10:40.978587765 +0200 +@@ -1,3 +1,3 @@ +-datadir = @datadir@/tessdata/configs ++datadir = @datadir@/tesseract/tessdata/configs + data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr quiet logfile digits hocr linebox pdf rebox strokewidth bigram + EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr quiet logfile digits hocr linebox pdf rebox strokewidth bigram +diff -rupN tesseract-3.04.00/tessdata/Makefile.am tesseract-3.04.00-new/tessdata/Makefile.am +--- tesseract-3.04.00/tessdata/Makefile.am 2015-07-11 09:53:12.000000000 +0200 ++++ tesseract-3.04.00-new/tessdata/Makefile.am 2015-09-12 19:10:31.985919304 +0200 +@@ -1,4 +1,4 @@ +-datadir = @datadir@/tessdata ++datadir = @datadir@/tesseract/tessdata + + data_DATA = pdf.ttf + EXTRA_DIST = $(data_DATA) +diff -rupN tesseract-3.04.00/tessdata/tessconfigs/Makefile.am tesseract-3.04.00-new/tessdata/tessconfigs/Makefile.am +--- tesseract-3.04.00/tessdata/tessconfigs/Makefile.am 
2015-07-11 09:53:12.000000000 +0200 ++++ tesseract-3.04.00-new/tessdata/tessconfigs/Makefile.am 2015-09-12 19:10:48.218340816 +0200 +@@ -1,3 +1,3 @@ +-datadir = @datadir@/tessdata/tessconfigs ++datadir = @datadir@/tesseract/tessdata/tessconfigs + data_DATA = batch batch.nochop nobatch matdemo segdemo msdemo + EXTRA_DIST = batch batch.nochop nobatch matdemo segdemo msdemo