From f7c027617da44ed684bbb9b35c603b790365fdbf Mon Sep 17 00:00:00 2001 From: Maxim Cournoyer Date: Thu, 11 Aug 2022 17:53:21 -0400 Subject: gnu: tesseract-ocr: Update to 5.2.0. * gnu/packages/ocr.scm (tesseract-ocr): Update to 5.2.0. [inputs, native-inputs]: Move after arguments. Use new style inputs. [arguments]: Use gexps. [configure-flags]: Add --disable-static. [phases]{fix-docbook}: Replace phase with... {do-not-override-xml-catalog-files}: ... this new phase. {build-training}: Move after build phase. Enable parallel build. {install-training}: Move after install phase. [native-inputs]: Add libxml2. --- gnu/packages/ocr.scm | 124 ++++++++++++++++++++++++++------------------------- 1 file changed, 63 insertions(+), 61 deletions(-) (limited to 'gnu/packages/ocr.scm') diff --git a/gnu/packages/ocr.scm b/gnu/packages/ocr.scm index 11595e87e5..e28bd17668 100644 --- a/gnu/packages/ocr.scm +++ b/gnu/packages/ocr.scm @@ -5,6 +5,7 @@ ;;; Copyright © 2019 Alex Vong ;;; Copyright © 2021 Andy Tai ;;; Copyright © 2021, 2022 Nicolas Goaziou +;;; Copyright © 2022 Maxim Cournoyer ;;; ;;; This file is part of GNU Guix. ;;; @@ -74,71 +75,72 @@ (define-public ocrad (license license:gpl3+))) (define-public tesseract-ocr - ;; There are useful commits beyond the last official stable release. 
- (let ((commit "97079fa353557af6df86fd20b5d2e0dff5d8d5df") - (revision "1")) - (package - (name "tesseract-ocr") - (version (git-version "4.1.1" revision commit)) - (source - (origin - (method git-fetch) - (uri (git-reference - (url "https://github.com/tesseract-ocr/tesseract") - (commit commit))) - (file-name (git-file-name name version)) - (sha256 - (base32 - "11137a4aaay7qp64vdjd83hz1l089nzi5a0ql0qgk8gn79pyhi98")))) - (build-system gnu-build-system) - (inputs - `(("cairo" ,cairo) - ("icu" ,icu4c) - ("leptonica" ,leptonica) - ("pango" ,pango) - ("python-wrapper" ,python-wrapper))) - (native-inputs - `(("asciidoc" ,asciidoc) - ("autoconf" ,autoconf) - ("automake" ,automake) - ("docbook-xsl" ,docbook-xsl) - ("libarchive" ,libarchive) - ("libcurl" ,curl) - ("libtool" ,libtool) - ("libtiff" ,libtiff) - ("pkg-config" ,pkg-config) - ("xsltproc" ,libxslt))) - (arguments - `(#:configure-flags - (let ((leptonica (assoc-ref %build-inputs "leptonica"))) - (list (string-append "LIBLEPT_HEADERSDIR=" leptonica "/include"))) - #:tests? #f ; Tests currently result in a segfault - #:phases - (modify-phases %standard-phases - (add-after 'unpack 'fix-docbook - (lambda* (#:key inputs #:allow-other-keys) - ;; Don't attempt to download XSL schema. 
- (substitute* "doc/Makefile.am" - (("http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl") - (string-append (assoc-ref inputs "docbook-xsl") - "/xml/xsl/docbook-xsl-" - ,(package-version docbook-xsl) - "/manpages/docbook.xsl"))))) - (add-after 'install 'build-training - (lambda _ - (invoke "make" "training"))) - (add-after 'build-training 'install-training - (lambda _ - (invoke "make" "training-install")))))) - (home-page "https://github.com/tesseract-ocr/tesseract") - (synopsis "Optical character recognition engine") - (description - "Tesseract is an optical character recognition (OCR) engine with very + (package + (name "tesseract-ocr") + (version "5.2.0") + (source + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/tesseract-ocr/tesseract") + (commit version))) + (file-name (git-file-name name version)) + (sha256 + (base32 + "0dai539h07lqj8lyhznd3wbwdpqr78qrsczq78rsmsryqvmdbyaa")))) + (build-system gnu-build-system) + (arguments + (list + #:configure-flags + #~(list (string-append "LIBLEPT_HEADERSDIR=" + #$(this-package-input "leptonica") "/include") + "--disable-static") ;avoid 6 MiB static archive + ;; The unit tests are disabled because they require building bundled + ;; third party libraries. + #:tests? #f + #:phases + #~(modify-phases %standard-phases + (add-after 'unpack 'do-not-override-xml-catalog-files + (lambda _ + (substitute* "configure.ac" + (("AC_SUBST\\(\\[XML_CATALOG_FILES])") + "")))) + (add-after 'build 'build-training + (lambda* (#:key parallel-build? #:allow-other-keys) + (define n (if parallel-build? 
(number->string + (parallel-job-count)) + "1")) + (invoke "make" "-j" n "training"))) + (add-after 'install 'install-training + (lambda _ + (invoke "make" "training-install")))))) + (native-inputs + (list asciidoc + autoconf + automake + curl + docbook-xsl + libarchive + libtiff + libtool + libxml2 ;for XML_CATALOG_FILES + libxslt + pkg-config)) + (inputs + (list cairo + icu4c + leptonica + pango + python-wrapper)) + (home-page "https://github.com/tesseract-ocr/tesseract") + (synopsis "Optical character recognition engine") + (description + "Tesseract is an optical character recognition (OCR) engine with very high accuracy. It supports many languages, output text formatting, hOCR positional information and page layout analysis. Several image formats are supported through the Leptonica library. It can also detect whether text is monospaced or proportional.") - (license license:asl2.0)))) + (license license:asl2.0))) (define-public gimagereader (package -- cgit v1.2.3