summary refs log tree commit diff
path: root/gnu/packages/ocr.scm
diff options
context:
space:
mode:
Diffstat (limited to 'gnu/packages/ocr.scm')
-rw-r--r-- gnu/packages/ocr.scm 124
1 files changed, 63 insertions, 61 deletions
diff --git a/gnu/packages/ocr.scm b/gnu/packages/ocr.scm
index 11595e87e5..e28bd17668 100644
--- a/gnu/packages/ocr.scm
+++ b/gnu/packages/ocr.scm
@@ -5,6 +5,7 @@
;;; Copyright © 2019 Alex Vong <alexvong1995@gmail.com>
;;; Copyright © 2021 Andy Tai <atai@atai.org>
;;; Copyright © 2021, 2022 Nicolas Goaziou <mail@nicolasgoaziou.fr>
+;;; Copyright © 2022 Maxim Cournoyer <maxim.cournoyer@gmail.com>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -74,71 +75,72 @@ it produces text in 8-bit or UTF-8 formats.")
(license license:gpl3+)))
(define-public tesseract-ocr
- ;; There are useful commits beyond the last official stable release.
- (let ((commit "97079fa353557af6df86fd20b5d2e0dff5d8d5df")
- (revision "1"))
- (package
- (name "tesseract-ocr")
- (version (git-version "4.1.1" revision commit))
- (source
- (origin
- (method git-fetch)
- (uri (git-reference
- (url "https://github.com/tesseract-ocr/tesseract")
- (commit commit)))
- (file-name (git-file-name name version))
- (sha256
- (base32
- "11137a4aaay7qp64vdjd83hz1l089nzi5a0ql0qgk8gn79pyhi98"))))
- (build-system gnu-build-system)
- (inputs
- `(("cairo" ,cairo)
- ("icu" ,icu4c)
- ("leptonica" ,leptonica)
- ("pango" ,pango)
- ("python-wrapper" ,python-wrapper)))
- (native-inputs
- `(("asciidoc" ,asciidoc)
- ("autoconf" ,autoconf)
- ("automake" ,automake)
- ("docbook-xsl" ,docbook-xsl)
- ("libarchive" ,libarchive)
- ("libcurl" ,curl)
- ("libtool" ,libtool)
- ("libtiff" ,libtiff)
- ("pkg-config" ,pkg-config)
- ("xsltproc" ,libxslt)))
- (arguments
- `(#:configure-flags
- (let ((leptonica (assoc-ref %build-inputs "leptonica")))
- (list (string-append "LIBLEPT_HEADERSDIR=" leptonica "/include")))
- #:tests? #f ; Tests currently result in a segfault
- #:phases
- (modify-phases %standard-phases
- (add-after 'unpack 'fix-docbook
- (lambda* (#:key inputs #:allow-other-keys)
- ;; Don't attempt to download XSL schema.
- (substitute* "doc/Makefile.am"
- (("http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl")
- (string-append (assoc-ref inputs "docbook-xsl")
- "/xml/xsl/docbook-xsl-"
- ,(package-version docbook-xsl)
- "/manpages/docbook.xsl")))))
- (add-after 'install 'build-training
- (lambda _
- (invoke "make" "training")))
- (add-after 'build-training 'install-training
- (lambda _
- (invoke "make" "training-install"))))))
- (home-page "https://github.com/tesseract-ocr/tesseract")
- (synopsis "Optical character recognition engine")
- (description
- "Tesseract is an optical character recognition (OCR) engine with very
+ (package
+ (name "tesseract-ocr")
+ (version "5.2.0")
+ (source
+ (origin
+ (method git-fetch)
+ (uri (git-reference
+ (url "https://github.com/tesseract-ocr/tesseract")
+ (commit version)))
+ (file-name (git-file-name name version))
+ (sha256
+ (base32
+ "0dai539h07lqj8lyhznd3wbwdpqr78qrsczq78rsmsryqvmdbyaa"))))
+ (build-system gnu-build-system)
+ (arguments
+ (list
+ #:configure-flags
+ #~(list (string-append "LIBLEPT_HEADERSDIR="
+ #$(this-package-input "leptonica") "/include")
+ "--disable-static") ;avoid 6 MiB static archive
+ ;; The unit tests are disabled because they require building bundled
+ ;; third party libraries.
+ #:tests? #f
+ #:phases
+ #~(modify-phases %standard-phases
+ (add-after 'unpack 'do-not-override-xml-catalog-files
+ (lambda _
+ (substitute* "configure.ac"
+ (("AC_SUBST\\(\\[XML_CATALOG_FILES])")
+ ""))))
+ (add-after 'build 'build-training
+ (lambda* (#:key parallel-build? #:allow-other-keys)
+ (define n (if parallel-build? (number->string
+ (parallel-job-count))
+ "1"))
+ (invoke "make" "-j" n "training")))
+ (add-after 'install 'install-training
+ (lambda _
+ (invoke "make" "training-install"))))))
+ (native-inputs
+ (list asciidoc
+ autoconf
+ automake
+ curl
+ docbook-xsl
+ libarchive
+ libtiff
+ libtool
+ libxml2 ;for XML_CATALOG_FILES
+ libxslt
+ pkg-config))
+ (inputs
+ (list cairo
+ icu4c
+ leptonica
+ pango
+ python-wrapper))
+ (home-page "https://github.com/tesseract-ocr/tesseract")
+ (synopsis "Optical character recognition engine")
+ (description
+ "Tesseract is an optical character recognition (OCR) engine with very
high accuracy. It supports many languages, output text formatting, hOCR
positional information and page layout analysis. Several image formats are
supported through the Leptonica library. It can also detect whether text is
monospaced or proportional.")
- (license license:asl2.0))))
+ (license license:asl2.0)))
(define-public gimagereader
(package