summaryrefslogtreecommitdiff
path: root/gnu/packages/bioinformatics.scm
diff options
context:
space:
mode:
Diffstat (limited to 'gnu/packages/bioinformatics.scm')
-rw-r--r--gnu/packages/bioinformatics.scm468
1 files changed, 449 insertions, 19 deletions
diff --git a/gnu/packages/bioinformatics.scm b/gnu/packages/bioinformatics.scm
index 4c6db1da6b..02c65eb99a 100644
--- a/gnu/packages/bioinformatics.scm
+++ b/gnu/packages/bioinformatics.scm
@@ -8,7 +8,7 @@
;;; Copyright © 2016, 2020, 2022 Marius Bakke <marius@gnu.org>
;;; Copyright © 2016, 2018 Raoul Bonnal <ilpuccio.febo@gmail.com>
;;; Copyright © 2017, 2018 Tobias Geerinckx-Rice <me@tobias.gr>
-;;; Copyright © 2017, 2021, 2022 Arun Isaac <arunisaac@systemreboot.net>
+;;; Copyright © 2017, 2021, 2022, 2024 Arun Isaac <arunisaac@systemreboot.net>
;;; Copyright © 2018 Joshua Sierles, Nextjournal <joshua@nextjournal.com>
;;; Copyright © 2018 Gábor Boskovits <boskovits@gmail.com>
;;; Copyright © 2018-2023 Mădălin Ionel Patrașcu <madalinionel.patrascu@mdc-berlin.de>
@@ -1101,6 +1101,43 @@ high-throughput sequence analysis. The package is primarily useful to
developers of other R packages who wish to make use of HTSlib.")
(license license:lgpl2.0+))))
+;; SCENIC is fetched from the upstream Git repository at the commit pinned in
+;; the surrounding 'let'.
+(define-public r-scenic
+  (let ((revision "1")
+        (commit "cedf8490a634da550cea2c831544e5f7f14467d2"))
+    (package
+      (name "r-scenic")
+      (version (git-version "1.3.1" revision commit))
+      (source
+       (origin
+         (method git-fetch)
+         (uri (git-reference
+               (url "https://github.com/aertslab/SCENIC")
+               (commit commit)))
+         (file-name (git-file-name name version))
+         (sha256
+          (base32 "17ai0q260hdqbvm1km1s5dw93pgz4f546ycfii57jyy9m9jka7r0"))))
+      ;; The upstream name differs from the Guix package name.
+      (properties `((upstream-name . "SCENIC")))
+      (build-system r-build-system)
+      (native-inputs
+       (list r-knitr))
+      (propagated-inputs
+       (list r-aucell
+             r-data-table
+             r-dynamictreecut
+             r-genie3
+             r-ggrepel
+             r-mixtools
+             r-nmf
+             r-rcistarget
+             r-rtsne))
+      (home-page "https://github.com/aertslab/SCENIC")
+      (synopsis
+       "SCENIC (Single Cell rEgulatory Network Inference and Clustering)")
+      (description "SCENIC (Single-cell regulatory network inference and
+clustering) is an R package to infer Gene Regulatory Networks and cell types
+from single-cell RNA-seq data.")
+      ;; As of commit cedf8490a634da550cea2c831544e5f7f14467d2 the license is
+      ;; GPLv3.
+      (license license:gpl3))))
+
(define-public r-singlet
(let ((commit "765a6c45081807a1522f0e8983e2417822a36f36")
(revision "1"))
@@ -1471,6 +1508,39 @@ cpp.find_library('hdf5_cpp', dirs : '~a'), "
"Blasr is a genomic sequence aligner for processing PacBio long reads.")
(license license:bsd-3)))
+;; The 'configure' phase is dropped and 'install' is replaced by a manual copy
+;; of the "randfold" executable into the output's bin directory.
+(define-public randfold
+  (package
+    (name "randfold")
+    (version "2.0.1")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (string-append "http://bioinformatics.psb.ugent.be/"
+                           "supplementary_data/erbon/nov2003/downloads/"
+                           "randfold-" version ".tar.gz"))
+       (sha256
+        (base32
+         "0gqixl4ncaibrxmn25d6lm2hrw4ml2fj13nrc9q1kilsxdfi91mj"))))
+    (build-system gnu-build-system)
+    (arguments
+     (list
+      #:tests? #f                       ;no tests provided
+      #:phases
+      #~(modify-phases %standard-phases
+          (delete 'configure)
+          (replace 'install
+            (lambda _
+              (let ((bindir (string-append #$output "/bin")))
+                (install-file "randfold" bindir)))))))
+    (inputs
+     (list eddylab-squid))
+    (home-page
+     "http://bioinformatics.psb.ugent.be/supplementary_data/erbon/nov2003/")
+    (synopsis "Minimum free energy of folding randomization test software")
+    (description "randfold computes the probability that, for a given
+sequence, the @dfn{Minimum Free Energy} (MFE) of the secondary structure is
+different from MFE computed with random sequences.")
+    (license license:gpl2)))
+
(define-public ribotaper
(package
(name "ribotaper")
@@ -2114,6 +2184,45 @@ matplotlib Axes objects, making them easy to style and incorporate into
multi-panel figures.")
(license license:expat)))
+;; Built from the upstream Git tag "v" + VERSION; the build runs from the
+;; "python" subdirectory of the checkout.
+(define-public python-magic-impute
+  (package
+    (name "python-magic-impute")
+    (version "1.2.1")
+    (source
+     (origin
+       (method git-fetch)
+       (uri (git-reference
+             (url "https://github.com/KrishnaswamyLab/MAGIC")
+             (commit (string-append "v" version))))
+       (file-name (git-file-name name version))
+       (sha256
+        (base32
+         "1yjs16vg87lcg9g16bnblg1v9sk73j6dm229lkcz0bfjlzxjhv8w"))))
+    (build-system pyproject-build-system)
+    (arguments
+     (list
+      #:tests? #false                   ;there are none
+      #:phases
+      #~(modify-phases %standard-phases
+          ;; Enter the "python" subdirectory right after unpacking so that
+          ;; the following phases run there.
+          (add-after 'unpack 'chdir
+            (lambda _
+              (chdir "python"))))))
+    (propagated-inputs
+     (list python-future
+           python-graphtools
+           python-matplotlib
+           python-numpy
+           python-pandas
+           python-scikit-learn
+           python-scipy
+           python-tasklogger))
+    (home-page "https://github.com/KrishnaswamyLab/MAGIC")
+    (synopsis "Markov affinity-based graph imputation of cells")
+    (description "MAGIC is an interactive tool to impute missing values in
+single-cell sequencing data and to restore the structure of the data. It also
+provides data pre-processing functionality such as dimensionality reduction
+and gene expression visualization.")
+    (license license:gpl2+)))
+
(define-public python-parabam
(package
(name "python-parabam")
@@ -2431,6 +2540,7 @@ counts.")
(base32
"1s5373g5jjbshh3q39zy7dlxr7nda6ksxq9d1gw46h82c4fsmfbn"))))
(build-system pyproject-build-system)
+ (arguments (list #:tests? #false)) ;there are none
(propagated-inputs
(list python-future
python-h5py
@@ -4653,6 +4763,64 @@ meso, or continuum scale.")
files.")
(license license:expat)))
+;; make is invoked with "-C src"; the two resulting executables are copied
+;; into the output's bin directory by a custom 'install' phase.
+(define-public lsgkm
+  (package
+    (name "lsgkm")
+    (version "0.1.1")
+    (source
+     (origin
+       (method git-fetch)
+       (uri (git-reference
+             (url "https://github.com/Dongwon-Lee/lsgkm.git")
+             (commit (string-append "v" version))))
+       (file-name (git-file-name name version))
+       (sha256
+        (base32
+         "0b3m94kndvimdfjaf1q2yhmsn7lm5s9v81c5xgfjcp6ig7mh3sa5"))))
+    (build-system gnu-build-system)
+    (arguments
+     (list
+      #:tests? #false                   ;there are no executable tests
+      #:make-flags #~(list "-C" "src")
+      #:phases
+      #~(modify-phases %standard-phases
+          (delete 'configure)
+          (replace 'install
+            (lambda _
+              (let ((bindir (string-append #$output "/bin")))
+                (install-file "src/gkmtrain" bindir)
+                (install-file "src/gkmpredict" bindir)))))))
+    (home-page "https://github.com/Dongwon-Lee/lsgkm")
+    (synopsis "Predict regulatory DNA elements in large-scale data")
+    (description "gkm-SVM, a sequence-based method for predicting regulatory
+DNA elements, is a useful tool for studying gene regulatory mechanisms.
+LS-GKM is an effort to improve the method. It offers much better scalability
+and provides further advanced gapped k-mer based kernel functions. As a
+result, LS-GKM achieves considerably higher accuracy than the original
+gkm-SVM.")
+    (license license:gpl3+)))
+
+;; Parser for raw fcs files, fetched from PyPI.
+(define-public python-fcsparser
+  (package
+    (name "python-fcsparser")
+    (version "0.2.8")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (pypi-uri "fcsparser" version))
+       (sha256
+        (base32 "1skk1k8phq9sj4ar0cnq8px89y3kcyh5zrbl6anz9wcdcyzkc16z"))))
+    (build-system pyproject-build-system)
+    (propagated-inputs (list python-numpy python-pandas))
+    (native-inputs (list python-poetry-core python-pytest))
+    (home-page "https://github.com/eyurtsev/fcsparser")
+    (synopsis "Package for reading raw fcs files")
+    ;; Descriptions are full sentences: end with a period and avoid the
+    ;; redundant "This package provides a ... package" wording.
+    (description
+     "This package provides a Python library for reading raw fcs files.")
+    (license license:expat)))
+
(define-public python-pybigwig
(package
(name "python-pybigwig")
@@ -4687,17 +4855,89 @@ files.")
accessing bigWig files.")
(license license:expat)))
+(define-public python-pyfasta
+ ;; The release on pypi does not contain the test data files.
+ ;; For that reason the sources are fetched from Git at the commit below.
+ (let ((commit "c2f0611c5311f1b1466f2d56560447898b4a8b03")
+ (revision "1"))
+ (package
+ (name "python-pyfasta")
+ (version (git-version "0.5.2" revision commit))
+ (source
+ (origin
+ (method git-fetch)
+ (uri (git-reference
+ (url "https://github.com/brentp/pyfasta")
+ (commit commit)))
+ (file-name (git-file-name name version))
+ (sha256
+ (base32
+ "0a189id3fbv88gssyk6adbmz2ll1mqpmyw8vxmx3fi955gvaq9j7"))))
+ (build-system pyproject-build-system)
+ (arguments
+ (list
+ #:phases
+ '(modify-phases %standard-phases
+ ;; Patch the sources and the test suite right after unpacking,
+ ;; replacing Python 2 constructs (implicit relative imports,
+ ;; iteritems/iterkeys, cPickle, cStringIO, long, buffer, sys.maxint)
+ ;; with Python 3 equivalents.
+ (add-after 'unpack 'python3.10-compat
+ (lambda _
+ ;; Package-qualify the imports and use dict.items.
+ (substitute* "pyfasta/__init__.py"
+ (("from fasta import")
+ "from pyfasta.fasta import")
+ (("from records import")
+ "from pyfasta.records import")
+ (("from split_fasta import")
+ "from pyfasta.split_fasta import")
+ (("in f.iteritems")
+ "in f.items"))
+ ;; Mapping is imported from collections.abc instead of collections.
+ (substitute* "pyfasta/fasta.py"
+ (("from collections import Mapping")
+ "from collections.abc import Mapping")
+ (("from records import")
+ "from pyfasta.records import"))
+ (substitute* "pyfasta/records.py"
+ (("cPickle") "pickle")
+ ;; "long" is replaced by a second "int" in the type tuple.
+ (("\\(int, long\\)")
+ "(int, int)")
+ ;; XXX: it's not clear if this is really correct.
+ (("buffer\\(self\\)")
+ "memoryview(bytes(str(self), encoding='utf-8'))")
+ (("sys.maxint") "sys.maxsize"))
+ (substitute* "pyfasta/split_fasta.py"
+ (("from cStringIO import")
+ "from io import")
+ (("in lens.iteritems") "in lens.items"))
+ (substitute* "tests/test_all.py"
+ ;; Wrap f.keys() in list() where it is compared against a list.
+ (("f.keys\\(\\)\\) == \\['a-extra'")
+ "list(f.keys())) == ['a-extra'")
+ (("f.iterkeys\\(\\)") "iter(f.keys())")
+ ;; Prefix the matched "tests/data/" path with the current working
+ ;; directory, turning it into an absolute path.
+ (("tests/data/" m)
+ (string-append (getcwd) "/" m))))))))
+ (propagated-inputs (list python-numpy))
+ (native-inputs (list python-nose))
+ (home-page "https://github.com/brentp/pyfasta/")
+ (synopsis "Pythonic access to fasta sequence files")
+ (description
+ "This library provides fast, memory-efficient, pythonic (and
+command-line) access to fasta sequence files. It stores a flattened version
+of a fasta sequence file without spaces or headers and uses either a
+@code{mmap} in numpy binary format or @code{fseek}/@code{fread} so the
+sequence data is never read into memory. It saves a pickle (@code{.gdx}) of
+the start and stop (for @code{fseek}/@code{mmap}) locations of each header in
+the fasta file for internal use.
+
+Note that this package has been deprecated in favor of @code{pyfaidx}.")
+ (license license:expat))))
+
(define-public python-schema-salad
(package
(name "python-schema-salad")
(version "8.2.20211116214159")
(source
- (origin
- (method url-fetch)
- (uri (pypi-uri "schema-salad" version))
- (sha256
- (base32
- "005dh2y45x92zl8sf2sqjmfvcqr4hrz8dfckgkckv87003v7lwqc"))))
+ (origin
+ (method url-fetch)
+ (uri (pypi-uri "schema-salad" version))
+ (sha256
+ (base32
+ "005dh2y45x92zl8sf2sqjmfvcqr4hrz8dfckgkckv87003v7lwqc"))))
(build-system pyproject-build-system)
(arguments
`(#:phases
@@ -5434,6 +5674,35 @@ quantitative phenotypes.")
;; license is the GPL.
(license license:gpl3+)))
+;; SQUID C library and utilities.  Before the 'check' phase, the "Testsuite"
+;; directory of the build tree is prepended to PERL5LIB.
+(define-public eddylab-squid
+  (package
+    (name "eddylab-squid")
+    (version "1.9g")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (string-append "http://eddylab.org/software/squid/squid-"
+                           version ".tar.gz"))
+       (sha256
+        (base32
+         "19ywv1h581a84yyjnp64gwww99vhgbxi8v4rl37xp92ag7l44brh"))))
+    (build-system gnu-build-system)
+    (arguments
+     `(#:phases
+       (modify-phases %standard-phases
+         (add-before 'check 'set-perl-search-path
+           (lambda _
+             ;; Work around "dotless @INC" build failure.  'getenv' returns
+             ;; #f when PERL5LIB is unset, which 'string-append' rejects, so
+             ;; only append the previous value when there is one.
+             (let ((testsuite (string-append (getcwd) "/Testsuite"))
+                   (previous (getenv "PERL5LIB")))
+               (setenv "PERL5LIB"
+                       (if previous
+                           (string-append testsuite ":" previous)
+                           testsuite))))))))
+    (inputs (list perl))
+    (home-page "http://eddylab.org/software.html")
+    (synopsis "C function library for sequence analysis")
+    (description "SQUID is Sean Eddy's personal library of C functions
+and utility programs for sequence analysis.")
+    (license license:gpl2)))
+
(define-public edirect
(package
(name "edirect")
@@ -6073,6 +6342,43 @@ sequences to the genome---manipulating the sequences to produce better mapping
results. The FASTX-Toolkit tools perform some of these preprocessing tasks.")
(license license:agpl3+)))
+;; FLASH is built with the target's C compiler passed as CC; the resulting
+;; "flash" executable is installed by hand.
+(define-public flash
+  (package
+    (name "flash")
+    (version "1.2.11")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (string-append "mirror://sourceforge/flashpage/FLASH-"
+                           version ".tar.gz"))
+       (sha256
+        (base32
+         "1b1ns9ghbcxy92xwa2a53ikqacvnyhvca0zfv0s7986xzvvscp38"))))
+    (build-system gnu-build-system)
+    (arguments
+     (list
+      #:tests? #f                       ;no tests
+      #:make-flags
+      #~(list (string-append "CC=" #$(cc-for-target)))
+      #:phases
+      #~(modify-phases %standard-phases
+          ;; Upstream provides no configure script.
+          (delete 'configure)
+          ;; Upstream has no install target; copy the executable instead.
+          (replace 'install
+            (lambda _
+              (let ((bindir (string-append #$output "/bin")))
+                (install-file "flash" bindir)))))))
+    (inputs
+     (list zlib))
+    (home-page "http://ccb.jhu.edu/software/FLASH/")
+    (synopsis "Merge paired-end nucleotide reads from NGS experiments")
+    (description "FLASH (Fast Length Adjustment of SHort reads) is a tool to
+merge paired-end reads from next-generation sequencing experiments. FLASH is
+designed to merge pairs of reads when the original DNA fragments are shorter
+than twice the length of reads. The resulting longer reads can significantly
+improve genome assemblies. They can also improve transcriptome assembly when
+FLASH is used to merge RNA-seq data.")
+    (license license:gpl3+)))
+
(define-public flexbar
(package
(name "flexbar")
@@ -7365,6 +7671,49 @@ to measure the reproducibility of findings identified from replicate
experiments and provide highly stable thresholds based on reproducibility.")
(license license:gpl2+)))
+;; Isolator is fetched from upstream Git at the commit pinned in the
+;; surrounding 'let' and built with CMake.
+(define-public isolator
+  (let ((revision "1")
+        (commit "24bafc0a102dce213bfc2b5b9744136ceadaba03"))
+    (package
+      (name "isolator")
+      (version (git-version "0.0.2" revision commit))
+      (source
+       (origin
+         (method git-fetch)
+         (uri (git-reference
+               (url "https://github.com/dcjones/isolator.git")
+               (commit commit)))
+         (file-name (git-file-name name version))
+         (sha256
+          (base32
+           "12mbcfqhiggcjvzizf2ff7b05z31i47njcyzcivpw5j74pfbr3dv"))))
+      (build-system cmake-build-system)
+      (arguments
+       (list
+        #:tests? #f                     ;no check target
+        #:phases
+        #~(modify-phases %standard-phases
+            ;; Qualify every "isnan" and "isinf" in these two source files
+            ;; with the std:: namespace.
+            (add-after 'unpack 'fix-std
+              (lambda _
+                (substitute* '("src/summarize.cpp"
+                               "src/shredder.cpp")
+                  (("isnan") "std::isnan")
+                  (("isinf") "std::isinf")))))))
+      (inputs
+       (list boost
+             hdf5
+             zlib))
+      (home-page "https://github.com/dcjones/isolator")
+      (synopsis "Tools for the analysis of RNA-Seq experiments")
+      (description "Isolator analyzes RNA-Seq experiments. Isolator has a
+particular focus on producing stable, consistent estimates. It implements a
+full hierarchical Bayesian model of an entire RNA-Seq experiment. It saves
+all the samples generated by the sampler, which can be processed to compute
+posterior probabilities for arbitrarily complex questions, far beyond the
+confines of pairwise tests. It aggressively corrects for technical effects,
+such as random priming bias, GC-bias, 3' bias, and fragmentation effects.
+Compared to other MCMC approaches, it is exceedingly efficient, though
+generally slower than modern maximum likelihood approaches.")
+      (license license:expat))))
+
(define-public jellyfish
(package
(name "jellyfish")
@@ -7836,6 +8185,45 @@ probabilistic distances of genome abundance and tetranucleotide frequency.")
(license (license:non-copyleft "file://license.txt"
"See license.txt in the distribution."))))
+;; METAL has its 'configure' phase replaced by a Makefile edit; the make
+;; flags set INSTALLDIR to the output's bin directory and name the "all"
+;; target.
+(define-public metal
+  (package
+    (name "metal")
+    (version "2011-03-25")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (string-append "http://csg.sph.umich.edu/abecasis/Metal/"
+                           "download/generic-metal-" version ".tar.gz"))
+       (sha256
+        (base32
+         "1bk00hc0xagmq0mabmbb8bykl75qd4kfyirba869h4x6hmn4a0f3"))))
+    (build-system gnu-build-system)
+    (arguments
+     (list
+      #:make-flags
+      #~(list (string-append "INSTALLDIR=" #$output "/bin") "all")
+      #:tests? #f
+      #:phases
+      #~(modify-phases %standard-phases
+          ;; Instead of running a configure script, prepend "-std=c++11" to
+          ;; the CFLAGS assignment in the Makefile.
+          (replace 'configure
+            (lambda _
+              (substitute* "Makefile"
+                (("^CFLAGS=") "CFLAGS=-std=c++11 ")))))))
+    (inputs
+     (list zlib
+           `(,zlib "static")))
+    (home-page "http://csg.sph.umich.edu/abecasis/Metal/")
+    (synopsis "Facilitate meta-analysis of large datasets")
+    (description "METAL is a tool for meta-analysis genomewide association
+scans. METAL can combine either test statistics and standard errors or
+p-values across studies (taking sample size and direction of effect into
+account). METAL analysis is a convenient alternative to a direct analysis of
+merged data from multiple studies. It is especially appropriate when data
+from the individual studies cannot be analyzed together because of differences
+in ethnicity, phenotype distribution, gender or constraints in sharing of
+individual level data imposed. Meta-analysis results in little or no loss of
+efficiency compared to analysis of a combined dataset including data from all
+individual studies.")
+    (license license:bsd-3)))
+
(define-public minced
(package
(name "minced")
@@ -7928,6 +8316,41 @@ program for nucleotide and protein sequences.")
;; License information found in 'muscle -h' and usage.cpp.
(license license:public-domain)))
+;; MUSIC is fetched from upstream Git at the commit pinned in the surrounding
+;; 'let'; the "bin/MUSIC" executable is installed by hand.
+(define-public music
+  (let ((revision "1")
+        (commit "b1caecdb164b1ab80acccb9463abe2526a56f69f"))
+    (package
+      (name "music")
+      (version (git-version "0.0.0" revision commit))
+      (source
+       (origin
+         (method git-fetch)
+         (uri (git-reference
+               (url "https://github.com/gersteinlab/MUSIC.git")
+               (commit commit)))
+         (file-name (git-file-name name version))
+         (sha256
+          (base32
+           "0arj300h8cpbya7y98g066xsxcg2a65h3y0qs250rlj072f1b4ia"))))
+      (build-system gnu-build-system)
+      (arguments
+       (list
+        #:tests? #f                     ; no "check" target
+        #:phases
+        #~(modify-phases %standard-phases
+            (delete 'configure)
+            ;; The Makefile has no "install" target, so copy the executable
+            ;; into the output's bin directory.
+            (replace 'install
+              (lambda _
+                (install-file "bin/MUSIC"
+                              (string-append #$output "/bin")))))))
+      (home-page "https://github.com/gersteinlab/MUSIC/")
+      (synopsis "Multiscale enrichment calling for ChIP-Seq datasets")
+      (description
+       "MUSIC is an algorithm for identification of enriched regions at
+multiple scales in the read depth signals from ChIP-Seq experiments.")
+      ;; See https://github.com/gersteinlab/MUSIC/issues/6
+      (license license:gpl2+))))
+
(define-public newick-utils
;; There are no recent releases so we package from git.
(let ((commit "da121155a977197cab9fbb15953ca1b40b11eb87"))
@@ -21321,7 +21744,7 @@ single-cell data named @url{https://github.com/PMBio/cardelino, cardelino}.")
(define-public ccwl
(package
(name "ccwl")
- (version "0.2.0")
+ (version "0.3.0")
(source
(origin
(method url-fetch)
@@ -21329,7 +21752,7 @@ single-cell data named @url{https://github.com/PMBio/cardelino, cardelino}.")
version ".tar.lz"))
(sha256
(base32
- "1ar8rfz3zrksgygrv67zv77y8gfvvz54zcs546jn6j28y20basla"))))
+ "0za710mcn9di1njli3dk3660n3836ip8b4msb8f958498va95y7j"))))
(build-system gnu-build-system)
(arguments
`(#:make-flags '("GUILE_AUTO_COMPILE=0") ; to prevent guild warnings
@@ -21340,6 +21763,12 @@ single-cell data named @url{https://github.com/PMBio/cardelino, cardelino}.")
,@%default-gnu-imported-modules)
#:phases
(modify-phases %standard-phases
+ (add-after 'patch-source-shebangs 'patch-more-source-shebangs
+ (lambda* (#:key inputs #:allow-other-keys)
+ (substitute* "scripts/ccwl"
+ (("^exec guile")
+ (string-append "exec "
+ (search-input-file inputs "/bin/guile"))))))
(add-after 'install 'wrap
(lambda* (#:key inputs outputs #:allow-other-keys)
(let ((out (assoc-ref outputs "out"))
@@ -21403,15 +21832,18 @@ based on the pairwise alignment of hidden Markov models (HMMs).")
(define-public wfmash
(package
(name "wfmash")
- (version "0.10.5")
+ (version "0.12.5")
(source
(origin
- (method url-fetch)
- (uri (string-append "https://github.com/ekg/wfmash/releases/download/v"
- version "/wfmash-v" version ".tar.gz"))
+ ;; There are no release tarballs after version 0.10.5.
+ (method git-fetch)
+ (uri (git-reference
+ (url "https://github.com/waveygang/wfmash")
+ (commit (string-append "v" version))))
+ (file-name (git-file-name name version))
(sha256
(base32
- "1jsvnnh14h3ir4l13qhmglhd25kzwvni9apgvr1lbikqwgrpkiq4"))
+ "1qh2chnwp7nqgp88afc4xzdkd21vh6cfqq73siqw7vc0qinqadm6"))
(snippet
#~(begin
(use-modules (guix build utils))
@@ -21437,8 +21869,7 @@ based on the pairwise alignment of hidden Markov models (HMMs).")
(let ((samtools (search-input-file inputs "/bin/samtools")))
;; This is the easiest way to access the data
;; needed for the test suite.
- (symlink (string-append "../wfmash-v" #$version "/data")
- "data")
+ (symlink "../source/data" "data")
(and
;; This test takes 60 minutes on riscv64-linux.
#$@(if (not (target-riscv64?))
@@ -21530,8 +21961,7 @@ based on the pairwise alignment of hidden Markov models (HMMs).")
(lambda _
(invoke "bin/wfmash"
"data/reads.255bps.fa.gz"
- "data/reads.255bps.fa.gz"
- "-X" "-w" "16")))
+ "-w" "16" "-s" "100" "-L")))
(invoke "head" "reads.255bps.paf"))))))))))
(inputs
(list atomic-queue
@@ -21548,7 +21978,7 @@ distances and the wavefront alignment algorithm. It is a fork of MashMap that
implements base-level alignment via the wflign tiled wavefront global
alignment algorithm. It completes MashMap with a high-performance alignment
module capable of computing base-level alignments for very large sequences.")
- (home-page "https://github.com/ekg/wfmash")
+ (home-page "https://github.com/waveygang/wfmash")
(license license:expat)))
(define-public gdcm