-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathDockerfile
More file actions
121 lines (104 loc) · 6.18 KB
/
Dockerfile
File metadata and controls
121 lines (104 loc) · 6.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# ─────────────────────────────────────────────────────────────────────────
# MetaInformAnt — Amalgkit RNA-seq Pipeline Container
#
# Installs all bioinformatics tools and Python deps needed to run the
# streaming RNA-seq pipeline (download → quantify → merge → curate).
#
# Build: docker build -t metainformant-pipeline .
# Run: docker run -v $(pwd)/output:/app/output metainformant-pipeline
# ─────────────────────────────────────────────────────────────────────────
# Base image: slim Python 3.13; the stage is named "base".
FROM python:3.13-slim AS base
LABEL maintainer="docxology"
LABEL description="MetaInformAnt amalgkit RNA-seq pipeline"
# Build-time only: keeps apt/debconf from prompting during the RUN steps of
# this stage without leaking the setting into running containers.
ARG DEBIAN_FRONTEND=noninteractive
# Runtime environment:
#   PYTHONUNBUFFERED=1 — Python writes stdout/stderr unbuffered, so pipeline
#                        logs show up immediately in `docker logs`.
#   PIP_NO_CACHE_DIR=1 — pip keeps no download cache inside image layers.
ENV PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1
# ── System dependencies ─────────────────────────────────────────────────
# Build toolchain (build-essential, cmake, autotools, pkg-config), download and
# compression utilities, and development headers for zlib, HDF5, libxml2,
# libcurl and OpenSSL. The list is alphabetical to keep diffs small; the apt
# index is deleted in the same layer so it never reaches the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        build-essential \
        bzip2 \
        ca-certificates \
        cmake \
        curl \
        git \
        libcurl4-openssl-dev \
        libhdf5-dev \
        libssl-dev \
        libtool \
        libxml2-dev \
        pigz \
        pkg-config \
        wget \
        zlib1g-dev \
 && rm -rf /var/lib/apt/lists/*
# ── R ───────────────────────────────────────────────────────────────────
# Installs R from the distribution's apt repository, then ggplot2 from CRAN.
# NOTE(review): the previous header said "R 4.5"; the actual version is whatever
# the base image's Debian release ships — confirm if a specific version matters.
#
# install.packages() only emits a warning when a package fails to build, so the
# Rscript call would still exit 0. The requireNamespace() check turns a failed
# ggplot2 install into a failed image build instead of a broken image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        r-base \
        r-base-dev \
 && rm -rf /var/lib/apt/lists/* \
 && Rscript -e 'install.packages(c("ggplot2"), repos="https://cloud.r-project.org", quiet=TRUE); if (!requireNamespace("ggplot2", quietly=TRUE)) quit(status=1)'
# ── kallisto 0.48.0 ─────────────────────────────────────────────────────
# Downloads the v0.48.0 Linux release tarball, copies the bundled binary into
# /usr/local/bin, and removes the tarball and unpacked tree in the same layer.
RUN wget -q -O /tmp/kallisto_linux-v0.48.0.tar.gz \
        https://github.com/pachterlab/kallisto/releases/download/v0.48.0/kallisto_linux-v0.48.0.tar.gz \
 && tar -xzf /tmp/kallisto_linux-v0.48.0.tar.gz -C /tmp \
 && cp /tmp/kallisto/kallisto /usr/local/bin/ \
 && rm -rf /tmp/kallisto*
# ── fastp (prebuilt binary) ─────────────────────────────────────────────
# Saves the upstream prebuilt binary as /usr/local/bin/fastp and marks it
# executable.
# NOTE(review): the URL is unversioned and served over plain HTTP, so the build
# gets whatever upstream publishes at build time (the previous header said
# 0.24.0) — consider pinning a versioned URL and verifying a checksum.
RUN wget -q -O /usr/local/bin/fastp http://opengene.org/fastp/fastp \
 && chmod +x /usr/local/bin/fastp
# ── SRA Toolkit (fasterq-dump) ──────────────────────────────────────────
# fasterq-dump is a launcher that execs a versioned sibling binary
# (fasterq-dump.VERSION), so the toolkit tree is kept intact under
# /opt/sratoolkit and reached through PATH rather than copying single files.
#
# The final step writes /root/.ncbi/user-settings.mkfg with three keys:
#   /LIBS/GUID                            — uuidgen output when that command is
#                                           available, else 'docker-container'
#   /libs/cloud/report_instance_identity  — "false"
#   /repository/user/main/public/root     — "/tmp/sra-cache"
RUN wget -q -O /tmp/sratoolkit.3.2.1-ubuntu64.tar.gz \
        https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/3.2.1/sratoolkit.3.2.1-ubuntu64.tar.gz \
 && tar -xzf /tmp/sratoolkit.3.2.1-ubuntu64.tar.gz -C /tmp \
 && mv /tmp/sratoolkit.3.2.1-ubuntu64 /opt/sratoolkit \
 && rm -rf /tmp/sratoolkit* \
 && mkdir -p /root/.ncbi \
 && printf '/LIBS/GUID = "%s"\n/libs/cloud/report_instance_identity = "false"\n/repository/user/main/public/root = "/tmp/sra-cache"\n' "$(uuidgen 2>/dev/null || echo 'docker-container')" \
        > /root/.ncbi/user-settings.mkfg
ENV PATH="/opt/sratoolkit/bin:${PATH}"
# ── UV for fast Python dependency resolution ────────────────────────────
RUN pip install uv
# ── Application code ────────────────────────────────────────────────────
# /app is the working directory for every later instruction and for the
# container at runtime. Project metadata and the source, script and config
# trees are copied into it with explicit destinations.
WORKDIR /app
COPY pyproject.toml /app/
COPY src/ /app/src/
COPY scripts/ /app/scripts/
COPY config/ /app/config/
# ── Python & tool environment ──────────────────────────────────────────
# /opt/conda/bin is placed first on PATH so the conda-installed tools
# (amalgkit, seqkit, fastp, kallisto) win over same-named binaries elsewhere.
ENV PATH="/opt/conda/bin:${PATH}"
# Steps, all in one layer:
#   1. Install the project in editable mode, with its Python dependencies,
#      into the system interpreter.
#   2. Download micromamba and create the /opt/conda environment holding the
#      command-line tools from conda-forge/bioconda.
#   3. Drop micromamba's package cache so it is not stored in the layer.
#   4. Remove /opt/conda/bin/kallisto_orig if present
#      (NOTE(review): purpose not visible in this file — confirm it is unused).
#   5. Link the tools into /usr/local/bin. `ln -sf` is required: an earlier
#      step already installs a fastp binary at /usr/local/bin/fastp, and a
#      plain `ln -s` fails with "File exists", aborting the whole build.
#   6. Keep the micromamba binary in /usr/local/bin and clean up /tmp.
# `cd` is used instead of WORKDIR so the image's working directory stays /app.
RUN uv pip install --system -e "." \
 && cd /tmp \
 && wget -qO micromamba.tar.bz2 "https://micro.mamba.pm/api/micromamba/linux-64/latest" \
 && tar -xjf micromamba.tar.bz2 bin/micromamba \
 && export MAMBA_ROOT_PREFIX=/opt/conda \
 && bin/micromamba create -y -p /opt/conda -c conda-forge -c bioconda amalgkit seqkit fastp kallisto \
 && bin/micromamba clean --all --yes \
 && rm -f /opt/conda/bin/kallisto_orig \
 && ln -sf /opt/conda/bin/amalgkit /usr/local/bin/amalgkit \
 && ln -sf /opt/conda/bin/seqkit /usr/local/bin/seqkit \
 && ln -sf /opt/conda/bin/fastp /usr/local/bin/fastp \
 && mv bin/micromamba /usr/local/bin/micromamba \
 && rm -rf /tmp/micromamba* bin
# ── NCBI taxonomy dump ──────────────────────────────────────────────────
# Fetches taxdump.tar.gz at build time into root's ete data directory so the
# pipeline does not stall downloading it during metadata initialization.
# NOTE(review): presumably this is the path ete4 looks in — confirm.
RUN mkdir -p /root/.local/share/ete \
 && wget -q \
        -O /root/.local/share/ete/taxdump.tar.gz \
        https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
# ── Output volume ───────────────────────────────────────────────────────
# The amalgkit output directory is created before VOLUME is declared, because
# build-time writes made to a path after its VOLUME declaration are discarded.
RUN mkdir -p /app/output/amalgkit
VOLUME /app/output
# ── Healthcheck ─────────────────────────────────────────────────────────
# Every 60 s, checks that the metainformant package still imports; each probe
# may take up to 10 s, and three consecutive failures mark the container
# unhealthy. `|| exit 1` maps any failure onto exit status 1.
HEALTHCHECK \
    --interval=60s \
    --timeout=10s \
    --retries=3 \
    CMD python3 -c "import metainformant; print('ok')" || exit 1
# ── Entrypoint ──────────────────────────────────────────────────────────
# Default: run the full species pipeline.
# Override max-gb/workers/threads via environment variables, e.g.
#   docker run -e PIPELINE_WORKERS=8 -e PIPELINE_THREADS=16 …
ENV PIPELINE_MAX_GB=20.0 \
    PIPELINE_WORKERS=80 \
    PIPELINE_THREADS=96
# Exec-form CMD with an explicit shell: the shell expands the PIPELINE_*
# variables at container start, then `exec` replaces it with Python, so the
# pipeline runs as PID 1 and receives SIGTERM from `docker stop` directly.
# The expansions are quoted so each value is passed as exactly one argument.
CMD ["/bin/sh", "-c", "exec python3 scripts/rna/run_all_species.py --max-gb \"${PIPELINE_MAX_GB}\" --workers \"${PIPELINE_WORKERS}\" --threads \"${PIPELINE_THREADS}\""]