forked from Unstructured-IO/unstructured
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDockerfile
More file actions
88 lines (69 loc) · 3.45 KB
/
Dockerfile
File metadata and controls
88 lines (69 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Base stage: Wolfi base image plus every system-level dependency.
# NOTE(review): `latest` is a floating tag, so rebuilds are not reproducible; consider pinning
# the image by digest.
FROM cgr.dev/chainguard/wolfi-base:latest AS base

# Interpreter name and the pip invocation derived from it; both are reused by later build steps.
ARG PYTHON=python3.12
ARG PIP="${PYTHON} -m pip"

# System packages have to be installed as root.
USER root
WORKDIR /app

# Only the locally supplied .apk files are needed by the system-package layer below, so they are
# the only thing copied ahead of it.
COPY ./docker/packages/*.apk /tmp/packages/

# One layer that, in order:
#   - installs the OS packages and the local pandoc package, then drops the package files
#   - clones tessdata and tessconfigs and copies their contents into /usr/local/share/tessdata,
#     deleting each clone afterwards
#   - exposes soffice.bin as both `libreoffice` and `soffice` in /usr/bin
#   - installs fonts, upgrades the packaged pip and rebuilds the font cache
#   - points /usr/bin/python3 at the interpreter selected by PYTHON
RUN apk update && \
    apk add \
      bash \
      cmake \
      git \
      glib \
      libmagic \
      libreoffice \
      libxml2 \
      mesa-gl \
      mesa-libgallium \
      openjpeg \
      poppler \
      poppler-glib \
      poppler-utils \
      py3.12-pip \
      python-3.12 \
      python-3.12-base \
      tesseract \
      wget && \
    apk add --allow-untrusted /tmp/packages/pandoc-3.1.8-r0.apk && \
    rm -rf /tmp/packages && \
    git clone --depth 1 https://github.com/tesseract-ocr/tessdata.git /tmp/tessdata && \
    mkdir -p /usr/local/share/tessdata && \
    cp /tmp/tessdata/*.traineddata /usr/local/share/tessdata && \
    rm -rf /tmp/tessdata && \
    git clone --depth 1 https://github.com/tesseract-ocr/tessconfigs /tmp/tessconfigs && \
    cp -r /tmp/tessconfigs/configs /usr/local/share/tessdata && \
    cp -r /tmp/tessconfigs/tessconfigs /usr/local/share/tessdata && \
    rm -rf /tmp/tessconfigs && \
    apk cache clean && \
    ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/libreoffice && \
    ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/soffice && \
    chmod +x /usr/lib/libreoffice/program/soffice.bin && \
    apk add --no-cache font-ubuntu fontconfig && \
    apk upgrade --no-cache py3.12-pip && \
    fc-cache -fv && \
    ln -sf /usr/bin/$PYTHON /usr/bin/python3

# Project files are copied only after the system-package layer: that layer never reads them, so
# editing the sources no longer invalidates its build cache.
COPY ./requirements requirements/
COPY unstructured unstructured
COPY test_unstructured test_unstructured
COPY example-docs example-docs
# Identity of the unprivileged account. NB_UID is used as both the user id and the group id.
ARG NB_UID=1000
ARG NB_USER=notebook-user

# Create a group named after the user, then the password-less user inside that group.
RUN addgroup --gid ${NB_UID} ${NB_USER} \
 && adduser --uid ${NB_UID} -G ${NB_USER} --disabled-password --gecos "" ${NB_USER}

# Expose the account name and its home directory to later instructions and to the container.
ENV USER=${NB_USER} \
    HOME=/home/${NB_USER}

# Place the LibreOffice initialisation script in the user's home, owned by that user.
COPY --chown=${NB_USER} scripts/initialize-libreoffice.sh ${HOME}/initialize-libreoffice.sh
# Remove unused Python versions.
# Every path is removed with -f so the step succeeds whether or not a given version is present;
# previously the bare `rm /usr/bin/python3.13` aborted the build when that file did not exist.
# NOTE(review): deleting files in a separate layer hides them but does not shrink the layers
# that added them.
RUN rm -rf \
      /usr/lib/python3.10 \
      /usr/lib/python3.11 \
      /usr/lib/python3.13 \
      /usr/bin/python3.13
# Switch to the unprivileged account. NB_USER is used instead of a literal name so that the step
# keeps working when the build argument is overridden.
USER ${NB_USER}
WORKDIR ${HOME}
# Initialize libreoffice config as non-root user (required for soffice to work properly)
# See: https://github.com/Unstructured-IO/unstructured/issues/3105
# The script is deleted once it has run.
RUN ./initialize-libreoffice.sh && rm initialize-libreoffice.sh
# Return to the application directory for the remaining build steps.
WORKDIR /app
# Append the user-level bin directory to PATH before pip install to avoid warning logs; it also
# avoids issues with packages that need compilation during installation.
# HOME (set earlier from NB_USER) is used instead of a literal /home/notebook-user so the paths
# follow the configured account.
ENV PATH="${PATH}:${HOME}/.local/bin"
# Directory populated with the tesseract traineddata and config files.
ENV TESSDATA_PREFIX=/usr/local/share/tessdata
# Directory that receives the NLTK downloads.
ENV NLTK_DATA=${HOME}/nltk_data
# Upgrade pip to fix CVE-2025-8869
# Performed as a per-user install and without keeping pip's download cache.
RUN ${PIP} install \
      --upgrade \
      --user \
      --no-cache-dir \
      "pip>=25.3"
# Install Python dependencies and download required NLTK packages.
# Each requirements file except test.txt, dev.txt and constraints.txt is installed with its own
# pip invocation. The files are installed from an explicit loop rather than `find -exec ... ;`,
# because find does not propagate the exit status of the command it runs: a failed pip install
# would otherwise go unnoticed. The loop aborts the build on the first failure, and a failure of
# find itself still stops the build through the `&&` after the assignment.
# After the installs: fetch the NLTK data, run the unstructured model initialisation and load the
# table-transformer model.
RUN req_files="$(find requirements/ -type f -name "*.txt" ! -name "test.txt" ! -name "dev.txt" ! -name "constraints.txt")" && \
    for req in $req_files; do \
      $PIP install --no-cache-dir --user -r "$req" || exit 1; \
    done && \
    mkdir -p ${NLTK_DATA} && \
    $PYTHON -m nltk.downloader -d ${NLTK_DATA} punkt_tab averaged_perceptron_tagger_eng && \
    $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
    $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
# NOTE(review): presumably switches Hugging Face Hub access to offline mode now that the model
# has been initialised during the build - confirm against the huggingface_hub documentation.
ENV HF_HUB_OFFLINE=1
# Root is needed once more for the package removal and the ownership change below.
USER root
# Remove setuptools to remove jaraco.context to fix GHSA-58pv-8j8x-9vj2
# Then hand /app to the unprivileged account. NB_USER names both the user and its group, so the
# step follows the build argument instead of a hard-coded account name.
RUN $PIP uninstall -y setuptools && \
    chown -R ${NB_USER}:${NB_USER} /app
# The image runs as the unprivileged account and starts an interactive shell by default.
USER ${NB_USER}
CMD ["/bin/bash"]