From 946786dca5c14edf1eff8b030c5f838afdf7cf8d Mon Sep 17 00:00:00 2001
From: Thomas Forgione
Date: Mon, 9 Mar 2026 14:30:33 +0100
Subject: [PATCH] Initial commit

---
 .gitignore         |  1 +
 Dockerfile         | 96 ++++++++++++++++++++++++++++++++++++++++++++++
 README.md          | 23 +++++++++++
 docker-compose.yml | 21 ++++++++++
 4 files changed, 141 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 README.md
 create mode 100644 docker-compose.yml

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..604f0f2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+models
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..d0a32ad
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,96 @@
+ARG UBUNTU_VERSION=24.04
+
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=13.1.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
+
+WORKDIR /
+RUN git clone https://github.com/ggml-org/llama.cpp app
+
+WORKDIR /app
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ${BASE_CUDA_RUN_CONTAINER} AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    python3-wheel \
+    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --break-system-packages -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..22a1564
--- /dev/null
+++ b/README.md
@@ -0,0 +1,23 @@
+# llama-cpp-docker
+
+*A simple Dockerfile / docker-compose setup for running llama.cpp*
+
+### Usage
+
+Put your models (plus the `model-presets.ini` preset file that the compose command expects) in `models`, then run `docker-compose up -d --build`.
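+
+### Testing the server
+
+Once the container is up, llama-server is reachable on host port 3000 (the
+mapping set in `docker-compose.yml`). A quick smoke test from the host (the
+model name below is a placeholder; use one defined in your preset file):
+
+```sh
+# Liveness check (the same /health endpoint the Dockerfile HEALTHCHECK uses)
+curl http://localhost:3000/health
+
+# OpenAI-compatible chat completion
+curl http://localhost:3000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{"model": "my-model", "messages": [{"role": "user", "content": "Hello!"}]}'
+```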
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..5227c92
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,21 @@
+services:
+  llm:
+    build: .
+    container_name: server
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    restart: unless-stopped
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+    volumes:
+      - ./models:/models
+    ports:
+      - 3000:8080
+    command: --models-preset /models/model-presets.ini --fit on --fit-target 1024
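+    # Note: the command above assumes a llama-server preset file exists at
+    # models/model-presets.ini on the host (mounted at /models in the container).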