From 946786dca5c14edf1eff8b030c5f838afdf7cf8d Mon Sep 17 00:00:00 2001
From: Thomas Forgione
Date: Mon, 9 Mar 2026 14:30:33 +0100
Subject: [PATCH] Initial commit

---
 .gitignore         |  1 +
 Dockerfile         | 96 ++++++++++++++++++++++++++++++++++++++++++++++
 README.md          | 23 +++++++++++
 docker-compose.yml | 21 ++++++++++
 4 files changed, 141 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 README.md
 create mode 100644 docker-compose.yml

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..604f0f2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+models
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..d0a32ad
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,96 @@
+ARG UBUNTU_VERSION=24.04
+
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=13.1.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
+
+WORKDIR /
+RUN git clone https://github.com/ggml-org/llama.cpp app
+
+WORKDIR /app
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ${BASE_CUDA_RUN_CONTAINER} AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    python3-wheel \
+    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --break-system-packages -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..22a1564
--- /dev/null
+++ b/README.md
@@ -0,0 +1,23 @@
+# llama-cpp-docker
+
+*A simple Dockerfile / docker-compose setup for running llama.cpp*
+
+### Usage
+
+Put your models (plus the `model-presets.ini` preset file that the compose command expects) in `models`, then run `docker-compose up -d --build`.
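+
+### Testing the server
+
+Once the container is up, llama-server is reachable on host port 3000 (the
+mapping set in `docker-compose.yml`). A quick smoke test from the host (the
+model name below is a placeholder; use one defined in your preset file):
+
+```sh
+# Liveness check (the same /health endpoint the Dockerfile HEALTHCHECK uses)
+curl http://localhost:3000/health
+
+# OpenAI-compatible chat completion
+curl http://localhost:3000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{"model": "my-model", "messages": [{"role": "user", "content": "Hello!"}]}'
+```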
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..5227c92
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,21 @@
+services:
+  llm:
+    build: .
+    container_name: server
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    restart: unless-stopped
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+    volumes:
+      - ./models:/models
+    ports:
+      - 3000:8080
+    command: --models-preset /models/model-presets.ini --fit on --fit-target 1024
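+    # Note: the command above assumes a llama-server preset file exists at
+    # models/model-presets.ini on the host (mounted at /models in the container).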