diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 000000000..9c4feb456
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,38 @@
+# BitNet.cpp Docker Image
+# For running 1-bit LLM inference on CPU
+
+FROM python:3.11-slim
+
+# Install build dependencies (sorted; skip recommended extras to keep the image small)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    cmake \
+    git \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Copy project files (.git must be included in the build context for the
+# submodule step below; exclude build output and caches via .dockerignore)
+COPY . .
+
+# Initialize git submodules
+RUN git submodule update --init --recursive
+
+# Build the project
+RUN cmake -B build && cmake --build build --config Release
+
+# Expose the built binaries to Python. PYTHONPATH is unset in the base
+# image, so appending ":$PYTHONPATH" would leave a trailing empty entry,
+# which Python treats as the current working directory on sys.path.
+ENV PYTHONPATH=/app/build/bin
+
+# Run as a non-root user (build steps above still run as root)
+RUN useradd --system --uid 10001 --no-create-home app \
+    && chown -R app:app /app
+USER app
+
+# Default command - show help
+CMD ["python3", "run_inference.py", "--help"]
diff --git a/README.md b/README.md
index 3bb25596e..6dce2cd62 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 
 Try it out via this [demo](https://demo-bitnet-h0h8hcfqeqhrf5gf.canadacentral-01.azurewebsites.net/), or build and run it on your own [CPU](https://github.com/microsoft/BitNet?tab=readme-ov-file#build-from-source) or [GPU](https://github.com/microsoft/BitNet/blob/main/gpu/README.md).
 
-bitnet.cpp is the official inference framework for 1-bit LLMs (e.g., BitNet b1.58). It offers a suite of optimized kernels, that support **fast** and **lossless** inference of 1.58-bit models on CPU and GPU (NPU support will coming next).
+bitnet.cpp is the official inference framework for 1-bit LLMs (e.g., BitNet b1.58). It offers a suite of optimized kernels that support **fast** and **lossless** inference of 1.58-bit models on CPU and GPU (NPU support will be coming next).
 
 The first release of bitnet.cpp is to support inference on CPUs. bitnet.cpp achieves speedups of **1.37x** to **5.07x** on ARM CPUs, with larger models experiencing greater performance gains. Additionally, it reduces energy consumption by **55.4%** to **70.0%**, further boosting overall efficiency. 
On x86 CPUs, speedups range from **2.37x** to **6.17x** with energy reductions between **71.9%** to **82.2%**. Furthermore, bitnet.cpp can run a 100B BitNet b1.58 model on a single CPU, achieving speeds comparable to human reading (5-7 tokens per second), significantly enhancing the potential for running LLMs on local devices. Please refer to the [technical report](https://arxiv.org/abs/2410.16144) for more details.