# Merging all the Spark-related tools into one image saves a LOT of space
# over having separate images for each, especially given they all share
# the same underlying infrastructure. It's still just as easy to set up
# and run the various services via Docker Compose.
#
# NOTE(review): Debian buster is past end-of-life; confirm its apt
# repositories still resolve, or plan a move to a newer Debian release.
FROM python:3.11-slim-buster AS python-base

# History: an earlier version of this image used python:3.6-alpine, but it
# was dropped after getting tied in knots with Python dependencies around
# numpy. Pip under Alpine builds all modules from source. You can try to be
# clever and install the bigger modules like numpy via apk, but the version
# pip wants will most likely differ from what the base Alpine image supplies
# pre-built, so you end up building numpy from source regardless, which
# takes forever (well, 10 minutes ;). No combination was found that worked.
#
# The image has since moved to a Debian-based Python 3.11 base and Spark 3
# (see SPARK_MAJOR_VERSION below), so the Alpine/numpy problem no longer
# applies.

# Ideally we should run as a non-root user, but it's problematic to set
# up shared files, especially if running Docker within, say, VirtualBox.
# See https://vsupalov.com/docker-shared-permissions/
# ARG NB_USER="pyspark"
# ARG NB_UID="1000"
# ARG NB_GID="1000"

# Base settings shared by every stage. Values in this block must be
# literals: anything derived from them is defined in the ENV blocks below.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=off \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100 \
    VENV_PATH="/opt/env" \
    # download mirrors
    SPARK_MIRROR="https://archive.apache.org/dist" \
    GRAPHFRAMES_MIRROR="https://repos.spark-packages.org" \
    KAFKA_MIRROR="https://archive.apache.org/dist" \
    # Spark
    SPARK_MAJOR_VERSION="3.2" \
    SPARK_MINOR_VERSION="4" \
    HADOOP_VERSION="2.7" \
    SPARK_INSTALL="/usr/local" \
    SPARK_HOSTNAME="localhost" \
    SPARK_MASTER_PORT="7077" \
    SPARK_MASTER_WEBUI_PORT="8080" \
    # Version of the py4j source zip bundled under $SPARK_HOME/python/lib.
    # It must track the Spark release: bump it whenever the Spark version
    # above changes (verify with `ls $SPARK_HOME/python/lib`).
    PY4J_VERSION="0.10.9.5" \
    # Scala
    SCALA_VERSION="2.12" \
    # graphframes
    GRAPHFRAMES_VERSION="0.8.2" \
    # Kafka
    KAFKA_INSTALL="/usr/local" \
    KAFKA_VERSION="3.5.1" \
    # PySpark
    PYSPARK_KERNEL_NAME="PySpark" \
    PYSPARK_PYTHON="/usr/local/bin/python" \
    KERNELS_TEMPLATE_PATH="/tmp" \
    KERNELS_DIR_PATH="/root/.local/share/jupyter/kernels"

# These environment variables need to be defined separately because you
# can't reference prior environment variables in the same ENV block.
ENV \
    # Spark
    SPARK_VERSION="$SPARK_MAJOR_VERSION.$SPARK_MINOR_VERSION" \
    SPARK_HOME="$SPARK_INSTALL/spark" \
    SPARK_MASTER="spark://$SPARK_HOSTNAME:$SPARK_MASTER_PORT" \
    # Kafka
    KAFKA_HOME="$KAFKA_INSTALL/kafka" \
    # PySpark
    PYSPARK_DRIVER_PYTHON="$PYSPARK_PYTHON"

# Ditto: these depend on SPARK_HOME and SPARK_MASTER from the block above.
ENV PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-${PY4J_VERSION}-src.zip" \
    PYTHONSTARTUP="$SPARK_HOME/python/pyspark/shell.py" \
    PYSPARK_SUBMIT_ARGS="--master $SPARK_MASTER pyspark-shell"

# builder-base is used to build dependencies
FROM python-base AS builder-base

# need gcc, python-dev to build psutil (dependency of something in the pip install)
ARG BUILD_PACKAGES="wget gnupg software-properties-common apt-transport-https gcc python-dev"

# Put the virtual environment first so that `pip` and `python` resolve to
# it once it has been created.
ENV PATH="${VENV_PATH}/bin:$PATH"

# Coalescing the RUNs saves quite a significant amount of space
# (about 10% just for Spark alone).
# <https://www.dajobe.org/blog/2015/04/18/making-debian-docker-images-smaller/>
#
# Every step is chained with && so that any failure (apt, a download, an
# unpack) aborts the build instead of producing a silently incomplete image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends $BUILD_PACKAGES && \
    # Add Temurin key and source: <https://adoptium.net/installation/linux/>
    # The key is written straight to its file (no pipe) so that a failed
    # download fails the build rather than being masked by a later command.
    mkdir -p /etc/apt/keyrings && \
    wget -q -O /etc/apt/keyrings/adoptium.asc https://packages.adoptium.net/artifactory/api/gpg/key/public && \
    echo "deb [signed-by=/etc/apt/keyrings/adoptium.asc] https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" > /etc/apt/sources.list.d/adoptium.list && \
    # Create Python virtual environment
    python3 -m venv $VENV_PATH && \
    # Download, install, and symlink spark
    cd $SPARK_INSTALL && \
    wget -q --show-progress --progress=bar:force:noscroll $SPARK_MIRROR/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz 2>&1 && \
    tar xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz && \
    ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION spark && \
    # Download and install graphframes into Spark's jars directory
    cd $SPARK_INSTALL/spark/jars && \
    wget -q --show-progress --progress=bar:force:noscroll $GRAPHFRAMES_MIRROR/graphframes/graphframes/$GRAPHFRAMES_VERSION-spark$SPARK_MAJOR_VERSION-s_$SCALA_VERSION/graphframes-$GRAPHFRAMES_VERSION-spark$SPARK_MAJOR_VERSION-s_$SCALA_VERSION.jar 2>&1 && \
    # Download and unpack Kafka (it is symlinked in the production stage)
    cd $SPARK_INSTALL && \
    wget -q --show-progress --progress=bar:force:noscroll $KAFKA_MIRROR/kafka/$KAFKA_VERSION/kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz 2>&1 && \
    tar xzf kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz && \
    # Install required modules for PySpark into the virtual environment
    pip install --upgrade pip && \
    pip install jupyterlab kafka-python graphframes
    # Tornado 6 breaks sparkmonitor
    # tornado==5.1 \
    # sparkmonitor \

# production carries only the built artifacts, not the build packages.
FROM python-base AS production

COPY --from=builder-base $VENV_PATH $VENV_PATH

# NOTE(review): PYTHONSTARTUP points at startup.py, but the COPY that would
# install that file (see "Startup scripts" below) is commented out. Confirm
# whether startup.py should be restored or this override removed.
ENV PATH="${VENV_PATH}/bin:$PATH" \
    PYTHONSTARTUP="$SPARK_HOME/python/pyspark/startup.py"

# Unpacked Spark and Kafka trees, plus the Temurin apt key and source list
# prepared in the builder stage.
COPY --from=builder-base $SPARK_INSTALL/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION $SPARK_INSTALL/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION
COPY --from=builder-base $SPARK_INSTALL/kafka_$SCALA_VERSION-$KAFKA_VERSION $SPARK_INSTALL/kafka_$SCALA_VERSION-$KAFKA_VERSION
COPY --from=builder-base /etc/apt/keyrings/adoptium.asc /etc/apt/keyrings/adoptium.asc
COPY --from=builder-base /etc/apt/sources.list.d/adoptium.list /etc/apt/sources.list.d/adoptium.list

RUN cd $SPARK_INSTALL && \
    # Version-independent paths: these are what SPARK_HOME and KAFKA_HOME name.
    ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION spark && \
    ln -s kafka_$SCALA_VERSION-$KAFKA_VERSION kafka && \
    apt-get update && apt-get install -y --no-install-recommends temurin-11-jdk tini procps && \
    pip install findspark && \
    ipython kernel install --name "$PYSPARK_KERNEL_NAME" && \
    # Clean up some crap, like the 50MB of JDK source and various documentation.
    # The architecture suffix is globbed so the cleanup is not amd64-only.
    rm -rf /usr/lib/jvm/temurin-11-jdk-*/src.zip /usr/lib/jvm/temurin-11-jdk-*/sample /usr/lib/jvm/temurin-11-jdk-*/man && \
    rm -rf $SPARK_HOME/examples $KAFKA_HOME/site-docs && \
    # Clear out as many caches as possible.
    apt-get -y --purge autoremove && \
    apt-get clean -y && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf /tmp/* && \
    rm -rf /var/cache/* && \
    rm -rf /root/.cache
    # && \
    # # Technically dodgy, but it's not as if we're going to mess with packages later...
    # rm -rf /var/lib/dpkg/info

# Startup scripts
COPY start-master.sh start-worker.sh start-jupyter.sh start-kafka.sh /usr/local/bin/
#COPY startup.py $SPARK_HOME/python/pyspark

# Configuration files
COPY spark-defaults.conf $SPARK_HOME/conf/
COPY server.properties $KAFKA_HOME/config/

# PySpark kernel, based on the template from
# <https://github.com/Anchormen/pyspark-jupyter-kernels>.
COPY kernel.new.json $KERNELS_DIR_PATH/$PYSPARK_KERNEL_NAME/kernel.json

# Spark doesn't seem to respond directly to SIGTERM as the exit status is
# for SIGKILL (137), after a pause. Presumably docker compose down times out.
# Using tini gives immediate exit with status 143 (SIGTERM).
# tini is the container's entrypoint; the selected CMD is passed to it as
# the command to run.
ENTRYPOINT ["/usr/bin/tini", "--"]

# Default service. Override this in your docker-compose.yml or at the
# command line with one of:
#   Spark master (default): /usr/local/bin/start-master.sh
#   Spark worker:           /usr/local/bin/start-worker.sh
#   PySpark:                /usr/local/bin/start-jupyter.sh
#   Kafka:                  /usr/local/bin/start-kafka.sh
CMD ["/usr/local/bin/start-master.sh"]

# debugging
# CMD ["bash"]