# docker-analytics / spark-pyspark-kafka / Dockerfile
# Merging all the Spark-related tools into one image saves a LOT of space
# over having separate images for each, especially given they all share
# the same underlying infrastructure. It’s still just as easy to set up
# and run the various services via Docker Compose.

FROM python:3.6-slim-buster AS python-base
# Why Debian slim and not python:3.6-alpine: Alpine was the first choice,
# but the Python dependency chain around numpy became unmanageable. Under
# Alpine, pip builds every module from source. Installing the big modules
# (numpy in particular) through apk does not help, because pip will most
# likely want a different numpy version from the pre-built one that the
# Alpine base supplies, so numpy ends up being compiled from source anyway,
# which takes forever (well, 10 minutes ;). No working combination was found.
#
# A newer Python would probably make this go away, but that means moving
# to Spark 3 (2.4 doesn't support 3.8 or later), and there has been no time
# to test that yet. Hence Python 3.6: if it ain't broke...

# Running as a non-root user would be preferable, but sharing files then
# becomes awkward, especially when Docker itself runs inside something
# like VirtualBox. See https://vsupalov.com/docker-shared-permissions/
# ARG NB_USER="pyspark"
# ARG NB_UID="1000"
# ARG NB_GID="1000"

# Python interpreter, pip, and virtual environment location.
# NOTE(review): pip appears to treat any valid boolean in PIP_NO_CACHE_DIR,
# including "off", as "disable the cache" — confirm against the pip
# documentation before changing the value.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=off \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100 \
    VENV_PATH="/opt/env"

# Download mirrors.
ENV SPARK_MIRROR="https://archive.apache.org/dist" \
    GRAPHFRAMES_MIRROR="https://repos.spark-packages.org" \
    KAFKA_MIRROR="https://archive.apache.org/dist"

# Spark: version, install location, and master endpoint.
ENV SPARK_VERSION="2.4.8" \
    HADOOP_VERSION="2.7" \
    SPARK_INSTALL="/usr/local" \
    SPARK_HOSTNAME="localhost" \
    SPARK_MASTER_PORT="7077" \
    SPARK_MASTER_WEBUI_PORT="8080"

# graphframes.
ENV GRAPHFRAMES_VERSION="0.8.1-spark2.4-s_2.11"

# Kafka: install location and versions.
ENV KAFKA_INSTALL="/usr/local" \
    KAFKA_VERSION="2.2.2" \
    SCALA_VERSION="2.11"

# PySpark: Jupyter kernel name and paths, and the Python interpreter to use.
# NOTE(review): PYSPARK_PYTHON is the system interpreter rather than the one
# under VENV_PATH — confirm that this is intended.
ENV PYSPARK_KERNEL_NAME="PySpark" \
    PYSPARK_PYTHON="/usr/local/bin/python" \
    KERNELS_TEMPLATE_PATH="/tmp" \
    KERNELS_DIR_PATH="/root/.local/share/jupyter/kernels"

# Values derived from the variables above. They need their own ENV
# instruction because a variable cannot be referenced from within the
# same ENV instruction that defines it.
ENV SPARK_HOME="${SPARK_INSTALL}/spark" \
    SPARK_MASTER="spark://${SPARK_HOSTNAME}:${SPARK_MASTER_PORT}" \
    KAFKA_HOME="${KAFKA_INSTALL}/kafka" \
    PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON}"

# Second level of derived values (these reference SPARK_HOME and
# SPARK_MASTER), so once again a separate ENV instruction is required.
ENV PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip" \
    PYTHONSTARTUP="${SPARK_HOME}/python/pyspark/shell.py" \
    PYSPARK_SUBMIT_ARGS="--master ${SPARK_MASTER} pyspark-shell"

# builder-base downloads and builds everything that the production stage
# later copies out: the Adoptium apt key and source list, the Python virtual
# environment, and the unpacked Spark (plus graphframes jar) and Kafka trees.
FROM python-base AS builder-base

# Packages needed only while building; this stage is discarded afterwards.
ARG BUILD_PACKAGES="wget gnupg software-properties-common apt-transport-https"

# Put the virtual environment first on PATH so that the pip invocations
# below install into it once it has been created.
ENV PATH="${VENV_PATH}/bin:$PATH"

# Coalescing the RUNs saves quite a significant amount of space
# (about 10% just for Spark alone).
# <https://www.dajobe.org/blog/2015/04/18/making-debian-docker-images-smaller/>
#
# Every step is chained with && so that any failed step (in particular a
# failed download) aborts the build rather than yielding an image that is
# silently missing a component.
RUN apt-get update && \
    # BUILD_PACKAGES is deliberately unquoted: it is a space-separated list.
    apt-get install -y --no-install-recommends $BUILD_PACKAGES && \
    # Add Temurin key and source: <https://adoptium.net/installation/linux/>
    # The key is written directly by wget (no pipe), so a failed download
    # fails the build instead of leaving an empty key file behind.
    mkdir -p /etc/apt/keyrings && \
    wget -nv -O /etc/apt/keyrings/adoptium.asc https://packages.adoptium.net/artifactory/api/gpg/key/public && \
    echo "deb [signed-by=/etc/apt/keyrings/adoptium.asc] https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" > /etc/apt/sources.list.d/adoptium.list && \
    # Create Python virtual environment
    python3 -m venv "$VENV_PATH" && \
    # Download, install, and symlink spark
    cd "$SPARK_INSTALL" && \
    wget -q --show-progress --progress=bar:force:noscroll $SPARK_MIRROR/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz 2>&1 && \
    tar xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz && \
    ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION spark && \
    # Download and install graphframes (the jar goes alongside Spark's own jars)
    cd "$SPARK_INSTALL/spark/jars" && \
    wget -q --show-progress --progress=bar:force:noscroll $GRAPHFRAMES_MIRROR/graphframes/graphframes/$GRAPHFRAMES_VERSION/graphframes-$GRAPHFRAMES_VERSION.jar 2>&1 && \
    # Download and install Kafka under KAFKA_INSTALL, the directory that
    # KAFKA_HOME is derived from.
    cd "$KAFKA_INSTALL" && \
    wget -q --show-progress --progress=bar:force:noscroll $KAFKA_MIRROR/kafka/$KAFKA_VERSION/kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz 2>&1 && \
    tar xzf kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz && \
    # Install required modules for PySpark into the virtual environment
    pip install --upgrade pip && \
    pip install jupyterlab kafka-python graphframes
        # Tornado 6 breaks sparkmonitor
        # tornado==5.1 \
        # sparkmonitor \

# production is the final image: it takes the pre-built artifacts from
# builder-base and adds only the runtime packages (JDK and tini).
FROM python-base AS production

# Python virtual environment with JupyterLab, kafka-python, and graphframes.
COPY --from=builder-base $VENV_PATH $VENV_PATH

ENV PATH="${VENV_PATH}/bin:$PATH"

# Unpacked Spark and Kafka distributions (the symlinks are recreated below).
COPY --from=builder-base $SPARK_INSTALL/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION $SPARK_INSTALL/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION
COPY --from=builder-base $KAFKA_INSTALL/kafka_$SCALA_VERSION-$KAFKA_VERSION $KAFKA_INSTALL/kafka_$SCALA_VERSION-$KAFKA_VERSION

# Adoptium apt key and source list, needed to install the Temurin JDK.
COPY --from=builder-base /etc/apt/keyrings/adoptium.asc /etc/apt/keyrings/adoptium.asc
COPY --from=builder-base /etc/apt/sources.list.d/adoptium.list /etc/apt/sources.list.d/adoptium.list

# The link targets are relative, so each one resolves within the directory
# that holds the link: SPARK_HOME is $SPARK_INSTALL/spark and KAFKA_HOME is
# $KAFKA_INSTALL/kafka.
RUN ln -s "spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION" "$SPARK_HOME" && \
    ln -s "kafka_$SCALA_VERSION-$KAFKA_VERSION" "$KAFKA_HOME" && \
    apt-get update && apt-get install -y --no-install-recommends temurin-8-jdk tini && \
    ipython kernel install --name "$PYSPARK_KERNEL_NAME" && \
    # Clean up some crap, like the 50MB of JDK source and various documentation.
    # The glob stands in for the architecture suffix (e.g. amd64) so that the
    # cleanup is not tied to a single platform.
    rm -rf /usr/lib/jvm/temurin-8-jdk-*/src.zip /usr/lib/jvm/temurin-8-jdk-*/sample /usr/lib/jvm/temurin-8-jdk-*/man && \
    rm -rf "$SPARK_HOME/examples" "$KAFKA_HOME/site-docs" && \
    # Clear out as many caches as possible. apt-get is used rather than apt
    # because apt's command-line interface is not meant for scripts.
    apt-get -y --purge autoremove && \
    apt-get clean -y && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf /tmp/* && \
    rm -rf /var/cache/* && \
    rm -rf /root/.cache
    #  && \
    # # Technically dodgy, but it's not as if we're going to mess with packages later...
    # rm -rf /var/lib/dpkg/info

# Startup scripts, one per service this image can run.
COPY start-master.sh start-worker.sh start-jupyter.sh start-kafka.sh /usr/local/bin/

# Configuration files. The trailing slash marks each destination as a
# directory, so the file is always placed inside it under its own name.
COPY spark-defaults.conf $SPARK_HOME/conf/
COPY server.properties $KAFKA_HOME/config/

# PySpark kernel, based on the template from
# <https://github.com/Anchormen/pyspark-jupyter-kernels>.
COPY kernel.json $KERNELS_DIR_PATH/$PYSPARK_KERNEL_NAME/

# Spark doesn't seem to respond directly to SIGTERM as the exit status is
# for SIGKILL (137), after a pause. Presumably docker compose down times out.
# Using tini gives immediate exit with status 143 (SIGTERM).
ENTRYPOINT ["/usr/bin/tini", "--"]

# Default command; override it in your docker-compose.yml or at the
# command line to choose which service the container runs.
# Spark master (default): /usr/local/bin/start-master.sh
# Spark worker: /usr/local/bin/start-worker.sh
# PySpark: /usr/local/bin/start-jupyter.sh
# Kafka: /usr/local/bin/start-kafka.sh
CMD ["/usr/local/bin/start-master.sh"]
# debugging
# CMD ["bash"]