diff --git a/spark-pyspark-kafka/Dockerfile b/spark-pyspark-kafka/Dockerfile
index 421ae46..9e25e3b 100644
--- a/spark-pyspark-kafka/Dockerfile
+++ b/spark-pyspark-kafka/Dockerfile
@@ -3,7 +3,7 @@
 # the same underlying infrastructure. It’s still just as easy to set up
 # and run the various services via Docker Compose.
 
-FROM python:3.6-slim-buster
+FROM python:3.6-slim-buster as python-base
 # I originally used python:3.6-alpine but dropped it after getting tied
 # in knots with Python dependencies around numpy. Pip under Alpine builds
 # all modules from source. You can try to be clever and install the bigger
@@ -24,8 +24,12 @@
 # ARG NB_USER="pyspark"
 # ARG NB_UID="1000"
 # ARG NB_GID="1000"
-
-ENV \
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=off \
+    PIP_DISABLE_PIP_VERSION_CHECK=on \
+    PIP_DEFAULT_TIMEOUT=100 \
+    VENV_PATH="/opt/env" \
     # download mirrors
     SPARK_MIRROR="https://archive.apache.org/dist" \
     GRAPHFRAMES_MIRROR="https://repos.spark-packages.org" \
@@ -47,10 +51,7 @@
     PYSPARK_KERNEL_NAME="PySpark" \
     PYSPARK_PYTHON="/usr/local/bin/python" \
     KERNELS_TEMPLATE_PATH="/tmp" \
-    KERNELS_DIR_PATH="/root/.local/share/jupyter/kernels" \
-    # Miscellaneous
-    BUILD_PACKAGES="wget gnupg software-properties-common" \
-    PURGE_PACKAGES="$BUILD_PACKAGES readline-common libreadline7 netbase libgdbm6"
+    KERNELS_DIR_PATH="/root/.local/share/jupyter/kernels"
 
 # These environment variables need to be defined separately because you
 # can't reference prior environment variables in the same ENV block.
@@ -63,28 +64,34 @@
     # PySpark
     PYSPARK_DRIVER_PYTHON="$PYSPARK_PYTHON"
 
-# More PySpark stuff
+# ditto
 ENV PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip" \
-    PYSPARK_SUBMIT_ARGS="--master $SPARK_MASTER pyspark-shell" \
-    PYTHONSTARTUP="$SPARK_HOME/python/pyspark/shell.py"
+    PYTHONSTARTUP="$SPARK_HOME/python/pyspark/shell.py" \
+    PYSPARK_SUBMIT_ARGS="--master $SPARK_MASTER pyspark-shell"
+
+# builder-base is used to build dependencies
+FROM python-base as builder-base
+
+ARG BUILD_PACKAGES="wget gnupg software-properties-common apt-transport-https"
+
+ENV PATH="${VENV_PATH}/bin:$PATH"
 
 # Coalescing the RUNs saves quite a significant amount of space
 # (about 10% just for Spark alone).
 #
 RUN apt-get update && \
-    # Miscellaneous infrastructure
-    apt-get install -y --no-install-recommends tini $BUILD_PACKAGES ; \
-    # Install AdoptOpenJDK 8
-    #
-    wget -qO - https://adoptopenjdk.jfrog.io/adoptopenjdk/api/gpg/key/public | apt-key add - && \
-    add-apt-repository --yes https://adoptopenjdk.jfrog.io/adoptopenjdk/deb/ && \
-    apt-get update && apt-get install -y --no-install-recommends adoptopenjdk-8-hotspot ; \
+    apt-get install -y --no-install-recommends $BUILD_PACKAGES && \
+    # Add Temurin key and source:
+    mkdir -p /etc/apt/keyrings && \
+    wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | tee /etc/apt/keyrings/adoptium.asc && \
+    echo "deb [signed-by=/etc/apt/keyrings/adoptium.asc] https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list && \
+    # Create Python virtual environment
+    python3 -m venv $VENV_PATH && \
     # Download, install, and symlink spark
     cd $SPARK_INSTALL && \
     wget -q --show-progress --progress=bar:force:noscroll $SPARK_MIRROR/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz 2>&1 && \
     tar xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz && \
     ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION spark && \
-    rm -f spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz ; \
     # Download and install graphframes
     cd $SPARK_INSTALL/spark/jars && \
     wget -q --show-progress --progress=bar:force:noscroll $GRAPHFRAMES_MIRROR/graphframes/graphframes/$GRAPHFRAMES_VERSION/graphframes-$GRAPHFRAMES_VERSION.jar 2>&1 ; \
@@ -92,23 +99,43 @@
     cd $SPARK_INSTALL && \
     wget -q --show-progress --progress=bar:force:noscroll $KAFKA_MIRROR/kafka/$KAFKA_VERSION/kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz 2>&1 && \
     tar xzf kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz && \
-    ln -s kafka_$SCALA_VERSION-$KAFKA_VERSION kafka && \
-    rm -f kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz ; \
     # Install required modules for PySpark and install kernel
     pip install --upgrade pip && \
-    pip install jupyterlab kafka-python graphframes && \
+    pip install jupyterlab kafka-python graphframes
         # Tornado 6 breaks sparkmonitor
         # tornado==5.1 \
         # sparkmonitor \
-    ipython kernel install --name "$PYSPARK_KERNEL_NAME" ; \
-    # Clear out all the unnecessary crap
-    apt-get remove -y --purge $PURGE_PACKAGES && \
-    apt -y autoremove && \
+
+FROM python-base as production
+
+COPY --from=builder-base $VENV_PATH $VENV_PATH
+
+ENV PATH="${VENV_PATH}/bin:$PATH"
+
+COPY --from=builder-base $SPARK_INSTALL/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION $SPARK_INSTALL/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION
+COPY --from=builder-base $SPARK_INSTALL/kafka_$SCALA_VERSION-$KAFKA_VERSION $SPARK_INSTALL/kafka_$SCALA_VERSION-$KAFKA_VERSION
+
+COPY --from=builder-base /etc/apt/keyrings/adoptium.asc /etc/apt/keyrings/adoptium.asc
+COPY --from=builder-base /etc/apt/sources.list.d/adoptium.list /etc/apt/sources.list.d/adoptium.list
+
+RUN cd $SPARK_INSTALL && \
+    ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION spark && \
+    ln -s kafka_$SCALA_VERSION-$KAFKA_VERSION kafka && \
+    apt-get update && apt-get install -y --no-install-recommends temurin-8-jdk tini && \
+    ipython kernel install --name "$PYSPARK_KERNEL_NAME" && \
+    # Clean up some crap, like the 50MB of JDK source and various documentation.
+    rm -rf /usr/lib/jvm/temurin-8-jdk-amd64/src.zip /usr/lib/jvm/temurin-8-jdk-amd64/sample /usr/lib/jvm/temurin-8-jdk-amd64/man && \
+    rm -rf $SPARK_HOME/examples $KAFKA_HOME/site-docs && \
+    # Clear out as many caches as possible.
+    apt -y --purge autoremove && \
     apt-get clean -y && \
     rm -rf /var/lib/apt/lists/* && \
     rm -rf /tmp/* && \
     rm -rf /var/cache/* && \
     rm -rf /root/.cache
+    # && \
+    # # Technically dodgy, but it's not as if we're going to mess with packages later...
+    # rm -rf /var/lib/dpkg/info
 
 # Startup scripts
 COPY start-master.sh start-worker.sh start-jupyter.sh start-kafka.sh /usr/local/bin/
@@ -122,7 +149,7 @@
 COPY kernel.json $KERNELS_DIR_PATH/$PYSPARK_KERNEL_NAME/
 
 # Spark doesn't seem to respond directly to SIGTERM as the exit status is
-# for SIGKILL (137), after a pause. Presumably docker-compose down times out.
+# for SIGKILL (137), after a pause. Presumably docker compose down times out.
 # Using tini gives immediate exit with status 143 (SIGTERM).
 ENTRYPOINT ["/usr/bin/tini", "--"]
 
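The stages introduced above (python-base, builder-base, production) can also be built individually, which is handy when debugging the downloads without waiting for the full image. A rough sketch, assuming BuildKit is enabled; the image tags are illustrative only and not defined anywhere in the repo:

    # Build the final image; "production" is the last stage, so it is the default target.
    docker build -t spark-pyspark-kafka .

    # Build only the builder stage, e.g. to inspect the Spark/Kafka downloads or the venv.
    docker build --target builder-base -t spark-pyspark-kafka:builder .

    # Rough size comparison between the builder and production images.
    docker image ls | grep spark-pyspark-kafka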