diff --git a/spark-pyspark-kafka/Dockerfile b/spark-pyspark-kafka/Dockerfile
index 9e25e3b..35067f0 100644
--- a/spark-pyspark-kafka/Dockerfile
+++ b/spark-pyspark-kafka/Dockerfile
@@ -3,7 +3,7 @@
 # the same underlying infrastructure. It’s still just as easy to set up
 # and run the various services via Docker Compose.
 
-FROM python:3.6-slim-buster as python-base
+FROM python:3.11-slim-buster as python-base
 # I originally used python:3.6-alpine but dropped it after getting tied
 # in knots with Python dependencies around numpy. Pip under Alpine builds
 # all modules from source. You can try to be clever and install the bigger
@@ -35,18 +35,20 @@
     GRAPHFRAMES_MIRROR="https://repos.spark-packages.org" \
     KAFKA_MIRROR="https://archive.apache.org/dist" \
     # Spark
-    SPARK_VERSION="2.4.8" \
+    SPARK_MAJOR_VERSION="3.2" \
+    SPARK_MINOR_VERSION="4" \
     HADOOP_VERSION="2.7" \
     SPARK_INSTALL="/usr/local" \
     SPARK_HOSTNAME="localhost" \
     SPARK_MASTER_PORT="7077" \
     SPARK_MASTER_WEBUI_PORT="8080" \
+    # Scala
+    SCALA_VERSION="2.12" \
     # graphframes
-    GRAPHFRAMES_VERSION="0.8.1-spark2.4-s_2.11" \
+    GRAPHFRAMES_VERSION="0.8.2" \
     # Kafka
     KAFKA_INSTALL="/usr/local" \
-    KAFKA_VERSION="2.2.2" \
-    SCALA_VERSION="2.11" \
+    KAFKA_VERSION="3.5.1" \
     # PySpark
     PYSPARK_KERNEL_NAME="PySpark" \
     PYSPARK_PYTHON="/usr/local/bin/python" \
@@ -57,6 +59,7 @@
 # can't reference prior environment variables in the same ENV block.
 ENV \
     # Spark
+    SPARK_VERSION="$SPARK_MAJOR_VERSION.$SPARK_MINOR_VERSION" \
     SPARK_HOME="$SPARK_INSTALL/spark" \
     SPARK_MASTER="spark://$SPARK_HOSTNAME:$SPARK_MASTER_PORT" \
     # Kafka
@@ -72,7 +75,8 @@
 # builder-base is used to build dependencies
 FROM python-base as builder-base
 
-ARG BUILD_PACKAGES="wget gnupg software-properties-common apt-transport-https"
+ARG BUILD_PACKAGES="wget gnupg software-properties-common apt-transport-https gcc python-dev"
+# need gcc, python-dev to build psutil (dependency of something in the pip install)
 
 ENV PATH="${VENV_PATH}/bin:$PATH"
 
@@ -94,7 +98,7 @@
     ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION spark && \
     # Download and install graphframes
     cd $SPARK_INSTALL/spark/jars && \
-    wget -q --show-progress --progress=bar:force:noscroll $GRAPHFRAMES_MIRROR/graphframes/graphframes/$GRAPHFRAMES_VERSION/graphframes-$GRAPHFRAMES_VERSION.jar 2>&1 ; \
+    wget -q --show-progress --progress=bar:force:noscroll $GRAPHFRAMES_MIRROR/graphframes/graphframes/$GRAPHFRAMES_VERSION-spark$SPARK_MAJOR_VERSION-s_$SCALA_VERSION/graphframes-$GRAPHFRAMES_VERSION-spark$SPARK_MAJOR_VERSION-s_$SCALA_VERSION.jar 2>&1 ; \
     # Download, install, and symlink Kafka
     cd $SPARK_INSTALL && \
     wget -q --show-progress --progress=bar:force:noscroll $KAFKA_MIRROR/kafka/$KAFKA_VERSION/kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz 2>&1 && \
@@ -110,7 +114,8 @@
 
 COPY --from=builder-base $VENV_PATH $VENV_PATH
 
-ENV PATH="${VENV_PATH}/bin:$PATH"
+ENV PATH="${VENV_PATH}/bin:$PATH" \
+    PYTHONSTARTUP="$SPARK_HOME/python/pyspark/startup.py"
 
 COPY --from=builder-base $SPARK_INSTALL/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION $SPARK_INSTALL/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION
 COPY --from=builder-base $SPARK_INSTALL/kafka_$SCALA_VERSION-$KAFKA_VERSION $SPARK_INSTALL/kafka_$SCALA_VERSION-$KAFKA_VERSION
@@ -121,10 +126,11 @@
 RUN cd $SPARK_INSTALL && \
     ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION spark && \
     ln -s kafka_$SCALA_VERSION-$KAFKA_VERSION kafka && \
-    apt-get update && apt-get install -y --no-install-recommends temurin-8-jdk tini && \
+    apt-get update && apt-get install -y --no-install-recommends temurin-11-jdk tini procps && \
+    pip install findspark && \
     ipython kernel install --name "$PYSPARK_KERNEL_NAME" && \
     # Clean up some crap, like the 50MB of JDK source and various documentation.
-    rm -rf /usr/lib/jvm/temurin-8-jdk-amd64/src.zip /usr/lib/jvm/temurin-8-jdk-amd64/sample /usr/lib/jvm/temurin-8-jdk-amd64/man && \
+    rm -rf /usr/lib/jvm/temurin-11-jdk-amd64/src.zip /usr/lib/jvm/temurin-11-jdk-amd64/sample /usr/lib/jvm/temurin-11-jdk-amd64/man && \
     rm -rf $SPARK_HOME/examples $KAFKA_HOME/site-docs && \
     # Clear out as many caches as possible.
     apt -y --purge autoremove && \
@@ -139,6 +145,7 @@
 
 # Startup scripts
 COPY start-master.sh start-worker.sh start-jupyter.sh start-kafka.sh /usr/local/bin/
+#COPY startup.py $SPARK_HOME/python/pyspark
 
 # Configuration files
 COPY spark-defaults.conf $SPARK_HOME/conf
@@ -146,7 +153,7 @@
 
 # PySpark kernel, based on the template from
 # .
-COPY kernel.json $KERNELS_DIR_PATH/$PYSPARK_KERNEL_NAME/
+COPY kernel.new.json $KERNELS_DIR_PATH/$PYSPARK_KERNEL_NAME/kernel.json
 
 # Spark doesn't seem to respond directly to SIGTERM as the exit status is
 # for SIGKILL (137), after a pause. Presumably docker compose down times out.
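The diff points PYTHONSTARTUP at a startup.py under $SPARK_HOME/python/pyspark and adds findspark to the final image, so a plain interactive python session can come up with Spark already importable. The actual startup.py is not part of this diff (its COPY line is still commented out), so the sketch below is only a guess at what such a script might contain; the master URL and app name are assumptions lifted from the Dockerfile defaults, not the real file.

```python
# Hypothetical sketch of a PYTHONSTARTUP script for this image. The real
# startup.py is not shown in the diff, so everything here is illustrative.
import findspark

# findspark reads SPARK_HOME and prepends Spark's python/ and py4j libraries
# to sys.path, so `import pyspark` works in a plain interactive interpreter.
findspark.init()

from pyspark.sql import SparkSession

# Attach to the standalone master the image advertises via SPARK_MASTER
# (spark://localhost:7077 with the defaults above).
spark = (
    SparkSession.builder
    .master("spark://localhost:7077")
    .appName("pyspark-shell")  # placeholder app name
    .getOrCreate()
)
sc = spark.sparkContext
```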