diff --git a/spark-pyspark-kafka/Dockerfile b/spark-pyspark-kafka/Dockerfile
new file mode 100644
index 0000000..ec8531b
--- /dev/null
+++ b/spark-pyspark-kafka/Dockerfile
@@ -0,0 +1,128 @@
+# Merging all the Spark-related tools into one image saves a LOT of space
+# over having separate images for each, especially given they all share
+# the same underlying infrastructure. It's still just as easy to set up
+# and run the various services via docker-compose.
+
+FROM python:3.6-slim-buster
+# I originally used python:3.6-alpine but dropped it after getting tied
+# in knots with Python dependencies around numpy. Pip under Alpine builds
+# all modules from source. You can try to be clever and install the bigger
+# modules like numpy via apk, but the problem with numpy is that the Python
+# version most likely will install a different version of numpy from what
+# the base Alpine image supplies pre-built, so you end up building numpy
+# from source regardless, which takes forever (well, 10 minutes ;). I tried
+# to figure out a combination that worked without success.
+#
+# Going to a later version of Python would probably resolve this, but that
+# would require moving to Spark 3 (2.4 doesn't support 3.8 or later), and
+# we don't have time to test that at the moment. Which is why this is still
+# using Python 3.6: if it ain't broke...
+
+ENV \
+    # download mirrors
+    SPARK_MIRROR="https://dlcdn.apache.org" \
+    GRAPHFRAMES_MIRROR="https://repos.spark-packages.org" \
+    KAFKA_MIRROR="https://archive.apache.org/dist" \
+    # Spark
+    SPARK_VERSION="2.4.8" \
+    HADOOP_VERSION="2.7" \
+    SPARK_INSTALL="/usr/local" \
+    SPARK_HOSTNAME="localhost" \
+    SPARK_MASTER_PORT="7077" \
+    SPARK_MASTER_WEBUI_PORT="8080" \
+    # graphframes
+    GRAPHFRAMES_VERSION="0.8.1-spark2.4-s_2.11" \
+    # Kafka
+    KAFKA_VERSION="2.2.2" \
+    SCALA_VERSION="2.11" \
+    # PySpark
+    PYSPARK_KERNEL_NAME="PySpark" \
+    PYSPARK_PYTHON="/usr/local/bin/python" \
+    KERNELS_TEMPLATE_PATH="/tmp" \
+    KERNELS_DIR_PATH="/root/.local/share/jupyter/kernels" \
+    # Miscellaneous
+    BUILD_PACKAGES="wget gnupg software-properties-common" \
+    PURGE_PACKAGES="$BUILD_PACKAGES readline-common libreadline7 netbase libgdbm6"
+
+# These environment variables need to be defined separately because you
+# can't reference prior environment variables in the same ENV block.
+ENV \
+    # Spark
+    SPARK_HOME="$SPARK_INSTALL/spark" \
+    SPARK_MASTER="spark://$SPARK_HOSTNAME:$SPARK_MASTER_PORT" \
+    # Kafka
+    KAFKA_HOME="$SPARK_INSTALL/kafka" \
+    # PySpark
+    PYSPARK_DRIVER_PYTHON="$PYSPARK_PYTHON"
+
+# More PySpark stuff. Note that "pyspark-shell" has to come last in
+# PYSPARK_SUBMIT_ARGS: spark-submit treats it as the primary resource,
+# so any options after it would be ignored.
+ENV PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip" \
+    PYSPARK_SUBMIT_ARGS="--master $SPARK_MASTER --py-files $SPARK_HOME/jars/graphframes-$GRAPHFRAMES_VERSION.jar pyspark-shell" \
+    PYTHONSTARTUP="$SPARK_HOME/python/pyspark/shell.py"
+
+# Coalescing the RUNs saves quite a significant amount of space
+# (about 10% just for Spark alone).
+#
+RUN apt-get update && \
+    # Miscellaneous infrastructure
+    apt-get install -y --no-install-recommends tini $BUILD_PACKAGES ; \
+    # Install AdoptOpenJDK 8
+    #
+    wget -qO - https://adoptopenjdk.jfrog.io/adoptopenjdk/api/gpg/key/public | apt-key add - && \
+    add-apt-repository --yes https://adoptopenjdk.jfrog.io/adoptopenjdk/deb/ && \
+    apt-get update && apt-get install -y adoptopenjdk-8-hotspot ; \
+    # Download, install, and symlink Spark
+    cd $SPARK_INSTALL && \
+    wget -q --show-progress --progress=bar:force:noscroll $SPARK_MIRROR/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz 2>&1 && \
+    tar xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz && \
+    ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION spark && \
+    rm -f spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz ; \
+    # Download and install graphframes
+    cd $SPARK_INSTALL/spark/jars && \
+    wget -q --show-progress --progress=bar:force:noscroll $GRAPHFRAMES_MIRROR/graphframes/graphframes/$GRAPHFRAMES_VERSION/graphframes-$GRAPHFRAMES_VERSION.jar ; \
+    # Download, install, and symlink Kafka
+    cd $SPARK_INSTALL && \
+    wget -q --show-progress --progress=bar:force:noscroll $KAFKA_MIRROR/kafka/$KAFKA_VERSION/kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz 2>&1 && \
+    tar xzf kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz && \
+    ln -s kafka_$SCALA_VERSION-$KAFKA_VERSION kafka && \
+    rm -f kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz ; \
+    # Install the required modules for PySpark and install the kernel
+    pip install --upgrade pip && \
+    pip install jupyter kafka-python graphframes && \
+    # Tornado 6 breaks sparkmonitor
+    #     tornado==5.1 \
+    #     sparkmonitor \
+    ipython kernel install --name "$PYSPARK_KERNEL_NAME" ; \
+    # Clear out all the unnecessary crap
+    apt-get remove -y --purge $PURGE_PACKAGES && \
+    apt-get -y autoremove && \
+    apt-get clean -y && \
+    rm -rf /var/lib/apt/lists/* && \
+    rm -rf /tmp/* && \
+    rm -rf /var/cache/* && \
+    rm -rf /root/.cache
+
+# Startup scripts
+COPY start-master.sh start-worker.sh start-kafka.sh /usr/local/bin/
+
+# Configuration files
+COPY spark-defaults.conf $SPARK_HOME/conf
+COPY server.properties $KAFKA_HOME/config/
+
+# PySpark kernel, based on the template from
+# .
+COPY kernel.json $KERNELS_DIR_PATH/$PYSPARK_KERNEL_NAME/
+
+# Spark doesn't seem to respond directly to SIGTERM as the exit status is
+# for SIGKILL (137), after a pause. Presumably docker-compose down times out.
+# Using tini gives immediate exit with status 143 (SIGTERM).
+ENTRYPOINT ["/usr/bin/tini", "--"]
+
+# Override this in your docker-compose.yml or at the command line.
+# Spark master (default): /usr/local/bin/start-master.sh
+# Spark worker: /usr/local/bin/start-worker.sh
+# PySpark: /usr/local/bin/jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root
+# Kafka: /usr/local/bin/start-kafka.sh
+CMD ["/usr/local/bin/start-master.sh"]
+# debugging
+# CMD ["bash"]
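
The closing CMD comments list the command each service needs, and the opening comment mentions running everything via docker-compose. As a rough illustration only, a minimal docker-compose.yml along those lines might look like the sketch below. The service names, the SPARK_HOSTNAME override, and the Kafka port mapping are assumptions: the actual behaviour depends on start-master.sh, start-worker.sh, start-kafka.sh, and server.properties, none of which appear in this diff.

version: "3"

services:
  spark-master:
    build: ./spark-pyspark-kafka
    image: spark-pyspark-kafka
    # The image's default CMD already runs start-master.sh.
    ports:
      - "7077:7077"   # SPARK_MASTER_PORT
      - "8080:8080"   # SPARK_MASTER_WEBUI_PORT

  spark-worker:
    image: spark-pyspark-kafka
    command: /usr/local/bin/start-worker.sh
    environment:
      # Assumption: start-worker.sh derives the master URL from SPARK_HOSTNAME
      # at runtime rather than using the SPARK_MASTER value baked in at build
      # time (which was expanded with "localhost").
      SPARK_HOSTNAME: spark-master
    depends_on:
      - spark-master

  pyspark:
    image: spark-pyspark-kafka
    command: /usr/local/bin/jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root
    ports:
      - "8888:8888"   # Jupyter notebook
    depends_on:
      - spark-master

  kafka:
    image: spark-pyspark-kafka
    command: /usr/local/bin/start-kafka.sh
    ports:
      - "9092:9092"   # Kafka's stock listener port; the real value comes from server.properties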