# docker-analytics / spark / Dockerfile
# Useful background on keeping Debian-based Docker images small:
# https://www.dajobe.org/blog/2015/04/18/making-debian-docker-images-smaller/

# Base image: Python 3.6 on the slim variant of Debian buster.
FROM python:3.6-slim-buster

# Versions and locations consumed by the install step below.
# APACHE_MIRROR points at archive.apache.org because SPARK_VERSION is pinned:
# the archive keeps every release, whereas dlcdn.apache.org only carries the
# currently supported ones, so an older pinned version eventually disappears
# from it.
ENV SPARK_VERSION="2.4.8" \
    HADOOP_VERSION="2.7" \
    GRAPHFRAMES_VERSION="0.8.1-spark2.4-s_2.11" \
    APACHE_MIRROR="https://archive.apache.org/dist" \
    SPARK_INSTALL="/usr/local"

# Install tini and AdoptOpenJDK 8, download Spark and GraphFrames, then purge
# the build-only tooling.
# Coalescing everything into one RUN saves about 68MB on the final image size
# (10%), because the cleanup happens in the same layer that created the files.
# Every step must succeed: `set -e` aborts the build at the first failing
# command instead of producing an image with a missing JDK or Spark.
RUN set -eux; \
    apt-get update; \
    apt-get install -y --no-install-recommends tini wget gnupg software-properties-common; \
    # Install AdoptOpenJDK 8
    # https://stackoverflow.com/a/59436618
    # The signing key is saved to a file before being imported: piping wget
    # into apt-key would hide a failed download, because /bin/sh only reports
    # the exit status of the last command in a pipeline.
    wget -qO /tmp/adoptopenjdk.gpg https://adoptopenjdk.jfrog.io/adoptopenjdk/api/gpg/key/public; \
    apt-key add /tmp/adoptopenjdk.gpg; \
    add-apt-repository --yes https://adoptopenjdk.jfrog.io/adoptopenjdk/deb/; \
    apt-get update; \
    apt-get install -y adoptopenjdk-8-hotspot; \
    # Download and unpack Spark, then expose it under the version-independent
    # path $SPARK_INSTALL/spark via a relative symlink
    spark_dist="spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION"; \
    cd "$SPARK_INSTALL"; \
    wget -q --show-progress --progress=bar:force:noscroll "$APACHE_MIRROR/spark/spark-$SPARK_VERSION/$spark_dist.tgz" 2>&1; \
    tar xzf "$spark_dist.tgz"; \
    ln -s "$spark_dist" spark; \
    rm -f "$spark_dist.tgz"; \
    # Download the GraphFrames jar straight into Spark's jars directory
    cd "$SPARK_INSTALL/spark/jars"; \
    wget -q --show-progress --progress=bar:force:noscroll "https://repos.spark-packages.org/graphframes/graphframes/$GRAPHFRAMES_VERSION/graphframes-$GRAPHFRAMES_VERSION.jar"; \
    # Remove the packages that were only needed during the build, then drop
    # apt metadata, the downloaded key in /tmp, and other caches
    apt-get remove -y --purge wget gnupg software-properties-common readline-common libreadline7 netbase libgdbm6; \
    apt-get -y autoremove; \
    apt-get clean -y; \
    rm -rf /var/lib/apt/lists/*; \
    rm -rf /tmp/*; \
    rm -rf /var/cache/*; \
    rm -rf /root/.cache

# Launcher scripts; start-master.sh is the image's default command.
COPY ["start-master.sh", "start-worker.sh", "/usr/local/bin/"]

# SPARK_HOME lives in its own ENV instruction: a variable assigned inside an
# ENV instruction cannot be referenced by the other assignments of that same
# instruction, so it has to build on SPARK_INSTALL from the earlier one.
ENV SPARK_HOME=${SPARK_INSTALL}/spark \
    SPARK_HOSTNAME=localhost \
    SPARK_MASTER_PORT=7077 \
    SPARK_MASTER_WEBUI_PORT=8080

# Place the Spark defaults in the distribution's conf directory. The trailing
# slash marks the destination as a directory, so the file can never end up
# written as a regular file named "conf" when that directory is missing.
COPY spark-defaults.conf $SPARK_HOME/conf/

# Master URL assembled from the SPARK_HOSTNAME and SPARK_MASTER_PORT defaults.
# Docker expands the variables when the image is built, so overriding
# SPARK_HOSTNAME or SPARK_MASTER_PORT at run time does not change this value;
# override SPARK_MASTER itself instead.
ENV SPARK_MASTER="spark://${SPARK_HOSTNAME}:${SPARK_MASTER_PORT}"

# Run the container's command under tini.
# Spark doesn't seem to respond directly to SIGTERM: without tini the exit
# status is the one for SIGKILL (137), after a pause. Presumably
# docker-compose down times out. Using tini gives an immediate exit with
# status 143 (SIGTERM).
ENTRYPOINT ["/usr/bin/tini", "--"]

# Default command, passed as the argument to the tini entrypoint above.
# start-worker.sh is copied to the same directory and can be given instead.
CMD ["/usr/local/bin/start-master.sh"]