nigel.stanger / docker-analytics
Switched standalone images to new structure

commit e1425ee16762ed569458b60c693d36265b3b713a
Nigel Stanger authored on 21 Sep 2021
Branches: master, spark3
1 parent: f094dd9

Showing 4 changed files (each file appears below in both versions from the diff):
Makefile
kafka/Dockerfile
pyspark/Dockerfile
spark/Dockerfile
Makefile
Version with wildcard prerequisites and the combined spark-pyspark-kafka target:

all: spark pyspark kafka spark-pyspark-kafka

COMBINED_FILES:=$(wildcard spark-pyspark-kafka/*)
SPARK_FILES:=$(wildcard spark/*)
PYSPARK_FILES:=$(wildcard pyspark/*)
KAFKA_FILES:=$(wildcard kafka/*)

spark-pyspark-kafka: $(COMBINED_FILES)
	docker build $(BUILD_OPTS) -t nstanger/$@:latest -f $< $@

spark: $(SPARK_FILES)
	docker build $(BUILD_OPTS) -t nstanger/$@:latest -f $< $@

# Rebuild both the following if spark changes!
pyspark: $(PYSPARK_FILES)
	docker build $(BUILD_OPTS) -t nstanger/$@:latest -f $< $@

kafka: $(KAFKA_FILES)
	docker build $(BUILD_OPTS) -t nstanger/$@:latest -f $< $@

debug:
	@echo "COMBINED_FILES = [$(COMBINED_FILES)]"
	@echo "SPARK_FILES = [$(SPARK_FILES)]"
	@echo "PYSPARK_FILES = [$(PYSPARK_FILES)]"
	@echo "KAFKA_FILES = [$(KAFKA_FILES)]"
Version with explicit per-image prerequisites:

all: spark pyspark kafka

spark: spark/Dockerfile spark/start-master.sh spark/start-worker.sh
	docker build $(BUILD_OPTS) -t nstanger/spark:latest -f $< $@

# Rebuild both the following if spark changes!
pyspark: pyspark/Dockerfile pyspark/kernel.json
	docker build $(BUILD_OPTS) -t nstanger/pyspark:latest -f $< $@

kafka: kafka/Dockerfile kafka/start-kafka.sh
	docker build $(BUILD_OPTS) -t nstanger/kafka:latest -f $< $@
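Both versions drive docker build the same way, with BUILD_OPTS available for passing extra flags through. A minimal usage sketch, assuming the images are built from the repository root (the --no-cache flag is only an illustration, not something the Makefile requires):

# Build all images defined by the Makefile
make all

# Rebuild just the Kafka image, passing extra options through to docker build
make kafka BUILD_OPTS="--no-cache"

# Wildcard version only: print the resolved prerequisite lists
make debug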
kafka/Dockerfile
Version using apt-get (Debian base):

FROM nstanger/spark

ENV KAFKA_VERSION="2.2.2" \
    SCALA_VERSION="2.11" \
    APACHE_MIRROR="https://archive.apache.org/dist" \
    KAFKA_INSTALL="/usr/local"

RUN apt-get update && \
    apt-get install -y --no-install-recommends wget ; \
    # download, install, and symlink kafka
    cd $SPARK_INSTALL && \
    wget -q --show-progress --progress=bar:force:noscroll $APACHE_MIRROR/kafka/$KAFKA_VERSION/kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz 2>&1 && \
    tar xzf kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz && \
    ln -s kafka_$SCALA_VERSION-$KAFKA_VERSION kafka && \
    rm -f kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz ; \
    # Clean up the crap
    apt-get remove -y --purge wget && \
    apt -y autoremove && \
    apt-get clean -y && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf /tmp/* && \
    rm -rf /var/cache/* && \
    rm -rf /root/.cache

# COPY start-master.sh /usr/local/bin/start-master.sh
# COPY start-worker.sh /usr/local/bin/start-worker.sh

ENV KAFKA_HOME="$KAFKA_INSTALL/kafka"
    # SPARK_HOSTNAME="127.0.0.1" \
    # SPARK_MASTER_PORT="7077" \
    # SPARK_MASTER_WEBUI_PORT="8080" \
    # PYSPARK_PYTHON="/usr/local/bin/python3"

COPY start-kafka.sh /usr/local/bin/
COPY server.properties $KAFKA_HOME/config/

# Spark doesn't seem to respond directly to SIGTERM as the exit status is
# for SIGKILL (137), after a pause. Presumably docker-compose down times out.
# Using tini gives immediate exit with status 143 (SIGTERM).
ENTRYPOINT ["/sbin/tini", "--"]
CMD ["/usr/local/bin/start-kafka.sh"]
Version using apk (Alpine base):

FROM nstanger/spark

ENV KAFKA_VERSION="2.2.2" \
    SCALA_VERSION="2.11" \
    APACHE_MIRROR="https://archive.apache.org/dist" \
    KAFKA_INSTALL="/usr/local"

RUN apk add --no-cache --virtual .fetch-deps \
    wget \
    tar

# download, install, and symlink kafka
RUN cd $SPARK_INSTALL && \
    wget -q --show-progress --progress=bar:force:noscroll $APACHE_MIRROR/kafka/$KAFKA_VERSION/kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz 2>&1 && \
    tar xzf kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz && \
    ln -s kafka_$SCALA_VERSION-$KAFKA_VERSION kafka && \
    rm -f kafka_$SCALA_VERSION-$KAFKA_VERSION.tgz

RUN apk del .fetch-deps && \
    rm -rf /tmp/* && \
    rm -rf /var/cache/* && \
    rm -rf /root/.cache

# COPY start-master.sh /usr/local/bin/start-master.sh
# COPY start-worker.sh /usr/local/bin/start-worker.sh

ENV KAFKA_HOME="$KAFKA_INSTALL/kafka"
    # SPARK_HOSTNAME="127.0.0.1" \
    # SPARK_MASTER_PORT="7077" \
    # SPARK_MASTER_WEBUI_PORT="8080" \
    # PYSPARK_PYTHON="/usr/local/bin/python3"

COPY start-kafka.sh /usr/local/bin/
COPY server.properties $KAFKA_HOME/config/

# Spark doesn't seem to respond directly to SIGTERM as the exit status is
# for SIGKILL (137), after a pause. Presumably docker-compose down times out.
# Using tini gives immediate exit with status 143 (SIGTERM).
ENTRYPOINT ["/sbin/tini", "--"]
CMD ["/usr/local/bin/start-kafka.sh"]
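Both variants install Kafka 2.2.2 under /usr/local, copy in start-kafka.sh and server.properties, and run start-kafka.sh under tini. A rough sketch of building and running the image on its own, assuming the broker listens on Kafka's default port 9092 (the actual port, and whether a separate ZooKeeper is needed, depend on server.properties and start-kafka.sh, which are not part of this diff):

# Build the image the same way the Makefile does, then run it
docker build -t nstanger/kafka:latest -f kafka/Dockerfile kafka
docker run --rm --name kafka -p 9092:9092 nstanger/kafka:latest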
pyspark/Dockerfile
Version installing the Python packages with pip only (Debian base):

# Don't use the official jupyter/pyspark-notebook image because it's massive!
FROM nstanger/spark

# Ideally we should run as a non-root user, but it's problematic to set
# up shared files, especially if running Docker within, say, VirtualBox.
# See https://vsupalov.com/docker-shared-permissions/
# ARG NB_USER="pyspark"
# ARG NB_UID="1000"
# ARG NB_GID="1000"

USER root

# RUN adduser -D -G root $NB_USER

RUN pip install --upgrade pip && \
    pip install \
        # Tornado 6 breaks sparkmonitor
        # tornado==5.1 \
        # sparkmonitor \
        jupyter kafka-python graphframes ; \
    # Clean up the crap
    rm -rf /tmp/* && \
    rm -rf /var/cache/* && \
    rm -rf /root/.cache

ENV PYSPARK_PYTHON="/usr/local/bin/python"

ENV KERNEL_NAME="PySpark" \
    KERNELS_TEMPLATE_PATH="/tmp" \
    KERNELS_DIR_PATH="/root/.local/share/jupyter/kernels" \
    PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip" \
    PYSPARK_DRIVER_PYTHON="$PYSPARK_PYTHON" \
    PYSPARK_SUBMIT_ARGS="--master $SPARK_MASTER pyspark-shell" \
    PYTHONSTARTUP="$SPARK_HOME/python/pyspark/shell.py"

# ENV SPARKMONITOR_UI_PORT 8080

# USER $NB_USER

RUN ipython kernel install --name $KERNEL_NAME

# kernel.json is based on the template from <https://github.com/Anchormen/pyspark-jupyter-kernels>.
COPY kernel.json $KERNELS_DIR_PATH/$KERNEL_NAME/

# RUN jupyter nbextension install sparkmonitor --py --symlink \
#     && jupyter nbextension enable sparkmonitor --py \
#     && jupyter serverextension enable --py sparkmonitor \
#     && ipython profile create \
#     && echo "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" >> $(ipython profile locate default)/ipython_kernel_config.py

# RUN mkdir -p /home/$NB_USER/work
# WORKDIR /home/$NB_USER/work

ENTRYPOINT ["/sbin/tini", "--"]

# The kernel will automatically set up the PySpark context when it is loaded.
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--allow-root"]

# debugging
# CMD ["bash"]
Version using apk build dependencies (Alpine base):

# Don't use the official jupyter/pyspark-notebook image because it's massive!
FROM nstanger/spark

# Ideally we should run as a non-root user, but it's problematic to set
# up shared files, especially if running Docker within, say, VirtualBox.
# See https://vsupalov.com/docker-shared-permissions/
# ARG NB_USER="pyspark"
# ARG NB_UID="1000"
# ARG NB_GID="1000"

USER root

# RUN adduser -D -G root $NB_USER

RUN apk --no-cache add \
    zeromq-dev

RUN apk add --no-cache --virtual .build-deps \
    build-base \
    libffi-dev \
    python3-dev \
    py3-wheel \
    py3-numpy

RUN pip install --upgrade \
        pip \
    && pip install \
        # Tornado 6 breaks sparkmonitor
        # tornado==5.1 \
        jupyter \
        kafka-python \
        # sparkmonitor \
        graphframes

RUN apk del .build-deps && \
    rm -rf /tmp/* && \
    rm -rf /var/cache/* && \
    rm -rf /root/.cache

ENV PYSPARK_PYTHON="/usr/local/bin/python"

ENV KERNEL_NAME="PySpark" \
    KERNELS_TEMPLATE_PATH="/tmp" \
    KERNELS_DIR_PATH="/root/.local/share/jupyter/kernels" \
    PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip" \
    PYSPARK_DRIVER_PYTHON="$PYSPARK_PYTHON" \
    PYSPARK_SUBMIT_ARGS="--master $SPARK_MASTER pyspark-shell" \
    PYTHONSTARTUP="$SPARK_HOME/python/pyspark/shell.py"

# ENV SPARKMONITOR_UI_PORT 8080

# USER $NB_USER

RUN ipython kernel install --name $KERNEL_NAME

# kernel.json is based on the template from <https://github.com/Anchormen/pyspark-jupyter-kernels>.
COPY kernel.json $KERNELS_DIR_PATH/$KERNEL_NAME/

# RUN jupyter nbextension install sparkmonitor --py --symlink \
#     && jupyter nbextension enable sparkmonitor --py \
#     && jupyter serverextension enable --py sparkmonitor \
#     && ipython profile create \
#     && echo "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" >> $(ipython profile locate default)/ipython_kernel_config.py

# RUN mkdir -p /home/$NB_USER/work
# WORKDIR /home/$NB_USER/work

ENTRYPOINT ["/sbin/tini", "--"]

# The kernel will automatically set up the PySpark context when it is loaded.
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--allow-root"]

# debugging
# CMD ["bash"]
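Both variants install Jupyter plus a PySpark kernel and start a notebook server as root on port 8888 (see the CMD). A hedged run sketch; the host port mapping and the notebook volume mount at /root/notebooks are illustrative choices, and the kernel assumes the Spark master named by $SPARK_MASTER in the base image is reachable:

docker run --rm --name pyspark -p 8888:8888 \
    -v "$PWD/notebooks:/root/notebooks" \
    nstanger/pyspark:latest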
spark/Dockerfile
Version based on python:3.6-slim-buster (Debian):

# This is useful:
# https://www.dajobe.org/blog/2015/04/18/making-debian-docker-images-smaller/
FROM python:3.6-slim-buster

ENV SPARK_VERSION="2.4.8" \
    HADOOP_VERSION="2.7" \
    GRAPHFRAMES_VERSION="0.8.1-spark2.4-s_2.11" \
    APACHE_MIRROR="https://dlcdn.apache.org" \
    SPARK_INSTALL="/usr/local"

# Coalescing the RUNs saves about 68MB on the final image size (10%)
RUN apt-get update && \
    apt-get install -y --no-install-recommends tini wget gnupg software-properties-common ; \
    # Install AdoptOpenJDK 8
    # https://stackoverflow.com/a/59436618
    wget -qO - https://adoptopenjdk.jfrog.io/adoptopenjdk/api/gpg/key/public | apt-key add - && \
    add-apt-repository --yes https://adoptopenjdk.jfrog.io/adoptopenjdk/deb/ && \
    apt-get update && apt-get install -y adoptopenjdk-8-hotspot ; \
    # Download, install, and symlink spark
    cd $SPARK_INSTALL && \
    wget -q --show-progress --progress=bar:force:noscroll $APACHE_MIRROR/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz 2>&1 && \
    tar xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz && \
    ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION spark && \
    rm -f spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz ; \
    # Download and install graphframes
    cd $SPARK_INSTALL/spark/jars && \
    wget -q --show-progress --progress=bar:force:noscroll https://repos.spark-packages.org/graphframes/graphframes/$GRAPHFRAMES_VERSION/graphframes-$GRAPHFRAMES_VERSION.jar ; \
    # Clear out all the crap
    apt-get remove -y --purge wget gnupg software-properties-common readline-common libreadline7 netbase libgdbm6 && \
    apt -y autoremove && \
    apt-get clean -y && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf /tmp/* && \
    rm -rf /var/cache/* && \
    rm -rf /root/.cache

COPY start-master.sh start-worker.sh /usr/local/bin/

# these need to be separate because you can't reference prior environment
# variables in the same ENV block
ENV SPARK_HOME="$SPARK_INSTALL/spark" \
    SPARK_HOSTNAME="localhost" \
    SPARK_MASTER_PORT="7077" \
    SPARK_MASTER_WEBUI_PORT="8080"

COPY spark-defaults.conf $SPARK_HOME/conf

ENV SPARK_MASTER="spark://$SPARK_HOSTNAME:$SPARK_MASTER_PORT"

# Spark doesn't seem to respond directly to SIGTERM as the exit status is
# for SIGKILL (137), after a pause. Presumably docker-compose down times out.
# Using tini gives immediate exit with status 143 (SIGTERM).
ENTRYPOINT ["/usr/bin/tini", "--"]
CMD ["/usr/local/bin/start-master.sh"]
Version based on python:3.6-alpine:

FROM python:3.6-alpine

ENV SPARK_VERSION="2.4.8" \
    HADOOP_VERSION="2.7" \
    GRAPHFRAMES_VERSION="0.8.1-spark2.4-s_2.11" \
    APACHE_MIRROR="https://dlcdn.apache.org" \
    SPARK_INSTALL="/usr/local"

RUN apk add --no-cache \
    bash \
    openjdk8-jre \
    tini

RUN apk add --no-cache --virtual .fetch-deps \
    wget \
    tar

# download, install, and symlink spark
RUN cd $SPARK_INSTALL && \
    wget -q --show-progress --progress=bar:force:noscroll $APACHE_MIRROR/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz 2>&1 && \
    tar xzf spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz && \
    ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION spark && \
    rm -f spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz

# download and install graphframes
RUN cd $SPARK_INSTALL/spark/jars && \
    wget -q --show-progress --progress=bar:force:noscroll https://repos.spark-packages.org/graphframes/graphframes/$GRAPHFRAMES_VERSION/graphframes-$GRAPHFRAMES_VERSION.jar

RUN apk del .fetch-deps && \
    rm -rf /tmp/* && \
    rm -rf /var/cache/* && \
    rm -rf /root/.cache

COPY start-master.sh start-worker.sh /usr/local/bin/

# these need to be separate because you can't reference prior environment
# variables in the same ENV block
ENV SPARK_HOME="$SPARK_INSTALL/spark" \
    SPARK_HOSTNAME="localhost" \
    SPARK_MASTER_PORT="7077" \
    SPARK_MASTER_WEBUI_PORT="8080"

COPY spark-defaults.conf $SPARK_HOME/conf

ENV SPARK_MASTER="spark://$SPARK_HOSTNAME:$SPARK_MASTER_PORT"

# Spark doesn't seem to respond directly to SIGTERM as the exit status is
# for SIGKILL (137), after a pause. Presumably docker-compose down times out.
# Using tini gives immediate exit with status 143 (SIGTERM).
ENTRYPOINT ["/sbin/tini", "--"]
CMD ["/usr/local/bin/start-master.sh"]
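Both variants copy start-master.sh and start-worker.sh into /usr/local/bin, default to a master on port 7077 with the web UI on 8080, and start the master via CMD. A sketch of running a master plus one worker; the user-defined network, the container names, and the assumption that start-worker.sh reads the master URL from $SPARK_MASTER are illustrative, since the scripts themselves are not shown in this commit:

# so the containers can resolve each other by name
docker network create spark-net

# master: expose the web UI (8080) and the master port (7077)
docker run -d --name spark-master --network spark-net \
    -p 8080:8080 -p 7077:7077 \
    -e SPARK_HOSTNAME=spark-master \
    nstanger/spark:latest

# worker: override the default CMD to run the worker script instead
docker run -d --name spark-worker --network spark-net \
    -e SPARK_MASTER=spark://spark-master:7077 \
    nstanger/spark:latest /usr/local/bin/start-worker.sh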