# syntax=docker/dockerfile:1
# Standalone Apache Spark master/worker image on python:3.6-alpine.
# Defaults to running the master; override CMD with start-worker.sh for workers.
FROM python:3.6-alpine

ENV SPARK_VERSION="2.4.3" \
    HADOOP_VERSION="2.7" \
    SPARK_INSTALL="/usr/local"

# Runtime packages. Versions are left unpinned deliberately: python:3.6-alpine
# tracks an old Alpine release whose package revisions rotate off the mirrors,
# so exact pins (pkg=x.y.z-rN) would make the build bit-rot faster, not slower.
RUN apk add --no-cache \
        bash \
        openjdk8 \
        tini \
        zeromq

# Make pipe failures fail the RUN below (hadolint DL4006); the default
# /bin/sh -c would mask a failed curl and let tar unpack garbage.
SHELL ["/bin/ash", "-o", "pipefail", "-c"]

# Fetch, unpack, and symlink Spark in a single layer so the temporary fetch
# tools are removed in the SAME layer that installed them — an `apk del` in a
# later RUN would not shrink the image.
# archive.apache.org carries all historical releases; the old
# www-us.apache.org/dist mirror only hosted current releases and was retired,
# so 2.4.3 is no longer reachable there.
RUN apk add --no-cache --virtual .fetch-deps \
        curl \
        tar \
    && curl -fsSL "https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz" \
        | tar -xz -C "$SPARK_INSTALL" \
    && ln -s "spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION" "$SPARK_INSTALL/spark" \
    && apk del .fetch-deps

COPY start-master.sh start-worker.sh /usr/local/bin/

# These need to be separate because you can't reference prior environment
# variables in the same ENV block.
ENV SPARK_HOME="$SPARK_INSTALL/spark" \
    SPARK_HOSTNAME="localhost" \
    SPARK_MASTER_PORT="7077" \
    SPARK_MASTER_WEBUI_PORT="8080"

COPY spark-defaults.conf $SPARK_HOME/conf

ENV SPARK_MASTER="spark://$SPARK_HOSTNAME:$SPARK_MASTER_PORT"

# Documentation only (EXPOSE publishes nothing): master RPC port and web UI.
EXPOSE 7077 8080

# NOTE(review): image runs as root — consider adding a non-root USER once the
# start-*.sh scripts are confirmed not to need root-writable paths.

# Spark doesn't seem to respond directly to SIGTERM as the exit status is
# for SIGKILL (137), after a pause. Presumably docker-compose down times out.
# Using tini gives immediate exit with status 143 (SIGTERM).
ENTRYPOINT ["/sbin/tini", "--"]
CMD ["/usr/local/bin/start-master.sh"]