# docker-analytics / spark / Dockerfile
FROM python:3.6-alpine3.9

ENV SPARK_VERSION=2.4.3 \
    SPARK_INSTALL=/usr/local \
    HADOOP_VERSION=2.7

# Runtime packages: bash for the start scripts, openjdk8 to run Spark,
# tini as init (see ENTRYPOINT), zeromq — presumably for PySpark
# messaging; confirm against the start scripts.
RUN apk add --no-cache \
        bash \
        openjdk8 \
        tini \
        zeromq

# Fail the build if any stage of the download pipeline fails; under plain
# /bin/sh the pipe would mask curl's exit status (hadolint DL4006).
SHELL ["/bin/ash", "-o", "pipefail", "-c"]

# Fetch, unpack, and symlink Spark, and remove the build-only fetch tools,
# all in ONE layer — an `apk del` in a later layer would not shrink the
# image. archive.apache.org hosts every release; the www-us.apache.org/dist
# mirror carries only current versions and 404s for older ones like 2.4.3.
# curl -f makes HTTP errors fail the pipeline instead of feeding tar junk.
RUN apk add --no-cache --virtual .fetch-deps \
        curl \
        tar \
    && curl -fsSL "https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz" \
        | tar -xz -C "$SPARK_INSTALL" \
    && ln -s "$SPARK_INSTALL/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION" "$SPARK_INSTALL/spark" \
    && apk del .fetch-deps

COPY start-master.sh /usr/local/bin/start-master.sh
COPY start-worker.sh /usr/local/bin/start-worker.sh

ENV SPARK_HOME=$SPARK_INSTALL/spark \
    SPARK_LOCAL_IP=127.0.0.1 \
    SPARK_MASTER_PORT=7077 \
    SPARK_MASTER_WEBUI_PORT=8080 \
    PYSPARK_PYTHON=/usr/local/bin/python3

# Documentation only (ports are not published by EXPOSE): master RPC port
# and master web UI, matching the ENV values above.
EXPOSE 7077 8080

# Spark doesn't seem to respond directly to SIGTERM as the exit status is
# for SIGKILL (137), after a pause. Presumably docker-compose down times out.
# Using tini gives immediate exit with status 143 (SIGTERM).
ENTRYPOINT ["/sbin/tini", "--"]

# Default role is master; worker containers override CMD with
# /usr/local/bin/start-worker.sh.
CMD ["/usr/local/bin/start-master.sh"]