diff --git a/spark/Dockerfile b/spark/Dockerfile
index 480abf9..23fe392 100644
--- a/spark/Dockerfile
+++ b/spark/Dockerfile
@@ -1,23 +1,33 @@
 FROM python:3.6-alpine
 
-ENV SPARK_VERSION="2.4.3" \
+# Spark 2.4.x is an archived release: dlcdn.apache.org only carries current
+# releases, so the tarball is fetched from the Apache archive instead.
+ENV SPARK_VERSION="2.4.8" \
     HADOOP_VERSION="2.7" \
+    GRAPHFRAMES_VERSION="0.8.1-spark2.4-s_2.11" \
+    APACHE_MIRROR="https://archive.apache.org/dist" \
     SPARK_INSTALL="/usr/local"
 
 RUN apk add --no-cache \
     bash \
-    openjdk8 \
-    tini \
-    zeromq
+    openjdk8-jre \
+    tini
 
-RUN apk add --no-cache --virtual .fetch-deps \
-    curl \
-    tar
-
-RUN curl -s https://www-us.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz | tar -xz -C $SPARK_INSTALL && \
-    cd $SPARK_INSTALL && ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION spark
-
-RUN apk del .fetch-deps
+# Install the fetch tools, download and symlink Spark, add the GraphFrames jar,
+# and remove the tools and caches again in ONE layer: anything deleted in a
+# later RUN would still occupy space in the layer that created it.
+RUN apk add --no-cache --virtual .fetch-deps \
+        tar \
+        wget && \
+    cd "$SPARK_INSTALL" && \
+    wget -q --show-progress --progress=bar:force:noscroll "$APACHE_MIRROR/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz" 2>&1 && \
+    tar xzf "spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz" && \
+    ln -s "spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION" spark && \
+    rm -f "spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION.tgz" && \
+    cd "$SPARK_INSTALL/spark/jars" && \
+    wget -q --show-progress --progress=bar:force:noscroll "https://repos.spark-packages.org/graphframes/graphframes/$GRAPHFRAMES_VERSION/graphframes-$GRAPHFRAMES_VERSION.jar" && \
+    apk del .fetch-deps && \
+    rm -rf /tmp/* /var/cache/* /root/.cache
 
 COPY start-master.sh start-worker.sh /usr/local/bin/
 