Revamped in line with Mark’s approach
• switched to direct install of graphframes (closes #1)
1 parent 2868c44 commit ba71b8b37456c752dba7a3ef5a969e27b915ea11
Nigel Stanger authored on 20 Sep 2021
Showing 1 changed file
View
40
spark/Dockerfile
# Base image: Python 3.6 on Alpine Linux.
FROM python:3.6-alpine
 
# Versions and locations consumed by the download steps in this file.
# SPARK_VERSION is assigned exactly once; a second "ENV SPARK_VERSION=..."
# line inside the same continued instruction makes the ENV malformed.
# NOTE(review): dlcdn.apache.org is understood to carry only currently
# supported releases, with older ones moved to https://archive.apache.org/dist
# -- confirm that the Spark 2.4.8 URL built from APACHE_MIRROR still resolves.
ENV SPARK_VERSION="2.4.8" \
    HADOOP_VERSION="2.7" \
    GRAPHFRAMES_VERSION="0.8.1-spark2.4-s_2.11" \
    APACHE_MIRROR="https://dlcdn.apache.org" \
    SPARK_INSTALL="/usr/local"
 
# Runtime packages that stay in the final image:
#   bash         - shell (presumably required by the start scripts; confirm)
#   openjdk8-jre - Java 8 runtime only, in place of the full openjdk8 JDK
#   tini         - minimal init
# zeromq is no longer installed. The superseded openjdk8/tini/zeromq lines
# are removed so the package list forms a single, valid RUN instruction.
RUN apk add --no-cache \
    bash \
    openjdk8-jre \
    tini
 
# Build-time download tools, grouped under the virtual package .fetch-deps
# so that a single "apk del .fetch-deps" can remove them all again.
# NOTE(review): curl looks necessary only for a curl-based download; if every
# download in this file goes through wget, curl can be dropped from this list.
RUN apk add --no-cache --virtual .fetch-deps \
    curl \
    tar \
    wget
 
# Download, install, and symlink Spark.
# The tarball is fetched from $APACHE_MIRROR into $SPARK_INSTALL, unpacked,
# exposed under the stable path $SPARK_INSTALL/spark, and deleted in this same
# RUN so the archive never persists in an image layer.
# The earlier "curl | tar" download of the same release is removed: it
# duplicated this step, a failed curl was not detected through the pipe, and
# its "ln -s" created the spark link first, so the second "ln -s" ended up
# inside the Spark directory.
# "2>&1" folds wget's progress output (written to stderr) into stdout.
RUN cd "$SPARK_INSTALL" && \
    spark_dist="spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION" && \
    wget -q --show-progress --progress=bar:force:noscroll "$APACHE_MIRROR/spark/spark-$SPARK_VERSION/${spark_dist}.tgz" 2>&1 && \
    tar xzf "${spark_dist}.tgz" && \
    ln -s "${spark_dist}" spark && \
    rm -f "${spark_dist}.tgz"
 
# Download and install GraphFrames.
# The jar is saved directly into Spark's jars directory (presumably so Spark
# picks it up without a --packages download at start-up; confirm).
# No "apk del .fetch-deps" may run before this step: --show-progress is a
# GNU wget option, so the wget from .fetch-deps must still be installed here.
# The cleanup step later in this file removes .fetch-deps.
RUN cd "$SPARK_INSTALL/spark/jars" && \
    wget -q --show-progress --progress=bar:force:noscroll "https://repos.spark-packages.org/graphframes/graphframes/$GRAPHFRAMES_VERSION/graphframes-$GRAPHFRAMES_VERSION.jar"
 
# Remove the build-time download tools, then clear temporary and cache
# directories.
# NOTE(review): files deleted here still exist in the earlier layers that
# created them, so this step tidies the final filesystem but does not shrink
# those layers.
RUN apk del .fetch-deps \
    && rm -rf \
        /tmp/* \
        /var/cache/* \
        /root/.cache
 
# Copy the two start scripts from the build context into /usr/local/bin/.
COPY start-master.sh \
     start-worker.sh \
     /usr/local/bin/
 
# these need to be separate because you can't reference prior environment