diff --git a/pyspark/Dockerfile b/pyspark/Dockerfile
index bd5c0ee..a85adf7 100644
--- a/pyspark/Dockerfile
+++ b/pyspark/Dockerfile
@@ -1,5 +1,5 @@
 # Don't use the official jupyter/pyspark-notebook image because it's massive!
-FROM nstanger/spark
+FROM nstanger/spark2
 
 # Ideally we should run as a non-root user, but it's problematic to set
 # up shared files, especially if running Docker within, say, VirtualBox.
@@ -12,28 +12,32 @@
 # RUN adduser -D -G root $NB_USER
 
+RUN apk --no-cache add \
+    zeromq-dev
+
 RUN apk add --no-cache --virtual .build-deps \
     build-base \
-    zeromq-dev \
-    python3-dev \
     libffi-dev \
-    py3-zmq
+    python3-dev \
+    py3-wheel \
+    py3-numpy
 
 RUN pip install --upgrade \
     pip \
     && pip install \
         # Tornado 6 breaks sparkmonitor
-        tornado==5.1 \
+        # tornado==5.1 \
         jupyter \
         kafka-python \
-        pyspark \
-        sparkmonitor \
-        graphframes \
-        tini
+        # sparkmonitor \
+        graphframes
 
-RUN apk del .build-deps
+RUN apk del .build-deps && \
+    rm -rf /tmp/* && \
+    rm -rf /var/cache/* && \
+    rm -rf /root/.cache
 
-ENV PYSPARK_PYTHON="/usr/local/bin/python3"
+ENV PYSPARK_PYTHON="/usr/local/bin/python"
 
 ENV KERNEL_NAME="PySpark" \
     KERNELS_TEMPLATE_PATH="/tmp" \
@@ -49,21 +53,20 @@ RUN ipython kernel install --name $KERNEL_NAME
 
 # kernel.json is based on the template from .
-COPY kernel.json pyspark-kernel.sh $KERNELS_DIR_PATH/$KERNEL_NAME/
+COPY kernel.json $KERNELS_DIR_PATH/$KERNEL_NAME/
 
-RUN jupyter nbextension install sparkmonitor --py --symlink \
-    && jupyter nbextension enable sparkmonitor --py \
-    && jupyter serverextension enable --py sparkmonitor \
-    && ipython profile create \
-    && echo "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" >> $(ipython profile locate default)/ipython_kernel_config.py
+# RUN jupyter nbextension install sparkmonitor --py --symlink \
+#     && jupyter nbextension enable sparkmonitor --py \
+#     && jupyter serverextension enable --py sparkmonitor \
+#     && ipython profile create \
+#     && echo "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" >> $(ipython profile locate default)/ipython_kernel_config.py
 
 # RUN mkdir -p /home/$NB_USER/work
 # WORKDIR /home/$NB_USER/work
 
 ENTRYPOINT ["/sbin/tini", "--"]
 
-# pyspark-kernel.sh will automatically set up the PySpark context when
-# the kernel is loaded.
+# The kernel will automatically set up the PySpark context when it is loaded.
 CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--allow-root"]
 
 # debugging
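As a quick sanity check of the rebuilt image, it can be built and started roughly as below; the image tag (pyspark-notebook) and the build context path are illustrative assumptions, not part of this diff:

    # Build the image from the directory containing pyspark/Dockerfile
    # (the tag name here is hypothetical).
    docker build -t pyspark-notebook pyspark/

    # Start the notebook server: tini (the ENTRYPOINT) forwards signals and
    # reaps child processes, and the CMD exposes Jupyter on port 8888.
    docker run --rm -it -p 8888:8888 pyspark-notebook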