- # Don't use the official jupyter/pyspark-notebook image because it's massive!
- # The base image tag is a build argument so that a build can pin a specific
- # tag, e.g. `docker build --build-arg SPARK_TAG=<tag> .`. The default,
- # "latest", is the tag an untagged FROM resolves to, so default builds are
- # unchanged.
- ARG SPARK_TAG=latest
- FROM nstanger/spark:${SPARK_TAG}
-
- # Ideally we should run as a non-root user, but it's problematic to set
- # up shared files, especially if running Docker within, say, VirtualBox.
- # See https://vsupalov.com/docker-shared-permissions/
- # The commented-out NB_* arguments and the adduser step below are the
- # disabled non-root setup, kept for reference.
- # ARG NB_USER="pyspark"
- # ARG NB_UID="1000"
- # ARG NB_GID="1000"
-
- # Build and run as root (hence --allow-root on the notebook command and the
- # kernels directory under /root).
- USER root
-
- # RUN adduser -D -G root $NB_USER
-
- # Install Jupyter and the Python client libraries used by the notebooks.
- # Every step is chained with && so that a failed pip install aborts the build
- # instead of being masked by the exit status of the clean-up commands.
- # --no-cache-dir stops pip from writing its download cache into the layer.
- RUN pip install --no-cache-dir --upgrade pip && \
-     pip install --no-cache-dir \
-         # Tornado 6 breaks sparkmonitor
-         # tornado==5.1 \
-         # sparkmonitor \
-         jupyter kafka-python graphframes && \
-     # Remove temporary files and caches in the same layer that created them.
-     rm -rf /tmp/* && \
-     rm -rf /var/cache/* && \
-     rm -rf /root/.cache
-
- # Python interpreter for PySpark. It is declared in its own ENV instruction so
- # that it can be substituted into PYSPARK_DRIVER_PYTHON below: a variable is
- # not visible to substitutions inside the instruction that defines it.
- ENV PYSPARK_PYTHON="/usr/local/bin/python"
-
- # Kernel and PySpark settings. SPARK_HOME and SPARK_MASTER are not defined in
- # this file; presumably they are inherited from the base image (verify).
- # NOTE(review): the py4j version in PYTHONPATH is hard-coded and presumably
- # has to match the py4j archive shipped under $SPARK_HOME/python/lib; confirm
- # whenever the base image changes.
- ENV KERNEL_NAME="PySpark" \
-     KERNELS_TEMPLATE_PATH="/tmp" \
-     KERNELS_DIR_PATH="/root/.local/share/jupyter/kernels" \
-     PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip" \
-     PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON}" \
-     PYSPARK_SUBMIT_ARGS="--master ${SPARK_MASTER} pyspark-shell" \
-     PYTHONSTARTUP="${SPARK_HOME}/python/pyspark/shell.py"
-
- # ENV SPARKMONITOR_UI_PORT 8080
-
- # USER $NB_USER
-
- # Register an IPython kernel spec named after KERNEL_NAME.
- RUN ipython kernel install --name ${KERNEL_NAME}
- # Copy the project's kernel.json into ${KERNELS_DIR_PATH}/${KERNEL_NAME}/.
- # kernel.json is based on the template from <https://github.com/Anchormen/pyspark-jupyter-kernels>.
- COPY kernel.json ${KERNELS_DIR_PATH}/${KERNEL_NAME}/
-
- # RUN jupyter nbextension install sparkmonitor --py --symlink \
- # && jupyter nbextension enable sparkmonitor --py \
- # && jupyter serverextension enable --py sparkmonitor \
- # && ipython profile create \
- # && echo "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" >> $(ipython profile locate default)/ipython_kernel_config.py
-
- # RUN mkdir -p /home/$NB_USER/work
- # WORKDIR /home/$NB_USER/work
-
- # The notebook server listens on port 8888 (see --port below). EXPOSE only
- # records this in the image metadata for operators and tooling; it does not
- # publish the port by itself.
- EXPOSE 8888
-
- # Start the default command under tini (/sbin/tini is presumably provided by
- # the base image; it is not installed in this file).
- ENTRYPOINT ["/sbin/tini", "--"]
-
- # The kernel will automatically set up the PySpark context when it is loaded.
- # --ip=0.0.0.0 makes the server listen on all interfaces so that it can be
- # reached from outside the container; --allow-root is needed because the
- # image runs as root.
- CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--allow-root"]
-
- # debugging
- # CMD ["bash"]