# Don't use the official jupyter/pyspark-notebook image because it's massive! FROM nstanger/spark # Ideally we should run as a non-root user, but it's problematic to set # up shared files, especially if running Docker within, say, VirtualBox. # See https://vsupalov.com/docker-shared-permissions/ # ARG NB_USER="pyspark" # ARG NB_UID="1000" # ARG NB_GID="1000" USER root # RUN adduser -D -G root $NB_USER RUN pip install --upgrade pip && \ pip install \ # Tornado 6 breaks sparkmonitor # tornado==5.1 \ # sparkmonitor \ jupyter kafka-python graphframes ; \ # Clean up the crap rm -rf /tmp/* && \ rm -rf /var/cache/* && \ rm -rf /root/.cache ENV PYSPARK_PYTHON="/usr/local/bin/python" ENV KERNEL_NAME="PySpark" \ KERNELS_TEMPLATE_PATH="/tmp" \ KERNELS_DIR_PATH="/root/.local/share/jupyter/kernels" \ PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip" \ PYSPARK_DRIVER_PYTHON="$PYSPARK_PYTHON" \ PYSPARK_SUBMIT_ARGS="--master $SPARK_MASTER pyspark-shell" \ PYTHONSTARTUP="$SPARK_HOME/python/pyspark/shell.py" # ENV SPARKMONITOR_UI_PORT 8080 # USER $NB_USER RUN ipython kernel install --name $KERNEL_NAME # kernel.json is based on the template from <https://github.com/Anchormen/pyspark-jupyter-kernels>. COPY kernel.json $KERNELS_DIR_PATH/$KERNEL_NAME/ # RUN jupyter nbextension install sparkmonitor --py --symlink \ # && jupyter nbextension enable sparkmonitor --py \ # && jupyter serverextension enable --py sparkmonitor \ # && ipython profile create \ # && echo "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" >> $(ipython profile locate default)/ipython_kernel_config.py # RUN mkdir -p /home/$NB_USER/work # WORKDIR /home/$NB_USER/work ENTRYPOINT ["/sbin/tini", "--"] # The kernel will automatically set up the PySpark context when it is loaded. CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--allow-root"] # debugging # CMD ["bash"]