docker-analytics/pyspark/Dockerfile
# Don't use the official jupyter/pyspark-notebook image because it's massive!
FROM nstanger/spark

# Ideally we should run as a non-root user, but it's problematic to set
# up shared files, especially if running Docker within, say, VirtualBox.
# See https://vsupalov.com/docker-shared-permissions/
# ARG NB_USER="pyspark"
# ARG NB_UID="1000"
# ARG NB_GID="1000"

USER root

# RUN adduser -D -G root $NB_USER

RUN pip install --upgrade pip && \
    pip install \
        # Tornado 6 breaks sparkmonitor
        # tornado==5.1 \
        # sparkmonitor \
        jupyter kafka-python graphframes ; \
    # Clean up the crap
    rm -rf /tmp/* && \
    rm -rf /var/cache/* && \
    rm -rf /root/.cache

ENV PYSPARK_PYTHON="/usr/local/bin/python"

ENV KERNEL_NAME="PySpark" \
    KERNELS_TEMPLATE_PATH="/tmp" \
    KERNELS_DIR_PATH="/root/.local/share/jupyter/kernels" \
    PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip" \
    PYSPARK_DRIVER_PYTHON="$PYSPARK_PYTHON" \
    PYSPARK_SUBMIT_ARGS="--master $SPARK_MASTER pyspark-shell" \
    PYTHONSTARTUP="$SPARK_HOME/python/pyspark/shell.py"

# ENV SPARKMONITOR_UI_PORT 8080

# USER $NB_USER

RUN ipython kernel install --name $KERNEL_NAME
# kernel.json is based on the template from <https://github.com/Anchormen/pyspark-jupyter-kernels>.
COPY kernel.json $KERNELS_DIR_PATH/$KERNEL_NAME/
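# For illustration only (this is a sketch, not the actual file): kernel.json follows
# the standard Jupyter kernelspec format, so it should look roughly like
#   {
#     "display_name": "PySpark",
#     "language": "python",
#     "argv": ["/usr/local/bin/python", "-m", "ipykernel_launcher", "-f", "{connection_file}"]
#   }
# possibly plus an "env" block. The PySpark initialisation itself comes from the
# ENV settings above: PYTHONSTARTUP points at pyspark's shell.py, which creates
# the SparkSession/SparkContext when the kernel starts.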

# RUN jupyter nbextension install sparkmonitor --py --symlink \
#     && jupyter nbextension enable sparkmonitor --py \
#     && jupyter serverextension enable --py sparkmonitor \
#     && ipython profile create \
#     && echo "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" >> $(ipython profile locate default)/ipython_kernel_config.py

# RUN mkdir -p /home/$NB_USER/work
# WORKDIR /home/$NB_USER/work

ENTRYPOINT ["/sbin/tini", "--"]

# The kernel will automatically set up the PySpark context when it is loaded.
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--allow-root"]

# debugging
# CMD ["bash"]
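
Building and running the image should look something like the following (the tag here is arbitrary, and SPARK_MASTER is assumed to be provided by the nstanger/spark base image):

    docker build -t docker-analytics/pyspark .
    docker run --rm -p 8888:8888 docker-analytics/pyspark

Once the notebook server is up on port 8888, a new notebook started with the PySpark kernel should already have the usual spark session and sc context defined, courtesy of pyspark's shell.py.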