# docker-analytics / pyspark / Dockerfile
# Don't use the official jupyter/pyspark-notebook image because it's massive!
# NOTE(review): the base image is referenced without a tag or digest, so each
# build resolves whatever "latest" points to at that time — consider pinning.
# The apk and pip commands used later imply an Alpine base with Python and pip
# already installed — TODO confirm against the analytics/spark image.
FROM analytics/spark

# Name and numeric IDs of the unprivileged notebook user.
# All three can be overridden at build time with --build-arg.
ARG NB_USER="pyspark"
ARG NB_UID="1000"
ARG NB_GID="1000"

# Creating accounts and installing packages requires root.
USER root

# Create the user's primary group explicitly so that NB_GID is honoured;
# a bare `adduser` would ignore it and derive the group ID on its own.
# -D creates the account without assigning a password.
RUN addgroup -g "$NB_GID" "$NB_USER" \
    && adduser -D -u "$NB_UID" -G "$NB_USER" "$NB_USER"

# Install the Python packages and drop the compiler toolchain in ONE layer.
# Files removed by a later RUN still occupy space in the earlier layer, so
# installing and deleting .build-deps in separate instructions does not
# reduce the image size.
# --no-cache-dir keeps pip's download cache out of the layer as well.
# NOTE(review): py3-zmq and the libraries pulled in by zeromq-dev belong to
# the .build-deps virtual package and are removed again at the end (as in the
# original file) — confirm nothing needs them at runtime.
RUN apk add --no-cache --virtual .build-deps \
        build-base \
        py3-zmq \
        python3-dev \
        zeromq-dev \
    && pip install --no-cache-dir --upgrade \
        pip \
    && pip install --no-cache-dir \
        jupyter \
        kafka-python \
        pyspark \
        sparkmonitor \
    && apk del .build-deps

# Python interpreter PySpark should launch. Declared in its own ENV
# instruction because the next instruction expands ${PYSPARK_PYTHON}, and
# substitutions only see values set by earlier instructions.
ENV PYSPARK_PYTHON="/usr/local/bin/python3"

# Spark and Jupyter kernel settings. Every ${...} reference is substituted
# at build time and the resulting literal value is stored in the image.
# SPARK_HOME and SPARK_MASTER are not defined in this file — presumably they
# are inherited from the base image; verify.
# NOTE(review): the py4j archive name embeds a version (0.10.7) and must match
# the file actually present under ${SPARK_HOME}/python/lib — confirm.
ENV PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON}" \
    PYSPARK_SUBMIT_ARGS="--master ${SPARK_MASTER} pyspark-shell" \
    PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip" \
    PYTHONSTARTUP="${SPARK_HOME}/python/pyspark/shell.py" \
    KERNEL_NAME="PySpark" \
    KERNELS_DIR_PATH="/home/${NB_USER}/.local/share/jupyter/kernels" \
    KERNELS_TEMPLATE_PATH="/tmp"

# ENV SPARKMONITOR_UI_PORT 8080

# Everything from here on runs as the unprivileged notebook user.
USER $NB_UID

# Register a per-user IPython kernel under the name given by KERNEL_NAME.
RUN ipython kernel install --user --name $KERNEL_NAME
# kernel.json is based on the template from <https://github.com/Anchormen/pyspark-jupyter-kernels>.
# COPY always creates files owned by 0:0 regardless of the active USER, so
# set the owner explicitly; otherwise the kernel files placed in the user's
# home directory would belong to root.
COPY --chown=$NB_UID:$NB_GID kernel.json pyspark-kernel.sh $KERNELS_DIR_PATH/$KERNEL_NAME/

# Wire up sparkmonitor for the current user:
#   1. install and enable its notebook front-end extension,
#   2. enable its Jupyter server extension,
#   3. create the default IPython profile and append a line to its
#      ipython_kernel_config.py that loads the sparkmonitor kernel extension.
RUN jupyter nbextension install sparkmonitor --py --user --symlink \
    && jupyter nbextension enable sparkmonitor --py --user \
    && jupyter serverextension enable --py --user sparkmonitor \
    && ipython profile create \
    && profile_dir="$(ipython profile locate default)" \
    && printf '%s\n' "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" \
        >> "${profile_dir}/ipython_kernel_config.py"

# Create the notebook working directory with an explicit RUN (executed as the
# currently active build user) and make it the default directory for the
# instructions that follow and for the running container.
RUN mkdir -p "/home/${NB_USER}/work"
WORKDIR /home/${NB_USER}/work

# pyspark-kernel.sh will automatically set up the PySpark context when
# the kernel is loaded.
# Declare the port the notebook server listens on (see --port below). EXPOSE
# is metadata only; the port must still be published at `docker run` time.
EXPOSE 8888
# Exec form: jupyter runs directly as the container's main process and
# listens on all interfaces so it is reachable from outside the container.
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser"]

# debugging
# CMD ["bash"]