# Don't use the official jupyter/pyspark-notebook image because it's massive!
FROM analytics/spark
ARG NB_USER="pyspark"
ARG NB_UID="1000"
ARG NB_GID="1000"
USER root
# Create the group explicitly so NB_GID is actually used, then the user
RUN addgroup -g $NB_GID $NB_USER \
    && adduser -D -u $NB_UID -G $NB_USER $NB_USER
# Build the Python deps in a single layer: install a throwaway .build-deps
# virtual package, pip-install, then delete the toolchain in the same RUN so
# it never lands in the final image (an `apk del` in a separate RUN would not
# shrink the earlier layers). zeromq (the runtime libzmq) is installed
# permanently so the pip-built pyzmq still loads after zeromq-dev is removed;
# py3-zmq is deliberately not installed, since an apk-provided pyzmq would be
# deleted along with .build-deps.
RUN apk add --no-cache zeromq \
    && apk add --no-cache --virtual .build-deps \
        build-base \
        zeromq-dev \
        python3-dev \
    && pip install --upgrade pip \
    && pip install \
        jupyter \
        kafka-python \
        pyspark \
        sparkmonitor \
    && apk del .build-deps
ENV PYSPARK_PYTHON="/usr/local/bin/python3"
# Jupyter lowercases kernelspec names on install, so keep KERNEL_NAME lowercase
# to match the directory that `ipython kernel install` actually creates.
ENV KERNEL_NAME="pyspark" \
KERNELS_TEMPLATE_PATH="/tmp" \
KERNELS_DIR_PATH="/home/$NB_USER/.local/share/jupyter/kernels" \
PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip" \
PYSPARK_DRIVER_PYTHON="$PYSPARK_PYTHON" \
PYSPARK_SUBMIT_ARGS="--master $SPARK_MASTER pyspark-shell" \
PYTHONSTARTUP="$SPARK_HOME/python/pyspark/shell.py"
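# SPARK_HOME and SPARK_MASTER are assumed to be provided by the analytics/spark
# base image. If your cluster's master URL differs, you can override it at run
# time, e.g. (hypothetical master address):
#   docker run -e SPARK_MASTER=spark://spark-master:7077 <image>
# Note that PYSPARK_SUBMIT_ARGS above expands $SPARK_MASTER at build time, so a
# run-time override may also need to reset PYSPARK_SUBMIT_ARGS.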
# Alternative, currently unused: render kernel.json from a pyhocon template
# instead of copying a static file.
# RUN mkdir -p $KERNELS_DIR_PATH/$KERNEL_NAME \
# && pip install pyhocon \
# && cat $KERNELS_TEMPLATE_PATH/kernel.json | pyhocon -f json >> $KERNELS_DIR_PATH/$KERNEL_NAME/kernel.json \
# && pip uninstall -y pyhocon pyparsing
# ENV SPARKMONITOR_UI_PORT 8080
USER $NB_UID
RUN ipython kernel install --user --name $KERNEL_NAME
# COPY writes root-owned files by default; chown them so the unprivileged
# notebook user can read (and Jupyter can manage) the kernelspec.
COPY --chown=$NB_UID:$NB_GID kernel.json pyspark-kernel.sh $KERNELS_DIR_PATH/$KERNEL_NAME/
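# The copied kernel.json is expected to point Jupyter at pyspark-kernel.sh.
# A minimal sketch of what such a kernelspec might contain (illustrative only,
# not the actual file in this repo):
#   {
#     "display_name": "PySpark",
#     "language": "python",
#     "argv": ["/home/pyspark/.local/share/jupyter/kernels/pyspark/pyspark-kernel.sh",
#              "-f", "{connection_file}"]
#   }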
RUN jupyter nbextension install sparkmonitor --py --user --symlink \
&& jupyter nbextension enable sparkmonitor --py --user \
&& jupyter serverextension enable --py --user sparkmonitor \
&& ipython profile create \
&& echo "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" >> $(ipython profile locate default)/ipython_kernel_config.py
RUN mkdir -p /home/$NB_USER/work
WORKDIR /home/$NB_USER/work
# CMD ["pyspark"]
# PySpark doesn't seem to load the sparkmonitor extension, so let's just
# go with Jupyter and manually create contexts and sessions as required.
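# e.g., in a notebook cell (a sketch; sparkmonitor's kernel extension is
# documented to inject a preconfigured SparkConf named `conf` into the
# kernel's namespace):
#   from pyspark import SparkContext
#   sc = SparkContext(conf=conf)
#   # or, for a SparkSession:
#   from pyspark.sql import SparkSession
#   spark = SparkSession.builder.config(conf=conf).getOrCreate()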
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser"]
# debugging
# CMD ["bash"]