# docker-analytics / pyspark / Dockerfile
# Base image; presumably provides Spark and a Python/pip toolchain.
# NOTE(review): no tag is given, so this resolves to :latest — consider pinning.
FROM comp101/spark

# Build-time settings for the unprivileged notebook account.
ARG NB_USER=pyspark
ARG NB_UID=1000
# NOTE(review): NB_GID is declared but not referenced anywhere in this file.
ARG NB_GID=1000

# Account creation and package installation need root.
USER root

# Create the notebook account with a fixed UID (BusyBox adduser; -D assigns
# no password).
RUN adduser -D -u ${NB_UID} ${NB_USER}

# Install the Python notebook stack in a single layer. The toolchain and
# headers are grouped under the .build-deps virtual package so they can be
# removed again before the layer is committed.
# pip runs with --no-cache-dir so its download/wheel cache is not baked into
# the image.
# tornado is pinned to 5.1 because Tornado 6 breaks sparkmonitor.
# NOTE(review): py3-zmq and zeromq-dev are part of .build-deps and are removed
# by the final `apk del`; confirm pyzmq still imports at runtime.
RUN apk add --no-cache --virtual .build-deps \
        build-base \
        py3-zmq \
        python3-dev \
        zeromq-dev \
    && pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir tornado==5.1 pyspark jupyter sparkmonitor \
    && apk del .build-deps

# Put Spark's Python sources and its py4j archive on the import path.
# SPARK_HOME is not defined in this file — presumably set by the base image.
# NOTE(review): the py4j version in the path is hard-coded and must match the
# archive shipped with the installed Spark.
ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip
# ENV PYSPARK_DRIVER_PYTHON jupyter
# ENV PYSPARK_DRIVER_PYTHON_OPTS "notebook --ip=0.0.0.0 --port=8888"
# ENV SPARKMONITOR_UI_PORT 8080

# Everything from here on runs as the unprivileged notebook account.
USER ${NB_UID}

# Per-user (--user) sparkmonitor setup: install and enable the notebook
# extension, enable the server extension, create the default IPython profile,
# then append the line that loads the sparkmonitor kernel extension to that
# profile's kernel config.
RUN jupyter nbextension install --py --user --symlink sparkmonitor \
    && jupyter nbextension enable --py --user sparkmonitor \
    && jupyter serverextension enable --py --user sparkmonitor \
    && ipython profile create \
    && printf '%s\n' "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" \
        >> "$(ipython profile locate default)/ipython_kernel_config.py"

# Ship the demo notebook in the notebook user's work directory.
# COPY creates files and directories as root regardless of the active USER,
# so ownership is set explicitly; otherwise the notebook and the work
# directory created for it would not be writable by $NB_USER.
# The group name relies on adduser having created a group named after the user.
COPY --chown=${NB_USER}:${NB_USER} pyspark_demo.ipynb /home/$NB_USER/work/pyspark_demo.ipynb

# WORKDIR /home/$NB_USER/work
# Working directory for the default command.
# NOTE(review): /mnt/sparkdata is not populated anywhere in this file —
# presumably a data volume mounted at run time; confirm.
WORKDIR /mnt/sparkdata

# Document the port the default command below listens on. EXPOSE does not
# publish the port by itself; it records the contract for operators/tooling.
EXPOSE 8888

# CMD ["pyspark"]
# PySpark doesn't seem to load the sparkmonitor extension, so let's just
# go with Jupyter and manually create contexts and sessions as required.
# Exec form: Jupyter runs directly, without a wrapping shell. It listens on
# all interfaces (0.0.0.0), port 8888.
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888"]

# debugging
# CMD ["bash"]