docker-analytics / pyspark / Dockerfile
# Don't use the official jupyter/pyspark-notebook image because it's massive!
FROM analytics/spark

ARG NB_USER="pyspark"
ARG NB_UID="1000"
ARG NB_GID="1000"

USER root

RUN adduser -D -u $NB_UID $NB_USER

RUN apk add --no-cache --virtual .build-deps \
    build-base \
    zeromq-dev \
    python3-dev \
    py3-zmq

RUN pip install --upgrade \
    pip \
    && pip install \
    # Tornado 6 breaks sparkmonitor
    tornado==5.1 \
    jupyter \
    kafka-python \
    pyspark \
    sparkmonitor \
    tini

RUN apk del .build-deps

ENV PYSPARK_PYTHON="/usr/local/bin/python3"

ENV KERNEL_NAME="PySpark" \
    KERNELS_TEMPLATE_PATH="/tmp" \
    KERNELS_DIR_PATH="/home/$NB_USER/.local/share/jupyter/kernels" \
    PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip" \
    PYSPARK_DRIVER_PYTHON="$PYSPARK_PYTHON" \
    PYSPARK_SUBMIT_ARGS="--master $SPARK_MASTER pyspark-shell" \
    PYTHONSTARTUP="$SPARK_HOME/python/pyspark/shell.py"

# ENV SPARKMONITOR_UI_PORT 8080

USER $NB_UID

RUN ipython kernel install --user --name $KERNEL_NAME
# kernel.json is based on the template from <https://github.com/Anchormen/pyspark-jupyter-kernels>.
COPY kernel.json pyspark-kernel.sh $KERNELS_DIR_PATH/$KERNEL_NAME/
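
# For reference, a hypothetical minimal kernel.json in the standard Jupyter
# kernelspec format (the real file follows the Anchormen template linked above);
# it simply hands the connection file to pyspark-kernel.sh:
#   {
#     "display_name": "PySpark",
#     "language": "python",
#     "argv": ["/home/pyspark/.local/share/jupyter/kernels/PySpark/pyspark-kernel.sh",
#              "{connection_file}"]
#   }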

RUN jupyter nbextension install sparkmonitor --py --user --symlink \
    && jupyter nbextension enable sparkmonitor --py --user \
    && jupyter serverextension enable --py --user sparkmonitor \
    && ipython profile create \
    && echo "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" >> $(ipython profile locate default)/ipython_kernel_config.py

RUN mkdir -p /home/$NB_USER/work
WORKDIR /home/$NB_USER/work

ENTRYPOINT ["/sbin/tini", "--"]

# pyspark-kernel.sh will automatically set up the PySpark context when
# the kernel is loaded.
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888"]
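
# One plausible (hypothetical) body for pyspark-kernel.sh: Jupyter passes the
# kernel connection file as "$1", and because the IPython kernel honours
# PYTHONSTARTUP, pyspark/shell.py creates sc and spark when the kernel starts:
#   exec "$PYSPARK_DRIVER_PYTHON" -m ipykernel_launcher -f "$1"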

# debugging
# CMD ["bash"]
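
To try the image, a minimal build-and-run sketch (the analytics/pyspark tag is an
assumption, and the container must still be able to reach the Spark master named by
SPARK_MASTER in the analytics/spark base image); only the Jupyter port 8888 from the
CMD needs publishing:

    docker build -t analytics/pyspark .
    docker run --rm -p 8888:8888 analytics/pyspark

Jupyter prints a tokenized URL on startup; open it on localhost:8888 and choose the
PySpark kernel to get a notebook with the Spark context already set up.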