docker-analytics/pyspark/Dockerfile
# Don't use the official jupyter/pyspark-notebook image because it's massive!
FROM nstanger/spark

# Ideally we should run as a non-root user, but it's problematic to set
# up shared files, especially if running Docker within, say, VirtualBox.
# See https://vsupalov.com/docker-shared-permissions/
# ARG NB_USER="pyspark"
# ARG NB_UID="1000"
# ARG NB_GID="1000"

USER root

# RUN adduser -D -G root $NB_USER

RUN pip install --upgrade pip && \
    pip install \
        # Tornado 6 breaks sparkmonitor
        # tornado==5.1 \
        # sparkmonitor \
        jupyter kafka-python graphframes ; \
    # Clean up the crap
    rm -rf /tmp/* && \
    rm -rf /var/cache/* && \
    rm -rf /root/.cache

ENV PYSPARK_PYTHON="/usr/local/bin/python"

ENV KERNEL_NAME="PySpark" \
    KERNELS_TEMPLATE_PATH="/tmp" \
    KERNELS_DIR_PATH="/root/.local/share/jupyter/kernels" \
    PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip" \
    PYSPARK_DRIVER_PYTHON="$PYSPARK_PYTHON" \
    PYSPARK_SUBMIT_ARGS="--master $SPARK_MASTER pyspark-shell" \
    PYTHONSTARTUP="$SPARK_HOME/python/pyspark/shell.py"

# ENV SPARKMONITOR_UI_PORT 8080

# USER $NB_USER

RUN ipython kernel install --name $KERNEL_NAME
# kernel.json is based on the template from <https://github.com/Anchormen/pyspark-jupyter-kernels>.
COPY kernel.json $KERNELS_DIR_PATH/$KERNEL_NAME/
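# For illustration only (this is a sketch, not the actual file): kernel.json follows
# the standard Jupyter kernelspec format, so it should look roughly like
#   {
#     "display_name": "PySpark",
#     "language": "python",
#     "argv": ["/usr/local/bin/python", "-m", "ipykernel_launcher", "-f", "{connection_file}"]
#   }
# possibly plus an "env" block. The PySpark initialisation itself comes from the
# ENV settings above: PYTHONSTARTUP points at pyspark's shell.py, which creates
# the SparkSession/SparkContext when the kernel starts.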

# RUN jupyter nbextension install sparkmonitor --py --symlink \
#     && jupyter nbextension enable sparkmonitor --py \
#     && jupyter serverextension enable --py sparkmonitor \
#     && ipython profile create \
#     && echo "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" >> $(ipython profile locate default)/ipython_kernel_config.py

# RUN mkdir -p /home/$NB_USER/work
# WORKDIR /home/$NB_USER/work

ENTRYPOINT ["/sbin/tini", "--"]

# The kernel will automatically set up the PySpark context when it is loaded.
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--allow-root"]

# debugging
# CMD ["bash"]
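
Building and running the image should look something like the following (the tag here is arbitrary, and SPARK_MASTER is assumed to be provided by the nstanger/spark base image):

    docker build -t docker-analytics/pyspark .
    docker run --rm -p 8888:8888 docker-analytics/pyspark

Once the notebook server is up on port 8888, a new notebook started with the PySpark kernel should already have the usual spark session and sc context defined, courtesy of pyspark's shell.py.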