Fix "Eliminate superfluous Spark tarball download (#37)" (#39)
PR #37 broke notebooks in Colab: packages there are not installed in
`sysconfig.get_paths()["purelib"]` but in a directory listed via
`site.getsitepackages()`.

For extra safety, this also reverts commit 3d76f31, restoring the Spark
tarball download as a fallback.
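The difference is easy to see interactively. A minimal probe of the two locations the message refers to (illustrative only, not part of this commit):

```python
import os
import site
import sysconfig

# Where pure-Python packages land according to sysconfig ...
print("purelib:", sysconfig.get_paths()["purelib"])

# ... versus the site-packages directories Colab actually uses.
for d in site.getsitepackages():
    has_pyspark = os.path.isdir(os.path.join(d, "pyspark"))
    print("site-packages:", d, "pyspark installed:", has_pyspark)
```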
snazy authored May 18, 2021
1 parent bbaa20a · commit eb593d2
Showing 2 changed files with 15 additions and 2 deletions.
2 changes: 2 additions & 0 deletions configs/nessie-0.5-iceberg-0.11.yml
@@ -27,3 +27,5 @@ python_dependencies:
   - pandas==1.2.4
   - pyarrow==4.0.0
 
+spark:
+  tarball: https://downloads.apache.org/spark/spark-3.0.2/spark-3.0.2-bin-hadoop2.7.tgz
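The `tarball` URL restored above is later parsed by spark.py (see the second hunk below) to derive the directory name inside the archive. A quick check of that regex against this URL, assuming standard `re` semantics:

```python
import re

spark_url = "https://downloads.apache.org/spark/spark-3.0.2/spark-3.0.2-bin-hadoop2.7.tgz"
m = re.match(".*[/]([a-zA-Z0-9-.]+)[.]tgz", spark_url)
print(m.group(1))  # prints: spark-3.0.2-bin-hadoop2.7
```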
15 changes: 13 additions & 2 deletions pydemolib/nessiedemo/spark.py
@@ -25,6 +25,7 @@
 
 import os
 import re
+import site
 import sysconfig
 from types import TracebackType
 from typing import Any, Tuple, TypeVar
@@ -55,10 +56,19 @@ def __init__(self: T, demo: NessieDemo) -> None:
         """Creates a `NessieDemoSpark` instance for respectively using the given `NessieDemo` instance."""
         self.__demo = demo
 
+        spark_dir = None
+
         pyspark_dir = os.path.join(sysconfig.get_paths()["purelib"], "pyspark")
         if os.path.isdir(pyspark_dir):
             spark_dir = pyspark_dir
-        elif "spark" in self.__demo._get_versions_dict() and "tarball" in self.__demo._get_versions_dict()["spark"]:
+        else:
+            for dir in site.getsitepackages():
+                pyspark_dir = os.path.join(dir, "pyspark")
+                if os.path.isdir(pyspark_dir):
+                    spark_dir = pyspark_dir
+                    break
+
+        if not spark_dir and "spark" in self.__demo._get_versions_dict() and "tarball" in self.__demo._get_versions_dict()["spark"]:
             spark_url = self.__demo._get_versions_dict()["spark"]["tarball"]
             # derive directory name inside the tarball from the URL
             m = re.match(".*[/]([a-zA-Z0-9-.]+)[.]tgz", spark_url)
@@ -71,7 +81,8 @@ def __init__(self: T, demo: NessieDemo) -> None:
             if not os.path.exists(tgz):
                 _Util.wget(spark_url, tgz)
             _Util.exec_fail(["tar", "-x", "-C", os.path.abspath(os.path.join(spark_dir, "..")), "-f", tgz])
-        else:
+
+        if not spark_dir:
             raise Exception("configuration does not define spark.tarball and pyspark is not installed. Unable to find Spark.")
 
         print("Using Spark in {}".format(spark_dir))
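After this patch, `__init__` resolves Spark in three steps: `purelib`, then each `site.getsitepackages()` directory, then the configured tarball. A condensed standalone sketch of that lookup order (`find_pyspark_dir` is a hypothetical helper, not part of the repository):

```python
import os
import site
import sysconfig


def find_pyspark_dir():
    # purelib first (the pre-#39 behavior), then every site-packages
    # directory, which is where Colab installs packages.
    candidates = [sysconfig.get_paths()["purelib"], *site.getsitepackages()]
    for base in candidates:
        pyspark_dir = os.path.join(base, "pyspark")
        if os.path.isdir(pyspark_dir):
            return pyspark_dir
    return None  # caller falls back to the spark.tarball URL from the config
```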
