Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dbfs HF #1214

Merged
merged 38 commits into from
Jun 15, 2024
Merged

Dbfs HF #1214

Changes from 13 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
d2f059d
remerge
KuuCi May 15, 2024
dbf3720
import supported extensions
KuuCi May 15, 2024
44db361
no circular imports
KuuCi May 15, 2024
0cd0271
local import
KuuCi May 16, 2024
5c3dd59
typo
KuuCi May 16, 2024
060ea34
testing
KuuCi May 16, 2024
5558a2d
test
KuuCi May 16, 2024
0f51530
test
KuuCi May 16, 2024
d9fe08c
more test
KuuCi May 16, 2024
d88077b
rm . from ext
KuuCi May 16, 2024
605816c
clean up and pre commit
KuuCi May 16, 2024
44420b4
imports and boolean typo
KuuCi May 16, 2024
507fc28
Merge branch 'main' into dbfs-hf
dakinggg May 16, 2024
709d409
revert one exception, use backend over startwith, and set path using …
KuuCi May 17, 2024
46fa3ce
move import to local
KuuCi May 17, 2024
490f5fa
debug
KuuCi May 17, 2024
ad58877
typo
KuuCi May 17, 2024
f6ace0e
debug
KuuCi May 17, 2024
05f1ee4
more debug
KuuCi May 17, 2024
95a0215
debug
KuuCi May 17, 2024
1b8702b
clean up
KuuCi May 17, 2024
f83730a
back to debug
KuuCi May 17, 2024
f9d82a6
typo
KuuCi May 17, 2024
6129058
add slash
KuuCi May 17, 2024
5699b5f
move slash to verify_uc_path
KuuCi May 17, 2024
f6680f5
moved back
KuuCi May 17, 2024
c579c94
debug
KuuCi May 17, 2024
9fd26bd
more debug
KuuCi May 17, 2024
986b1f8
clean up
KuuCi May 17, 2024
3adc789
pyright
KuuCi May 17, 2024
e5a0719
Merge branch 'main' into dbfs-hf
dakinggg Jun 5, 2024
37dbce3
generic error messaging
Jun 6, 2024
d6621f9
move NotFound up
Jun 6, 2024
d9a0c25
dbfs directory check
Jun 6, 2024
99ed590
precommit
Jun 7, 2024
35d2aaa
Merge branch 'main' into dbfs-hf
KuuCi Jun 14, 2024
6dcf61f
Merge branch 'main' into dbfs-hf
KuuCi Jun 14, 2024
d852cbb
Merge branch 'main' into dbfs-hf
KuuCi Jun 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 53 additions & 1 deletion llmfoundry/utils/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,24 @@ def _process_data_source(
elif 'hf_name' in dataset:
hf_path = dataset['hf_name']
backend, _, _ = parse_uri(hf_path)
if backend:
unsupported_file = True
if hf_path.startswith('dbfs:'):
KuuCi marked this conversation as resolved.
Show resolved Hide resolved
assert cfg_split
from llmfoundry.data.finetuning.tasks import SUPPORTED_EXTENSIONS
possible_files = [
f'{cfg_split}{ext}' for ext in SUPPORTED_EXTENSIONS
]
for file in possible_files:
path = os.path.join(hf_path[len('dbfs:'):], file)
KuuCi marked this conversation as resolved.
Show resolved Hide resolved
if _verify_uc_path(path):
data_paths.append(('uc_volume', path, true_split))
unsupported_file = False
break
if unsupported_file:
log.warning(
f'{hf_path} does not contain a supported file extension.',
)
elif backend:
hf_path = os.path.join(hf_path, cfg_split) if cfg_split else hf_path
data_paths.append((backend, hf_path, true_split))
elif os.path.exists(hf_path):
Expand Down Expand Up @@ -656,3 +673,38 @@ def _log_dataset_uri(cfg: Dict[str, Any]) -> None:
mlflow.log_input(
mlflow.data.meta_dataset.MetaDataset(source, name=split),
)


def _verify_uc_path(path: str) -> bool:
    """Verify a UC path exists.

    The path is probed with ``WorkspaceClient().files.get_metadata``. Any
    failure to set up the client is logged as a warning and reported as
    "does not exist" rather than raised, so callers can keep probing other
    candidate paths.

    Args:
        path (str): UnityCatalog path, passed unchanged to
            ``files.get_metadata``.

    Returns:
        (bool): ``True`` if the metadata lookup succeeds. ``False`` if
            ``databricks-sdk`` cannot be imported, if constructing the
            workspace client raises ``ValueError``, or if the lookup reports
            that the path does not exist.
    """
    try:
        # Imported locally so that `databricks-sdk` stays an optional
        # dependency of this module.
        from databricks.sdk import WorkspaceClient
        from databricks.sdk.errors import NotFound

        w = WorkspaceClient()
    except ImportError:
        log.warning(
            'Cannot verify the path of `UCVolumeDatasetSource` because of missing '
            '`databricks-sdk`. Please install `databricks-sdk` via '
            '`pip install -U databricks-sdk`. This does not block creating '
            '`UCVolumeDatasetSource`, but your `UCVolumeDatasetSource` might be invalid.',
        )
        return False
    except ValueError:
        log.warning(
            'Cannot verify the path of `UCVolumeDatasetSource` due to a connection failure '
            'with Databricks workspace. Please run `mlflow.login()` to log in to Databricks. '
            'This does not block creating `UCVolumeDatasetSource`, but your '
            '`UCVolumeDatasetSource` might be invalid.',
        )
        return False

    try:
        w.files.get_metadata(path)
    except (NotFound, AttributeError):
        # `NotFound` is how the SDK signals a missing file; without catching
        # it, a non-existent path would raise instead of returning False.
        # `AttributeError` is kept from the original handler -- presumably
        # for SDK versions whose client lacks `files.get_metadata`
        # (TODO confirm).
        return False
    return True
Loading