Set ft dataloader name explicitly (#1187)
milocress authored May 9, 2024
1 parent 139abab commit 51d0d09
Showing 6 changed files with 6 additions and 23 deletions.
3 changes: 1 addition & 2 deletions llmfoundry/data/finetuning/dataloader.py
@@ -53,7 +53,6 @@ def build_finetuning_dataloader(
     pin_memory: bool = True,
     prefetch_factor: int = 2,
     persistent_workers: bool = True,
-    name: Optional[str] = None,
     timeout: int = 0,
 ) -> DataSpec:
     """Builds a finetuning dataloader for training or evaluating.
@@ -161,7 +160,7 @@ def build_finetuning_dataloader(
 
     # this full config is necessary for properly profiling the packing ratio
     dataloader_cfg = {
-        'name': name,
+        'name': 'finetuning',
         'dataset': dataset_cfg,
         'drop_last': drop_last,
         'num_workers': num_workers,
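Taken together, the two hunks above remove `name` from the builder's signature and hardcode the internal profiling config's label to `'finetuning'`. A minimal sketch of the post-change calling pattern follows; it assumes the builder accepts the remaining config keys (dataset, drop_last, num_workers, ...) as keyword arguments in the same shape the test dictionaries below use, and is illustrative rather than verbatim repository code.

# Illustrative sketch only: kwarg names mirror the test configs below and the
# signature fragment above; the dataset values are assumptions, not repo code.
from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
from llmfoundry.utils.builders import build_tokenizer

cfg = {
    # 'name': 'finetuning',  # no longer passed; the builder now sets it itself
    'dataset': {
        'hf_name': 'tatsu-lab/alpaca',
        'split': 'train',
        'max_seq_len': 2048,
        'decoder_only_format': True,
    },
    'drop_last': False,
    'num_workers': 0,
}

tokenizer = build_tokenizer('gpt2', {})
data_spec = build_finetuning_dataloader(
    tokenizer=tokenizer,
    device_batch_size=1,
    **cfg,
)
dataloader = data_spec.dataloader  # the returned DataSpec wraps the torch DataLoader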
10 changes: 4 additions & 6 deletions llmfoundry/utils/config_utils.py
@@ -500,15 +500,13 @@ def log_config(cfg: Dict[str, Any]) -> None:
         config with different variables.
     """
     print(om.to_yaml(cfg))
-    if 'wandb' in cfg.get('loggers', {}):
-        try:
-            import wandb
-        except ImportError as e:
-            raise e
+    loggers = cfg.get('loggers', None) or {}
+    if 'wandb' in loggers:
+        import wandb
         if wandb.run:
             wandb.config.update(cfg)
 
-    if 'mlflow' in cfg.get('loggers', {}) and mlflow.active_run():
+    if 'mlflow' in loggers and mlflow.active_run():
         mlflow.log_params(params=om.to_container(cfg, resolve=True))
         _log_dataset_uri(cfg)

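The `log_config` rework also changes how the loggers section is read: `cfg.get('loggers', {})` still returns `None` when the config contains an explicit `loggers: null`, whereas the new `cfg.get('loggers', None) or {}` normalizes that case before the `'wandb'`/`'mlflow'` membership checks. A small self-contained illustration with plain dicts (not the library call):

# Why the `or {}` guard matters: a key that is present but None defeats
# dict.get's default, and `'wandb' in None` raises a TypeError.
cfg = {'loggers': None}

old_style = cfg.get('loggers', {})           # -> None
new_style = cfg.get('loggers', None) or {}   # -> {}

print('wandb' in new_style)                  # False, no exception
# print('wandb' in old_style)                # would raise TypeError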
1 change: 1 addition & 0 deletions tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -849,6 +849,7 @@ def test_huggingface_conversion_callback(
         tokenizer_kwargs={'model_max_length': max_seq_len},
     )
 
+    dataloader_cfg.pop('name')
     train_dataloader = build_finetuning_dataloader(
         tokenizer=tokenizer,
         device_batch_size=device_batch_size,
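Since the builder no longer accepts `name`, this test drops the key from its existing config dict before splatting the rest into `build_finetuning_dataloader`. A tiny sketch of that migration pattern, with assumed (not verbatim) config contents:

# Hypothetical config dict still written in the old registry style.
dataloader_cfg = {
    'name': 'finetuning',   # leftover registry key
    'dataset': {'hf_name': 'tatsu-lab/alpaca', 'split': 'train'},
    'drop_last': False,
    'num_workers': 0,
}

dataloader_cfg.pop('name')  # drop it before splatting; `name` is no longer a parameter
assert 'name' not in dataloader_cfg
# ...then: build_finetuning_dataloader(tokenizer=..., device_batch_size=..., **dataloader_cfg)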
12 changes: 0 additions & 12 deletions tests/data/test_dataloader.py
@@ -318,7 +318,6 @@ def test_invalid_jsonl_data():
     packing_ratio = 'auto'
     allow_pad_trimming = False
     cfg = {
-        'name': 'finetuning',
         'dataset': {
             'hf_name': 'iamroot/chat_malformatted_examples',
             'split': 'train',
@@ -374,7 +373,6 @@ def test_finetuning_dataloader(
     max_seq_len = 2048 if decoder_only_format else 1024
 
     cfg = {
-        'name': 'finetuning',
         'dataset': {
             'hf_name':
                 'iamroot/chat_formatted_examples' if use_chat_formatting else
@@ -444,7 +442,6 @@ def test_finetuning_dataloader_safe_load(
     # Clear the folder
     shutil.rmtree(DOWNLOADED_FT_DATASETS_DIRPATH, ignore_errors=True)
     cfg = DictConfig({
-        'name': 'finetuning',
         'dataset': {
             'hf_name': hf_name,
             'split': 'train',
@@ -507,7 +504,6 @@ def test_finetuning_dataloader_small_data(
     )
 
     cfg = {
-        'name': 'finetuning',
         'dataset': {
             'hf_name': tiny_dataset_folder_path,
             'split': 'train',
@@ -564,7 +560,6 @@ def test_finetuning_dataloader_custom_split(tmp_path: pathlib.Path, split: str):
     make_tiny_ft_dataset(path=tiny_dataset_path, size=16)
 
     cfg = {
-        'name': 'finetuning',
         'dataset': {
             'hf_name': tiny_dataset_folder_path,
             'split': split,
@@ -611,7 +606,6 @@ def test_finetuning_dataloader_custom_split_remote(split: str):
     max_seq_len = 2048
 
     cfg = {
-        'name': 'finetuning',
         'dataset': {
             'hf_name': 's3://test-bucket/path/to/data',
             'split': split,
@@ -696,7 +690,6 @@ def test_finetuning_dataloader_streaming(
     }
 
     cfg = {
-        'name': 'finetuning',
         'dataset': {
             'max_seq_len': 2048,
             'decoder_only_format': True,
@@ -889,7 +882,6 @@ def test_malformed_data(
     )
 
     cfg = {
-        'name': 'finetuning',
         'dataset': {
             'hf_name': str(tiny_dataset_folder_path),
             'split': 'train',
@@ -999,7 +991,6 @@ def test_malformed_conversation_data(
     )
 
     cfg = {
-        'name': 'finetuning',
         'dataset': {
             'hf_name': str(tiny_dataset_folder_path),
             'split': 'train',
@@ -1238,7 +1229,6 @@ def test_token_counting_func_dataloader_setting(
 
     if dataloader_type == 'finetuning-hf':
         cfg = DictConfig({
-            'name': 'finetuning',
             'dataset': {
                 'hf_name': 'dummy-path',
                 'split': 'train',
@@ -1262,7 +1252,6 @@ def test_token_counting_func_dataloader_setting(
         )
     elif dataloader_type == 'finetuning-streaming':
         cfg = DictConfig({
-            'name': 'finetuning',
             'dataset': {
                 'remote': 'dummy-path',
                 'local': 'dummy-path',
@@ -1378,7 +1367,6 @@ def test_sharegpt_format(
     )
 
     cfg = {
-        'name': 'finetuning',
         'dataset': {
             'hf_name': str(tiny_dataset_folder_path),
             'preprocessing_fn': 'teknium/OpenHermes-2.5',
2 changes: 0 additions & 2 deletions tests/data/test_packing.py
@@ -177,7 +177,6 @@ def test_auto_packing_with_streaming_dataloader(tmp_path: Path):
     with MDSWriter(out=remote_dir, columns=columns, compression=None) as out:
         out.write({'prompt': 'HELLO', 'response': 'WORLD'})
     cfg = DictConfig({
-        'name': 'finetuning',
         'dataset': {
             'remote': remote_dir,
             'local': local_dir,
@@ -218,7 +217,6 @@ def test_packing_with_dataloader(packing_ratio: Any):
     reproducibility.seed_all(17)
     tokenizer = build_tokenizer('gpt2', {})
     cfg = {
-        'name': 'finetuning',
         'dataset': {
             'hf_name': 'tatsu-lab/alpaca',
             'split': 'train',
1 change: 0 additions & 1 deletion tests/fixtures/data.py
@@ -34,7 +34,6 @@ def tiny_ft_dataloader(
     device_batch_size: int = 1,
 ) -> DataLoader:
     dataloader_cfg = DictConfig({
-        'name': 'finetuning',
         'dataset': {
             'hf_name': str(tiny_ft_dataset_path),
             'split': 'train',
