amazon-science · mzolfaghari · Feb 6, 2022 · Feb 6, 2022 · Feb 6, 2022 · Feb 6, 2022
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ Authors:
 
 
 ## Update
-
+##### [06 Feb 2022] CrossCLR code released
 ##### [Dec 2021] CrossCLR-onlyIntraModality released
 ## Loss Function
 The loss function [`CrossCLR`](https://github.com/amazon-research/crossmodal-contrastive-learning) in `loss.py` takes `video features`  and `text features` as input, and return the loss. 
@@ -65,6 +65,8 @@ loss = criterion(video_features, text_features)
 
 See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
 
+## Acknowledgements
+Part of this code is inspired by Simon Ging's COOT implementation [[Code](https://github.com/gingsi/coot-videotext)].
 ## License
 
 This project is licensed under the Apache-2.0 License.

diff --git a/config/lsmdc.yaml b/config/lsmdc.yaml
@@ -0,0 +1,159 @@
+description: "experiment with 100m features."
+random_seed: null
+config_type: "ret"
+training:
+    debug_size: 11000 #Debugging will be deactivated if set it to a number bigger that 10000!
+    loss_func: 'contrastive'
+    batch_size: 96 #64
+    num_epochs: 20
+    compute_clip_retrieval: true
+
+    cross_clr_config:
+        temperature: 0.03
+        temperature_weights: 0.0055
+        score_thrshold: 1
+        negative_weight: 0.65 #0.8
+        queue_size: 10000
+
+    contrastive_loss_config:
+        margin: 0.1
+        weight_high: 0.5 #1.0
+        weight_high_cluster: 0 #0.1
+        weight_low: 1
+        weight_low_cluster: 0 #.2
+        weight_context: 0 #0.7 #1.0
+        weight_context_cluster: 0.0  
+    loss_cycle_cons: 0 #0.001
+val:
+    debug_size: 1000 #0 #00 #000 #500 #Debugging will be deactivated if set it to a number bigger that 10000!
+    batch_size: 64
+    val_freq: 1
+    val_start: 2 #0
+    val_clips: true
+    val_clips_freq: 1
+    det_best_field: "val_clip_score_at_1"
+    det_best_compare_mode: "max"
+    det_best_threshold_mode: "rel"
+    det_best_threshold_value: 1e-4
+    det_best_terminate_after: 16
+test:
+    debug_size: 11000 #000 #Debugging will be deactivated if set it to a number bigger that 10000!
+    batch_size: 64
+    test_freq: 1
+    test_start: 1 #0
+    test_clips: true
+    test_clips_freq: 1
+
+
+video_local:
+    name: transformer
+    output_dim: 512
+    input_fc: true
+    input_fc_output_dim: 512
+    selfatn_config:
+        hidden_dim: 512
+        num_layers: 1
+        num_heads: 4
+        pointwise_ff_dim: 512
+        activation: "gelu"
+        dropout: 0.01
+        norm: "layernorm_coot"
+    use_context: true
+    use_subspace: false #deactivated
+    pooler_config:
+        name: max #atn #avg #max #atn
+        hidden_dim: 512
+        num_heads: 4
+        num_layers: 1
+        dropout: 0.01
+        activation: "gelu"
+    weight_init_type: "truncnorm"
+    weight_init_std: 0.01
+video_global:
+    name: transformer
+    # copy all settings from the local network
+    same_as: "video_local"
+    output_dim: 512
+    input_fc: false
+    use_context: true
+    use_subspace: false
+    crossatn_config:
+        hidden_dim: 512
+        num_layers: 1
+        num_heads: 8
+        pointwise_ff_dim: 512
+        activation: "gelu"
+        dropout: 0.01
+        norm: "layernorm_coot"
+    pooler_config:
+        name: "avg"
+
+text_local:
+    same_as: "video_local"
+text_global:
+    same_as: "video_global"
+dataset:
+    name: lsmdc16
+    modality_feat_name_a: action,scene,appearance,audio,howto100m_finetune,object,flow #list modality names without space options: mmt or any combination of [action, scene, flow, ...]
+    modality_feat_name_b: "text" #_feat_youcook2_meta_all_transformers_bert-base-uncased_-2,-1"
+    train_split: train
+    val_split: val #test 
+    test_split: test1k
+    min_frames: 1
+    use_clips: true
+    max_frames: 4
+    feat_agg_dim: 2048 #1024 #512 # Split features and add them along a new dimension
+    mmt_feat_dim: 1024 #1024
+    action_feat_dim: 2048 #1024
+    scene_feat_dim: 2208 #1024
+    appearance_feat_dim: 2048 #1024
+    flow_feat_dim: 529 #1024
+    object_feat_dim: 256 #1024
+    audio_feat_dim: 2048
+    howto100m_finetune_feat_dim: 512
+    text_feat_dim: 1536
+    frames_noise: 0
+    word_emb: bert
+    # video feature loading
+    add_stop_frame: 2
+    expand_segments: 0
+    # technical dataloading details
+    preload_data: true
+    pin_memory: true
+    num_workers: 4
+    drop_last: false
+optimizer:
+    name: radam
+    lr: 0.0005
+    weight_decay: 0 #3.0e-05 #0
+    weight_decay_for_bias: true
+    lr_decay_mult: false
+    momentum: 0.56
+    adam_beta2: 0.98
+    adam_eps: 1.5e-09 #1.5e-09
+scheduler:
+    patience: 6
+    cooldown: 4
+    warmup: 4
+
+# ---------- Logging / Saving ----------
+logging:
+    step_train: 10
+    step_val: 10
+    step_test: 10 #if -1 then ignore testing
+    step_gpu: -1
+    step_gpu_once: 10
+saving:
+    keep_freq: -1
+    save_last: true
+    save_best: true
+    save_opt_state: true
+# ---------- Technical PyTorch settings ----------
+use_cuda: true
+use_multi_gpu: true
+cudnn_enabled: true
+cudnn_benchmark: true
+cudnn_deterministic: false
+cuda_non_blocking: true
+fp16_train: true
+fp16_val: true