Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP - CrossCLR full code #3

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Authors:


## Update

##### [06 Feb 2022] CrossCLR code released
##### [Dec 2021] CrossCLR-onlyIntraModality released
## Loss Function
The loss function [`CrossCLR`](https://github.com/amazon-research/crossmodal-contrastive-learning) in `loss.py` takes `video features` and `text features` as input, and return the loss.
Expand Down Expand Up @@ -65,6 +65,8 @@ loss = criterion(video_features, text_features)

See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.

## Acknowledgements
Part of this code is inspired by Simon Ging's COOT implementation [[Code](https://github.com/gingsi/coot-videotext)].
## License

This project is licensed under the Apache-2.0 License.
Expand Down
159 changes: 159 additions & 0 deletions config/lsmdc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
description: "experiment with 100m features."
random_seed: null
config_type: "ret"
training:
debug_size: 11000 #Debugging will be deactivated if set it to a number bigger that 10000!
loss_func: 'contrastive'
batch_size: 96 #64
num_epochs: 20
compute_clip_retrieval: true

cross_clr_config:
temperature: 0.03
temperature_weights: 0.0055
score_thrshold: 1
negative_weight: 0.65 #0.8
queue_size: 10000

contrastive_loss_config:
margin: 0.1
weight_high: 0.5 #1.0
weight_high_cluster: 0 #0.1
weight_low: 1
weight_low_cluster: 0 #.2
weight_context: 0 #0.7 #1.0
weight_context_cluster: 0.0
loss_cycle_cons: 0 #0.001
val:
debug_size: 1000 #0 #00 #000 #500 #Debugging will be deactivated if set it to a number bigger that 10000!
batch_size: 64
val_freq: 1
val_start: 2 #0
val_clips: true
val_clips_freq: 1
det_best_field: "val_clip_score_at_1"
det_best_compare_mode: "max"
det_best_threshold_mode: "rel"
det_best_threshold_value: 1e-4
det_best_terminate_after: 16
test:
debug_size: 11000 #000 #Debugging will be deactivated if set it to a number bigger that 10000!
batch_size: 64
test_freq: 1
test_start: 1 #0
test_clips: true
test_clips_freq: 1


video_local:
name: transformer
output_dim: 512
input_fc: true
input_fc_output_dim: 512
selfatn_config:
hidden_dim: 512
num_layers: 1
num_heads: 4
pointwise_ff_dim: 512
activation: "gelu"
dropout: 0.01
norm: "layernorm_coot"
use_context: true
use_subspace: false #deactivated
pooler_config:
name: max #atn #avg #max #atn
hidden_dim: 512
num_heads: 4
num_layers: 1
dropout: 0.01
activation: "gelu"
weight_init_type: "truncnorm"
weight_init_std: 0.01
video_global:
name: transformer
# copy all settings from the local network
same_as: "video_local"
output_dim: 512
input_fc: false
use_context: true
use_subspace: false
crossatn_config:
hidden_dim: 512
num_layers: 1
num_heads: 8
pointwise_ff_dim: 512
activation: "gelu"
dropout: 0.01
norm: "layernorm_coot"
pooler_config:
name: "avg"

text_local:
same_as: "video_local"
text_global:
same_as: "video_global"
dataset:
name: lsmdc16
modality_feat_name_a: action,scene,appearance,audio,howto100m_finetune,object,flow #list modality names without space options: mmt or any combination of [action, scene, flow, ...]
modality_feat_name_b: "text" #_feat_youcook2_meta_all_transformers_bert-base-uncased_-2,-1"
train_split: train
val_split: val #test
test_split: test1k
min_frames: 1
use_clips: true
max_frames: 4
feat_agg_dim: 2048 #1024 #512 # Split features and add them along a new dimension
mmt_feat_dim: 1024 #1024
action_feat_dim: 2048 #1024
scene_feat_dim: 2208 #1024
appearance_feat_dim: 2048 #1024
flow_feat_dim: 529 #1024
object_feat_dim: 256 #1024
audio_feat_dim: 2048
howto100m_finetune_feat_dim: 512
text_feat_dim: 1536
frames_noise: 0
word_emb: bert
# video feature loading
add_stop_frame: 2
expand_segments: 0
# technical dataloading details
preload_data: true
pin_memory: true
num_workers: 4
drop_last: false
optimizer:
name: radam
lr: 0.0005
weight_decay: 0 #3.0e-05 #0
weight_decay_for_bias: true
lr_decay_mult: false
momentum: 0.56
adam_beta2: 0.98
adam_eps: 1.5e-09 #1.5e-09
scheduler:
patience: 6
cooldown: 4
warmup: 4

# ---------- Logging / Saving ----------
logging:
step_train: 10
step_val: 10
step_test: 10 #if -1 then ignore testing
step_gpu: -1
step_gpu_once: 10
saving:
keep_freq: -1
save_last: true
save_best: true
save_opt_state: true
# ---------- Technical PyTorch settings ----------
use_cuda: true
use_multi_gpu: true
cudnn_enabled: true
cudnn_benchmark: true
cudnn_deterministic: false
cuda_non_blocking: true
fp16_train: true
fp16_val: true
Loading