
Update SAM AMG README with more descriptive install instructions (#1337)
philipbutler authored Dec 9, 2024
1 parent 23db9bf commit a6f8676
Showing 4 changed files with 63 additions and 43 deletions.
22 changes: 21 additions & 1 deletion examples/sam2_amg_server/README.md
@@ -41,12 +41,32 @@ The 'ao' mode is a copy of the baseline with modifications to make the code more
 ### 0. Download checkpoints and install requirements
 
 ```
-pip install -r requirements.txt
+# From the top-level "ao" directory
+# If necessary, create and activate a virtual environment
+# Ex:
+python -m venv venv && source venv/bin/activate
+# Install requirements for this example
+pip install -r examples/sam2_amg_server/requirements.txt
+# If you have an older version of torch in your current environment, uninstall it first
+pip uninstall torch
+# Install torch nightly
+pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124
+# Build ao from source for now
+python setup.py develop
+# On your mark, get set...
+cd examples/sam2_amg_server/
 ```

 Download `sam2.1_hiera_large.pt` from https://github.com/facebookresearch/sam2?tab=readme-ov-file#download-checkpoints and put it into `~/checkpoints/sam2`
 
 ### 1. Create a random subset of 1000 images
 Using images with corresponding mask annotations, such as those from the Segment Anything Video (SA-V) [Dataset](https://github.com/facebookresearch/sam2/tree/main/sav_dataset#download-the-dataset), is suggested so you can later measure any drop in accuracy when using `--furious` (which uses `torch.float16`).
 ```
 find sav_val -type f > sav_val_image_paths
 shuf -n 1000 sav_val_image_paths > sav_val_image_paths_shuf_1000
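
As a quick sanity check on the install block above, you can confirm that the nightly torch build and the source-built torchao are the ones actually importable in the environment (a minimal sketch; the printed version strings will vary by nightly):

```
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
python -c "import torchao; print(torchao.__file__)"
```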
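The checkpoint step above can also be scripted. A sketch, assuming the direct download URL from the sam2 release (the URL is not part of this commit and may change, so fall back to the linked checkpoints page if it fails):

```
mkdir -p ~/checkpoints/sam2
wget -P ~/checkpoints/sam2 https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt
```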
28 changes: 14 additions & 14 deletions torchao/_models/sam2/configs/sam2/sam2_hiera_b+.yaml
@@ -2,18 +2,18 @@
 
 # Model
 model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
+  _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base
   image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+    _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder
     scalp: 1
     trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
+      _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera
       embed_dim: 112
       num_heads: 2
     neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+      _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine
         num_pos_feats: 256
         normalize: true
         scale: null
@@ -24,17 +24,17 @@ model:
       fpn_interp_model: nearest
 
   memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
+    _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention
     d_model: 256
     pos_enc_at_input: true
     layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+      _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer
       activation: relu
       dim_feedforward: 2048
       dropout: 0.1
       pos_enc_at_attn: false
       self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention
         rope_theta: 10000.0
         feat_sizes: [32, 32]
         embedding_dim: 256
@@ -45,7 +45,7 @@ model:
       pos_enc_at_cross_attn_keys: true
       pos_enc_at_cross_attn_queries: false
       cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention
         rope_theta: 10000.0
         feat_sizes: [32, 32]
         rope_k_repeat: True
@@ -57,23 +57,23 @@ model:
     num_layers: 4
 
   memory_encoder:
-    _target_: sam2.modeling.memory_encoder.MemoryEncoder
+    _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder
     out_dim: 64
     position_encoding:
-      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+      _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine
       num_pos_feats: 64
       normalize: true
       scale: null
       temperature: 10000
     mask_downsampler:
-      _target_: sam2.modeling.memory_encoder.MaskDownSampler
+      _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler
       kernel_size: 3
       stride: 2
       padding: 1
     fuser:
-      _target_: sam2.modeling.memory_encoder.Fuser
+      _target_: torchao._models.sam2.modeling.memory_encoder.Fuser
       layer:
-        _target_: sam2.modeling.memory_encoder.CXBlock
+        _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock
         dim: 256
         kernel_size: 7
         padding: 3
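This file and the `sam2_hiera_s.yaml` and `sam2_hiera_t.yaml` diffs below all make the same change: each Hydra `_target_` is retargeted from the standalone `sam2` package to the copy vendored under `torchao._models.sam2`. Hydra resolves `_target_` values as dotted import paths when it instantiates the model, so the new paths must be importable; a minimal check (assuming torchao was built from source as in the README steps above):

```
python -c "from torchao._models.sam2.modeling.sam2_base import SAM2Base; print('vendored _target_ paths resolve')"
```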
28 changes: 14 additions & 14 deletions torchao/_models/sam2/configs/sam2/sam2_hiera_s.yaml
@@ -2,21 +2,21 @@
 
 # Model
 model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
+  _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base
   image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+    _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder
     scalp: 1
     trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
+      _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera
       embed_dim: 96
       num_heads: 1
       stages: [1, 2, 11, 2]
       global_att_blocks: [7, 10, 13]
       window_pos_embed_bkg_spatial_size: [7, 7]
     neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+      _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine
         num_pos_feats: 256
         normalize: true
         scale: null
@@ -27,17 +27,17 @@ model:
       fpn_interp_model: nearest
 
   memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
+    _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention
     d_model: 256
     pos_enc_at_input: true
     layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+      _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer
       activation: relu
       dim_feedforward: 2048
       dropout: 0.1
       pos_enc_at_attn: false
       self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention
         rope_theta: 10000.0
         feat_sizes: [32, 32]
         embedding_dim: 256
@@ -48,7 +48,7 @@ model:
       pos_enc_at_cross_attn_keys: true
       pos_enc_at_cross_attn_queries: false
       cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention
         rope_theta: 10000.0
         feat_sizes: [32, 32]
         rope_k_repeat: True
@@ -60,23 +60,23 @@ model:
     num_layers: 4
 
   memory_encoder:
-    _target_: sam2.modeling.memory_encoder.MemoryEncoder
+    _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder
     out_dim: 64
     position_encoding:
-      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+      _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine
       num_pos_feats: 64
       normalize: true
      scale: null
       temperature: 10000
     mask_downsampler:
-      _target_: sam2.modeling.memory_encoder.MaskDownSampler
+      _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler
       kernel_size: 3
       stride: 2
       padding: 1
     fuser:
-      _target_: sam2.modeling.memory_encoder.Fuser
+      _target_: torchao._models.sam2.modeling.memory_encoder.Fuser
       layer:
-        _target_: sam2.modeling.memory_encoder.CXBlock
+        _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock
         dim: 256
         kernel_size: 7
         padding: 3
28 changes: 14 additions & 14 deletions torchao/_models/sam2/configs/sam2/sam2_hiera_t.yaml
@@ -2,21 +2,21 @@
 
 # Model
 model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
+  _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base
   image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
+    _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder
     scalp: 1
     trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
+      _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera
       embed_dim: 96
       num_heads: 1
       stages: [1, 2, 7, 2]
       global_att_blocks: [5, 7, 9]
       window_pos_embed_bkg_spatial_size: [7, 7]
     neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
+      _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+        _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine
         num_pos_feats: 256
         normalize: true
         scale: null
@@ -27,17 +27,17 @@ model:
       fpn_interp_model: nearest
 
   memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
+    _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention
     d_model: 256
     pos_enc_at_input: true
     layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
+      _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer
       activation: relu
       dim_feedforward: 2048
       dropout: 0.1
       pos_enc_at_attn: false
       self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention
         rope_theta: 10000.0
         feat_sizes: [32, 32]
         embedding_dim: 256
@@ -48,7 +48,7 @@ model:
       pos_enc_at_cross_attn_keys: true
       pos_enc_at_cross_attn_queries: false
       cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
+        _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention
         rope_theta: 10000.0
         feat_sizes: [32, 32]
         rope_k_repeat: True
@@ -60,23 +60,23 @@ model:
     num_layers: 4
 
   memory_encoder:
-    _target_: sam2.modeling.memory_encoder.MemoryEncoder
+    _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder
     out_dim: 64
     position_encoding:
-      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
+      _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine
       num_pos_feats: 64
       normalize: true
      scale: null
       temperature: 10000
     mask_downsampler:
-      _target_: sam2.modeling.memory_encoder.MaskDownSampler
+      _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler
       kernel_size: 3
       stride: 2
       padding: 1
     fuser:
-      _target_: sam2.modeling.memory_encoder.Fuser
+      _target_: torchao._models.sam2.modeling.memory_encoder.Fuser
       layer:
-        _target_: sam2.modeling.memory_encoder.CXBlock
+        _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock
         dim: 256
         kernel_size: 7
         padding: 3
