From b3d522d5cb959a27104ef34c68eff9ee42abe383 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lu=C3=ADs=20Gabriel=20Pereira=20Condados?=
 <lcondados@aren.ai>
Date: Wed, 15 May 2024 10:32:13 -0400
Subject: [PATCH 1/5] fix relative path in hubconf.py to get cfg_file.

---
 hubconf.py | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/hubconf.py b/hubconf.py
index 2b9e8e9..2b609ed 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -1,6 +1,7 @@
 dependencies = ['torch', 'torchvision']
 
 import torch
+import os
 try:
   from mmcv.utils import Config, DictAction
 except:
@@ -41,14 +42,15 @@ def metric3d_convnext_large(pretrain=False, **kwargs):
   Returns:
     model (nn.Module): a Metric3D model.
   '''
-  cfg_file = MODEL_TYPE['ConvNeXt-Large']['cfg_file']
+  dirname = os.path.dirname(__file__)
+  cfg_file = os.path.join(dirname, MODEL_TYPE['ConvNeXt-Large']['cfg_file'])
   ckpt_file = MODEL_TYPE['ConvNeXt-Large']['ckpt_file']
 
   cfg = Config.fromfile(cfg_file)
   model = get_configured_monodepth_model(cfg)
   if pretrain:
     model.load_state_dict(
-      torch.hub.load_state_dict_from_url(ckpt_file)['model_state_dict'], 
+      torch.hub.load_state_dict_from_url(ckpt_file)['model_state_dict'],
       strict=False,
     )
   return model
@@ -62,14 +64,15 @@ def metric3d_vit_small(pretrain=False, **kwargs):
   Returns:
     model (nn.Module): a Metric3D model.
   '''
-  cfg_file = MODEL_TYPE['ViT-Small']['cfg_file']
+  dirname = os.path.dirname(__file__)
+  cfg_file = os.path.join(dirname, MODEL_TYPE['ViT-Small']['cfg_file'])
   ckpt_file = MODEL_TYPE['ViT-Small']['ckpt_file']
 
   cfg = Config.fromfile(cfg_file)
   model = get_configured_monodepth_model(cfg)
   if pretrain:
     model.load_state_dict(
-      torch.hub.load_state_dict_from_url(ckpt_file)['model_state_dict'], 
+      torch.hub.load_state_dict_from_url(ckpt_file)['model_state_dict'],
       strict=False,
     )
   return model
@@ -83,14 +86,15 @@ def metric3d_vit_large(pretrain=False, **kwargs):
   Returns:
     model (nn.Module): a Metric3D model.
   '''
-  cfg_file = MODEL_TYPE['ViT-Large']['cfg_file']
+  dirname = os.path.dirname(__file__)
+  cfg_file = os.path.join(dirname, MODEL_TYPE['ViT-Large']['cfg_file'])
   ckpt_file = MODEL_TYPE['ViT-Large']['ckpt_file']
 
   cfg = Config.fromfile(cfg_file)
   model = get_configured_monodepth_model(cfg)
   if pretrain:
     model.load_state_dict(
-      torch.hub.load_state_dict_from_url(ckpt_file)['model_state_dict'], 
+      torch.hub.load_state_dict_from_url(ckpt_file)['model_state_dict'],
       strict=False,
     )
   return model
@@ -104,14 +108,15 @@ def metric3d_vit_giant2(pretrain=False, **kwargs):
   Returns:
     model (nn.Module): a Metric3D model.
   '''
-  cfg_file = MODEL_TYPE['ViT-giant2']['cfg_file']
+  dirname = os.path.dirname(__file__)
+  cfg_file = os.path.join(dirname, MODEL_TYPE['ViT-giant2']['cfg_file'])
   ckpt_file = MODEL_TYPE['ViT-giant2']['ckpt_file']
 
   cfg = Config.fromfile(cfg_file)
   model = get_configured_monodepth_model(cfg)
   if pretrain:
     model.load_state_dict(
-      torch.hub.load_state_dict_from_url(ckpt_file)['model_state_dict'], 
+      torch.hub.load_state_dict_from_url(ckpt_file)['model_state_dict'],
       strict=False,
     )
   return model
@@ -163,7 +168,7 @@ def metric3d_vit_giant2(pretrain=False, **kwargs):
   # un pad
   pred_depth = pred_depth.squeeze()
   pred_depth = pred_depth[pad_info[0] : pred_depth.shape[0] - pad_info[1], pad_info[2] : pred_depth.shape[1] - pad_info[3]]
-  
+
   # upsample to original size
   pred_depth = torch.nn.functional.interpolate(pred_depth[None, None, :, :], rgb_origin.shape[:2], mode='bilinear').squeeze()
   ###################### canonical camera space ######################
@@ -173,14 +178,14 @@ def metric3d_vit_giant2(pretrain=False, **kwargs):
   pred_depth = pred_depth * canonical_to_real_scale # now the depth is metric
   pred_depth = torch.clamp(pred_depth, 0, 300)
 
-  #### you can now do anything with the metric depth 
+  #### you can now do anything with the metric depth
   # such as evaluate predicted depth
   if depth_file is not None:
     gt_depth = cv2.imread(depth_file, -1)
     gt_depth = gt_depth / gt_depth_scale
     gt_depth = torch.from_numpy(gt_depth).float().cuda()
     assert gt_depth.shape == pred_depth.shape
-    
+
     mask = (gt_depth > 1e-8)
     abs_rel_err = (torch.abs(pred_depth[mask] - gt_depth[mask]) / gt_depth[mask]).mean()
     print('abs_rel_err:', abs_rel_err.item())
\ No newline at end of file

From ac4c20b2455e1e5131f85e34e77e1ae43d4a99f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lu=C3=ADs=20Gabriel=20Pereira=20Condados?=
 <lcondados@aren.ai>
Date: Wed, 15 May 2024 11:46:15 -0400
Subject: [PATCH 2/5] change autocast dtype from bfloat16 to float

---
 mono/model/decode_heads/RAFTDepthNormalDPTDecoder5.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mono/model/decode_heads/RAFTDepthNormalDPTDecoder5.py b/mono/model/decode_heads/RAFTDepthNormalDPTDecoder5.py
index 9af89f9..9bc0b63 100644
--- a/mono/model/decode_heads/RAFTDepthNormalDPTDecoder5.py
+++ b/mono/model/decode_heads/RAFTDepthNormalDPTDecoder5.py
@@ -216,7 +216,7 @@ def compute_depth_expectation(prob, depth_values):
     return depth
 
 def interpolate_float32(x, size=None, scale_factor=None, mode='nearest', align_corners=None):
-    with torch.autocast(device_type='cuda', dtype=torch.bfloat16, enabled=False):
+    with torch.autocast(device_type='cuda', dtype=torch.float, enabled=False):
         return F.interpolate(x.float(), size=size, scale_factor=scale_factor, mode=mode, align_corners=align_corners)
 
 # def upflow8(flow, mode='bilinear'):
@@ -225,7 +225,7 @@ def interpolate_float32(x, size=None, scale_factor=None, mode='nearest', align_c
 
 def upflow4(flow, mode='bilinear'):
     new_size = (4 * flow.shape[2], 4 * flow.shape[3])
-    with torch.autocast(device_type='cuda', dtype=torch.bfloat16, enabled=False):
+    with torch.autocast(device_type='cuda', dtype=torch.float, enabled=False):
         return  F.interpolate(flow, size=new_size, mode=mode, align_corners=True)
 
 def coords_grid(batch, ht, wd):

From b42a2f55d33d896c244a8f0384798a4713e48169 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lu=C3=ADs=20Gabriel=20Pereira=20Condados?=
 <lcondados@aren.ai>
Date: Wed, 15 May 2024 11:49:09 -0400
Subject: [PATCH 3/5] force higher depth ranges

---
 mono/configs/HourglassDecoder/convlarge.0.3_150.py | 2 +-
 mono/configs/HourglassDecoder/vit.raft5.giant2.py  | 4 ++--
 mono/configs/HourglassDecoder/vit.raft5.small.py   | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/mono/configs/HourglassDecoder/convlarge.0.3_150.py b/mono/configs/HourglassDecoder/convlarge.0.3_150.py
index 37b91c8..99664e3 100644
--- a/mono/configs/HourglassDecoder/convlarge.0.3_150.py
+++ b/mono/configs/HourglassDecoder/convlarge.0.3_150.py
@@ -17,7 +17,7 @@
         focal_length=1000.0,
     ),
     depth_range=(0, 1),
-    depth_normalize=(0.3, 150),
+    depth_normalize=(0.3, 500),
     crop_size = (544, 1216),
 ) 
 
diff --git a/mono/configs/HourglassDecoder/vit.raft5.giant2.py b/mono/configs/HourglassDecoder/vit.raft5.giant2.py
index caf9eb2..5af4e8e 100644
--- a/mono/configs/HourglassDecoder/vit.raft5.giant2.py
+++ b/mono/configs/HourglassDecoder/vit.raft5.giant2.py
@@ -15,7 +15,7 @@
 )
 
 
-max_value = 200
+max_value = 500
 # configs of the canonical space
 data_basic=dict(
     canonical_space = dict(
@@ -25,7 +25,7 @@
     depth_range=(0, 1),
     depth_normalize=(0.1, max_value),
     crop_size = (616, 1064),  # %28 = 0
-     clip_depth_range=(0.1, 200),
+     clip_depth_range=(0.1, 500),
     vit_size=(616,1064)
 ) 
 
diff --git a/mono/configs/HourglassDecoder/vit.raft5.small.py b/mono/configs/HourglassDecoder/vit.raft5.small.py
index 25eb68c..e61f222 100644
--- a/mono/configs/HourglassDecoder/vit.raft5.small.py
+++ b/mono/configs/HourglassDecoder/vit.raft5.small.py
@@ -15,7 +15,7 @@
 )
 
 
-max_value = 200
+max_value = 500
 # configs of the canonical space
 data_basic=dict(
     canonical_space = dict(
@@ -25,7 +25,7 @@
     depth_range=(0, 1),
     depth_normalize=(0.1, max_value),
     crop_size = (616, 1064),  # %28 = 0
-     clip_depth_range=(0.1, 200),
+     clip_depth_range=(0.1, 500),
     vit_size=(616,1064)
 ) 
 

From 595635b53f5727704dd11c3668dd76072453a170 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lu=C3=ADs=20Gabriel=20Pereira=20Condados?=
 <lcondados@aren.ai>
Date: Wed, 15 May 2024 11:53:48 -0400
Subject: [PATCH 4/5] force higher depth range

---
 mono/configs/HourglassDecoder/vit.raft5.large.py | 4 ++--
 mono/configs/HourglassDecoder/vit.raft5.small.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mono/configs/HourglassDecoder/vit.raft5.large.py b/mono/configs/HourglassDecoder/vit.raft5.large.py
index 4febdcb..9000cee 100644
--- a/mono/configs/HourglassDecoder/vit.raft5.large.py
+++ b/mono/configs/HourglassDecoder/vit.raft5.large.py
@@ -15,7 +15,7 @@
 )
 
 
-max_value = 200
+max_value = 1000
 # configs of the canonical space
 data_basic=dict(
     canonical_space = dict(
@@ -25,7 +25,7 @@
     depth_range=(0, 1),
     depth_normalize=(0.1, max_value),
     crop_size = (616, 1064),  # %28 = 0
-     clip_depth_range=(0.1, 200),
+     clip_depth_range=(0.1, 1000),
     vit_size=(616,1064)
 ) 
 
diff --git a/mono/configs/HourglassDecoder/vit.raft5.small.py b/mono/configs/HourglassDecoder/vit.raft5.small.py
index e61f222..35d9c9b 100644
--- a/mono/configs/HourglassDecoder/vit.raft5.small.py
+++ b/mono/configs/HourglassDecoder/vit.raft5.small.py
@@ -15,7 +15,7 @@
 )
 
 
-max_value = 500
+max_value = 1000
 # configs of the canonical space
 data_basic=dict(
     canonical_space = dict(
@@ -25,7 +25,7 @@
     depth_range=(0, 1),
     depth_normalize=(0.1, max_value),
     crop_size = (616, 1064),  # %28 = 0
-     clip_depth_range=(0.1, 500),
+     clip_depth_range=(0.1, 1000),
     vit_size=(616,1064)
 ) 
 

From 95b91c5acf8d2f125789d7c331c14e9b8b24cc10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lu=C3=ADs=20Gabriel=20Pereira=20Condados?=
 <lcondados@aren.ai>
Date: Wed, 15 May 2024 11:55:49 -0400
Subject: [PATCH 5/5] restore original depth ranges in vit.raft5 large/small configs

---
 mono/configs/HourglassDecoder/vit.raft5.large.py | 4 ++--
 mono/configs/HourglassDecoder/vit.raft5.small.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mono/configs/HourglassDecoder/vit.raft5.large.py b/mono/configs/HourglassDecoder/vit.raft5.large.py
index 9000cee..4febdcb 100644
--- a/mono/configs/HourglassDecoder/vit.raft5.large.py
+++ b/mono/configs/HourglassDecoder/vit.raft5.large.py
@@ -15,7 +15,7 @@
 )
 
 
-max_value = 1000
+max_value = 200
 # configs of the canonical space
 data_basic=dict(
     canonical_space = dict(
@@ -25,7 +25,7 @@
     depth_range=(0, 1),
     depth_normalize=(0.1, max_value),
     crop_size = (616, 1064),  # %28 = 0
-     clip_depth_range=(0.1, 1000),
+     clip_depth_range=(0.1, 200),
     vit_size=(616,1064)
 ) 
 
diff --git a/mono/configs/HourglassDecoder/vit.raft5.small.py b/mono/configs/HourglassDecoder/vit.raft5.small.py
index 35d9c9b..25eb68c 100644
--- a/mono/configs/HourglassDecoder/vit.raft5.small.py
+++ b/mono/configs/HourglassDecoder/vit.raft5.small.py
@@ -15,7 +15,7 @@
 )
 
 
-max_value = 1000
+max_value = 200
 # configs of the canonical space
 data_basic=dict(
     canonical_space = dict(
@@ -25,7 +25,7 @@
     depth_range=(0, 1),
     depth_normalize=(0.1, max_value),
     crop_size = (616, 1064),  # %28 = 0
-     clip_depth_range=(0.1, 1000),
+     clip_depth_range=(0.1, 200),
     vit_size=(616,1064)
 )