Merge pull request #10 from andreeaiana/feat_save_recommendations

Feat: save recommendations list
andreeaiana · Mar 19, 2024 · 1ac532e · 1ac532e
2 parents 3fce487 + 1b33d38
commit 1ac532e
Show file tree

Hide file tree

Showing 35 changed files with 636 additions and 46 deletions.
diff --git a/configs/model/caum.yaml b/configs/model/caum.yaml
@@ -47,12 +47,16 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # evaluation
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer:
   _target_: torch.optim.Adam

diff --git a/configs/model/cen_news_rec.yaml b/configs/model/cen_news_rec.yaml
@@ -41,12 +41,16 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # evaluation
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer:
   _target_: torch.optim.Adam

diff --git a/configs/model/dkn.yaml b/configs/model/dkn.yaml
@@ -33,12 +33,16 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # evaluation
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer:
   _target_: torch.optim.Adam

diff --git a/configs/model/lstur.yaml b/configs/model/lstur.yaml
@@ -43,12 +43,16 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # evaluation
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer:
   _target_: torch.optim.Adam

diff --git a/configs/model/manner_cr_module.yaml b/configs/model/manner_cr_module.yaml
@@ -34,12 +34,16 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # evaluation
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer:
   _target_: torch.optim.Adam

diff --git a/configs/model/manner_module.yaml b/configs/model/manner_module.yaml
@@ -18,12 +18,16 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # evaluation
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer: null
 scheduler: null
diff --git a/configs/model/miner.yaml b/configs/model/miner.yaml
@@ -39,12 +39,16 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # evaluation
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer:
   _target_: torch.optim.Adam

diff --git a/configs/model/mins.yaml b/configs/model/mins.yaml
@@ -40,12 +40,16 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # evaluation
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer:
   _target_: torch.optim.Adam

diff --git a/configs/model/naml.yaml b/configs/model/naml.yaml
@@ -38,12 +38,16 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # evaluation
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer:
   _target_: torch.optim.Adam

diff --git a/configs/model/npa.yaml b/configs/model/npa.yaml
@@ -32,12 +32,16 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # evaluation
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer:
   _target_: torch.optim.Adam

diff --git a/configs/model/nrms.yaml b/configs/model/nrms.yaml
@@ -35,12 +35,16 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # evaluation
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer:
   _target_: torch.optim.Adam

diff --git a/configs/model/senti_debias.yaml b/configs/model/senti_debias.yaml
@@ -45,6 +45,8 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # loss coefficients
@@ -55,6 +57,8 @@ beta_coefficient: 10
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer: null
 

diff --git a/configs/model/sentirec.yaml b/configs/model/sentirec.yaml
@@ -41,12 +41,16 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # evaluation
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer:
   _target_: torch.optim.Adam

diff --git a/configs/model/tanr.yaml b/configs/model/tanr.yaml
@@ -40,12 +40,16 @@ outputs:
       "target_sentiments",
       "hist_categories",
       "hist_sentiments",
+      "user_ids",
+      "cand_news_ids",
     ]
 
 # evaluation
 top_k_list: [5, 10]
 num_categ_classes: 18
 num_sent_classes: 3
+save_recs: False
+recs_fpath: "${paths.output_dir}/recommendations.json"
 
 optimizer:
   _target_: torch.optim.Adam

diff --git a/newsreclib/data/components/adressa_dataframe.py b/newsreclib/data/components/adressa_dataframe.py
@@ -548,7 +548,7 @@ def _load_behaviors(self) -> pd.DataFrame:
             log.info("Mapping uid to index.")
             behaviors["user"] = behaviors["uid"].apply(lambda x: uid2index.get(x, 0))
 
-            behaviors = behaviors[["user", "history", "candidates", "labels"]]
+            behaviors = behaviors[["uid", "user", "history", "candidates", "labels"]]
 
             # cache processed data
             log.info(
@@ -560,7 +560,7 @@ def _load_behaviors(self) -> pd.DataFrame:
 
     def _process_news_files(
         self, filepath
-    ) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str], Dict[str, int]]:
+    ) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str], Dict[str, str]]:
         """Processes the news data.
 
         Adapted from
@@ -604,7 +604,9 @@ def _process_news_files(
                                 == event_dict["category1"].split("|")[-1]
                             )
 
-        nid2index = {k: v for k, v in zip(news_title.keys(), range(1, len(news_title) + 1))}
+        nid2index = {
+            k: "N" + str(v) for k, v in zip(news_title.keys(), range(1, len(news_title) + 1))
+        }
 
         return news_title, news_category, news_subcategory, nid2index
 
@@ -664,7 +666,7 @@ def _process_users(
                         and event_dict["id"] in nid2index
                     ):
                         nindex = nid2index[event_dict["id"]]
-                        uid = event_dict["userId"]
+                        uid = "U" + str(event_dict["userId"])
 
                         if uid not in uid2index:
                             uid2index[uid] = len(uid2index)

diff --git a/newsreclib/data/components/batch.py b/newsreclib/data/components/batch.py
@@ -17,16 +17,19 @@ class RecommendationBatch(TypedDict):
             Dictionary of news from the users' candidates, mapping news features to values.
         labels:
             Ground truth specifying whether the news is relevant to the user.
-        users:
-            Users included in the batch.
+        user_ids:
+            Original user IDs of the users included in the batch.
+        user_idx:
+            Indices of users included in the batch (e.g., for creating embedding matrix).
     """
 
     batch_hist: torch.Tensor
     batch_cand: torch.Tensor
     x_hist: Dict[str, Any]
     x_cand: Dict[str, Any]
     labels: torch.Tensor
-    users: torch.Tensor
+    user_ids: torch.Tensor
+    user_idx: torch.Tensor
 
 
 class NewsBatch(TypedDict):

diff --git a/newsreclib/data/components/mind_dataframe.py b/newsreclib/data/components/mind_dataframe.py
@@ -590,7 +590,7 @@ def _load_behaviors(self) -> pd.DataFrame:
 
             # cache parsed behaviors
             log.info(f"Caching parsed behaviors of size {len(behaviors)} to {parsed_bhv_file}.")
-            behaviors = behaviors[["user", "history", "candidates", "labels"]]
+            behaviors = behaviors[["uid", "user", "history", "candidates", "labels"]]
             file_utils.to_tsv(behaviors, parsed_bhv_file)
 
         return behaviors

diff --git a/newsreclib/data/components/news_dataset.py b/newsreclib/data/components/news_dataset.py
@@ -101,6 +101,10 @@ def _tokenize_plm(self, text: List[str]):
     def _tokenize_df(self, df: pd.DataFrame) -> Dict[str, Any]:
         batch_out = {}
 
+        # news IDs (i.e., keep only numeric part of unique NID)
+        nids = np.array([int(nid.split("N")[-1]) for nid in df.index.values])
+        batch_out["news_ids"] = torch.from_numpy(nids).long()
+
         if not self.concatenate_inputs:
             # prepare text
             if not self.use_plm: