diff --git a/nlp/desci_sense/evaluation/Evaluation_benchmark.py b/nlp/desci_sense/evaluation/Evaluation_benchmark.py new file mode 100644 index 00000000..f3f08e1c --- /dev/null +++ b/nlp/desci_sense/evaluation/Evaluation_benchmark.py @@ -0,0 +1,585 @@ +#evaluation of the academic filter, it takes as a input a dataset and a handle table. +# the handle table holds information about the accounts that published the dataset posts + +"""Script to run evaluation of label prediction models. + +Usage: + filter_evaluation.py [--config=] [--dataset=] [--dataset_file=] [--handle_file=] + + +Options: +--config= Optional path to configuration file. +--dataset= Optional path to a wandb artifact. +--dataset_file= Optional dataset file name e.g. labeled_dataset.table.json indeed it should be a table.json format +--handle_file= Optional file name e.g. labeled_dataset.table.json indeed it should be a table.json format + +""" +from datetime import datetime +import wandb +from pathlib import Path +import pandas as pd +import numpy as np +import sys +import docopt +import re +from collections import Counter +import concurrent.futures +from tqdm import tqdm +from sklearn.preprocessing import LabelBinarizer +from sklearn.metrics import ( + precision_recall_fscore_support, + accuracy_score, + confusion_matrix +) +import matplotlib.pyplot as plt + + + +sys.path.append(str(Path(__file__).parents[2])) + +from desci_sense.evaluation.utils import ( + get_dataset, create_custom_confusion_matrix, posts_to_refPosts, obj_str_to_dict, autopct_format, + projection_to_list, flatten_list +) +from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser +from desci_sense.shared_functions.init import init_multi_chain_parser_config +from desci_sense.shared_functions.schema.post import RefPost +from desci_sense.shared_functions.dataloaders import ( + scrape_post, + convert_text_to_ref_post, +) + +class Evaluation: + def __init__(self, config): + self.config = config + + class CustomLabelBinarizer(LabelBinarizer): + def fit(self, y, order=None): + if order is not None: + self.classes_ = np.array(order) + else: + super().fit(y) + return self + + def check_topic(self, topics): + item_types_allowlist = [ + "bookSection", "journalArticle", "preprint", "book", "manuscript", + "thesis", "presentation", "conferencePaper", "report" + ] + return int(bool(set(topics).intersection(set(item_types_allowlist)))) + + def topic_eval(self, df, tp): + bool_topics = list(df.apply(lambda row: self.check_topic(row['Ref item types']), axis=1)) + try: + ratio = sum(bool_topics) / tp + except Exception as e: + ratio = 0 + print("Ratio exception:", e) + return ratio + + def row_to_post(self,row:pd.DataFrame): + try: + ref_post = RefPost( + author=row['username'],content=row['Text'],url='',ref_urls=row['urls'],source_network=row['server'] + ) + except Exception as e: + print('cannot convert to refpost: ',row) + print('With exception: ',e) + ref_post = None + return ref_post + + def dataframe_to_ref_posts(self, df: pd.DataFrame): + ref_posts = [] + for _, row in df.iterrows(): + ref_post = self.row_to_post(row) + ref_posts.append(ref_post) + return ref_posts + + def prepare_parser_input(self, df): + print('Converting posts to refPosts') + ref_posts = posts_to_refPosts(df['Text']) + return ref_posts + + def pred_labels(self, df,active_list = ["hashtags"] , batch_size=10): + model = MultiChainParser(self.config) + inputs = self.prepare_parser_input(df) + results = model.batch_process_ref_posts(inputs=inputs, 
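+            # active_list selects which parser chains to run (only the "hashtags" chain by default here);
+            # batch_size sets how many posts are sent to the parser per batch.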
active_list=active_list,batch_size=batch_size) + try: + df['Predicted Label'] = [x.filter_classification.value for x in results] + df['Reasoning Steps'] = ["Keywords: " + str(x.debug['topics']['reasoning']) + " Topics: " + str(x.debug['keywords']['reasoning']) for x in results] + df['Keywords'] = [x.keywords for x in results] + df['Topics'] = [x.topics for x in results] + df['Ref item types'] = [x.item_types for x in results] + df['academic_keyword'] = [x.research_keyword for x in results] + except Exception as e: + print("Parser error:", e) + return inputs, results + + def normalize_df(self, df): + if isinstance(df["True Label"].iloc[0], list): + df["True Label"] = df["True Label"].apply(lambda x: x[0]) + + def binarize(self, y_pred, y_true): + lb = self.CustomLabelBinarizer() + lb.fit(['research', 'not_research'], order=['research', 'not_research']) + y_true = lb.transform(y_true) + y_pred = lb.transform(y_pred) + return y_pred, y_true, lb.classes_ + + def calculate_scores(self, y_pred, y_true): + precision, recall, f1_score, support = precision_recall_fscore_support(y_true, y_pred, average=None) + accuracy = accuracy_score(y_pred=y_pred, y_true=y_true) + return precision[0], recall[0], f1_score[0], accuracy + + def calculate_feed_score(self, df, name): + df1 = df[df["username"] == name] + y_pred = df1["Predicted Label"] + y_true = df1["True Label"] + n = len(y_true) + try: + y_pred, y_true, labels = self.binarize(y_pred=y_pred, y_true=y_true) + precision, recall, f1_score, accuracy = self.calculate_scores(y_pred=y_pred, y_true=y_true) + try: + cm = confusion_matrix(y_pred=y_pred, y_true=y_true) + except: + print('No entries in feed of:', name) + tp = 0 + try: + tp = cm[0, 0] + fn = cm[1, 0] + except: + print("no FNs") + fn = 0 + try: + r_topics = self.topic_eval(df=df1, tp=tp) + except: + print("No citoids detection") + r_topics = 0 + return pd.Series([precision, recall, f1_score, accuracy, tp, fn, n, r_topics], + index=["precision", "recall", "f1_score", "accuracy", "TP", "FN", "posts count", 'citoid positive ratio']) + except Exception as e: + print(f"Exception was raised while calculating feed scores: {e}") + return pd.Series([0, 0, 0, 0, 0, 0, n, 0], index=["precision", "recall", "f1_score", "accuracy", "TP", "FN", "posts count", 'citoid positive ratio']) + + def weighted_average(self, column_name, df): + return (df[column_name] * df['posts count']).sum() / df['posts count'].sum() + + def constr_feed_chart(self, df, df_handles): + df_feed_eval = df_handles[["username", "server", "info"]] + for column in ["precision", "recall", "f1_score", "accuracy", "posts count"]: + df_feed_eval[column] = 0 + df_feed_eval[["precision", "recall", "f1_score", "accuracy", 'TP', 'FN', "posts count", 'citoid positive ratio']] = df_feed_eval.apply( + lambda row: self.calculate_feed_score(df=df, name=row["username"]), axis=1) + average_row = [self.weighted_average(column_name=x, df=df_feed_eval) for x in ["precision", "recall", "f1_score", "accuracy"]] + tp = df_feed_eval["TP"].sum() + r_topics = self.topic_eval(df=df, tp=tp) + average_row.extend([tp, df_feed_eval["FN"].sum(), df_feed_eval["posts count"].sum(), r_topics]) + new_row = ["Average", "", ""] + average_row + new_row = pd.DataFrame([new_row], columns=['username', 'server', 'info', 'precision', 'recall', 'f1_score', 'accuracy', 'TP', 'FN', 'posts count', 'citoid positive ratio']) + return df_feed_eval._append(new_row, ignore_index=True) + + # Zotero Item type analysis + def count_zotero_types(self,df : pd.DataFrame): + counts_df = df["Ref item 
types"].apply(flatten_list).apply(Counter).apply(pd.Series).fillna(0).astype(int) + total_row = counts_df.sum() + total_row.name = 'Total' + + return total_row + def count_research_zotero_types(self, df: pd.DataFrame, allow_list=[ + "bookSection", "journalArticle", "preprint", "book", "manuscript", + "thesis", "presentation", "conferencePaper", "report" + ]): + # Apply the check_topic method to each row and create a boolean mask + #mask = df['Ref item types'].apply(flatten_list).apply(lambda x: self.check_topic([item for item in x]) == 1) + + # Filter the DataFrame using the mask + #filtered_df = df[mask] + + project_to_allowlist = projection_to_list(allow_list) + #This is not good yet TODO, it will count each item only once + #filtered_df['Ref item types'] = [project_to_allowlist(x) for x in filtered_df["Ref item types"]] + # Count items in the filtered DataFrame + #counts_df = filtered_df["Ref item types"].apply(flatten_list).apply(project_to_allowlist).apply(Counter).apply(pd.Series).fillna(0).astype(int) + counts_df = df["Ref item types"].apply(flatten_list).apply(project_to_allowlist).apply(Counter).apply(pd.Series).fillna(0).astype(int) + total_row = counts_df.sum() + #total_row.name = 'Total' + return total_row + + + def build_item_type_pie(self,df:pd.DataFrame): + total_counts = self.count_zotero_types(df=df) + # Create a pie chart + fig1, ax = plt.subplots(figsize=(12, 12)) + ax.pie(total_counts, labels=total_counts.index, autopct=lambda pct: autopct_format(pct, total_counts), startangle=140) + ax.set_title('Distribution of Item Types') + + #plt.show() + + + + total_counts = self.count_research_zotero_types(df=df) + # Create a pie chart + fig2, ax = plt.subplots(figsize=(12, 12)) + ax.pie(total_counts, labels=total_counts.index, autopct=lambda pct: autopct_format(pct, total_counts), startangle=140) + ax.set_title('Distribution of research item types') + + #plt.show() + + return fig1, fig2 + +class TwitterEval(Evaluation): + def __init__(self, config): + super().__init__(config) + + @staticmethod + def check_quotes(urls): + quotes = [] + pattern = re.compile(r'^https://(?:twitter\.com|x\.com)/.+/[0-9]+$') + for url in urls: + if pattern.match(url): + quotes.append(url) + return quotes + + def nested_quotes_citoid(self,post:RefPost,steps = 0, ind = 0): + multi_chain_parser = MultiChainParser(self.config) + + result = multi_chain_parser.process_ref_post(post=post,active_list=["hashtags"]) + print("post urls",result.reference_urls) + print("Item types: ",result.item_types) + + item_types = result.item_types + + if self.check_topic(result.item_types): + print("Yay, citoid found topic") + ind = 1 + else: + quotes = self.check_quotes(result.reference_urls) + if quotes: + print("checking quotes") + for url in quotes: + quote = scrape_post(url) + ind, steps, t = self.nested_quotes_citoid(post = quote, steps=steps+1, ind = ind) + + else: + print("done") + return ind, steps, item_types + + def nested_quotes_citoid_parallel(self, inputs): + results = [] + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = {executor.submit(self.nested_quotes_citoid, post): post for post in inputs} + + for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)): + post = futures[future] + try: + result = future.result() + results.append(result) + except Exception as exc: + print(f'Post {post} generated an exception: {exc}') + # Append a default value to maintain length consistency + results.append((0, 0, ['citoid_error'])) + + return results + + def 
feed_tweet_type_statistics(self, df, name,update_df = 1,post_types =['quote','thread']): + print('Analyze tweets by',name) + df1 = df[df["username"] == name] + inputs = self.prepare_parser_input(df1) + post_count=len(inputs) + results = self.nested_quotes_citoid_parallel(inputs) + if update_df: + # Ensure 'citoid_research' column exists + if 'citoid_research' not in df.columns: + df['citoid_research'] = None + + # Extract the first element of each tuple in results and ensure lengths match + first_elements = [result[0] for result in results[:post_count]] + + '''# Debugging prints + print(f"Length of df1: {len(df1)}") + print(f"Length of inputs: {len(inputs)}") + print(f"Length of results: {len(results)}") + print(f"Length of first_elements: {len(first_elements)}")''' + + # Assign these elements to the corresponding rows in df1 + df.loc[df["username"] == name, 'citoid_research'] = first_elements + #TODO make more better - defining df1 twice + df1 = df[df["username"] == name] + citoid_count = 0 + quotes_count = 0 + quoted_citoid_count = 0 + item_type_list = [] + for ind, steps, item_type in results: + citoid_count = citoid_count +ind + item_type_list.append(item_type) + if steps: + quoted_citoid_count = quoted_citoid_count + ind + quotes_count = quotes_count + 1 + if post_count: + quotes_ratio = quotes_count/post_count + citoid_ratio = citoid_count/post_count + else: + quotes_ratio = -1 + citoid_ratio = -1 + #thread stat + if 'thread' in post_types: + thread_count = df1['thread'].sum() + citoid_threads = df1[(df1['thread'] ==1) & (df1['citoid_research']==1)].shape[0] + if post_count: + thread_ratio = thread_count/post_count + else: + thread_ratio = -1 + return citoid_count, quoted_citoid_count, quotes_count, post_count, quotes_ratio, citoid_ratio, thread_count, citoid_threads, thread_ratio, item_type_list + + + return citoid_count, quoted_citoid_count, quotes_count, post_count, quotes_ratio, citoid_ratio, item_type_list + + def build_post_type_chart(self,df_handles:pd.DataFrame,df:pd.DataFrame,parse_df = 1,post_types = ['quote','thread']): + df_feed_eval = df_handles[["username", "server", "info"]].copy() + + for column in ["citoid_count", "quoted_citoid_count", "quotes_count","post_count","quotes_ratio","citoid_ratio","thread_count","citoid_threads","thread_ratio"]: + df_feed_eval[column] = 0 + df_feed_eval[["citoid_count", "quoted_citoid_count", "quotes_count","post_count","quotes_ratio","citoid_ratio","thread_count","citoid_threads","thread_ratio","Ref item types"]] = df_feed_eval.apply( + lambda row: pd.Series(self.feed_tweet_type_statistics(df=df, name=row["username"])), axis=1) + + #"Total" row + post_count = df_feed_eval["post_count"].sum() + quotes_count = df_feed_eval["quotes_count"].sum() + citoid_count = df_feed_eval["citoid_count"].sum() + thread_count = df_feed_eval['thread_count'].sum() + citoid_threads = df_feed_eval['citoid_threads'].sum() + if post_count: + quotes_ratio = quotes_count/post_count + citoid_ratio = citoid_count/post_count + thread_ratio = thread_count/post_count + else: + quotes_ratio = -1 + citoid_ratio = -1 + thread_ratio = -1 + new_row = ["Summery","","",citoid_count,df_feed_eval["quoted_citoid_count"].sum(),quotes_count,post_count,quotes_ratio,citoid_ratio,thread_count,citoid_threads,thread_ratio,[]] + new_row = pd.DataFrame([new_row], columns=['username', 'server', 'info', "citoid_count", "quoted_citoid_count", "quotes_count","post_count","quotes_ratio","citoid_ratio","thread_count","citoid_threads","thread_ratio","Ref item types"]) + return 
df_feed_eval._append(new_row, ignore_index=True) + + def build_thread_chart(self,df_handles:pd.DataFrame,df:pd.DataFrame): + df_feed_eval = df_handles[["username", "server", "info"]].copy() + for column in ["citoid_count", "thread_citoid_count", "thread_count","post_count","thread_ratio","citoid_ratio"]: + df_feed_eval[column] = 0 + df_feed_eval[["citoid_count", "thread_citoid_count", "thread_count","post_count","thread_ratio","citoid_ratio","Ref item types"]] = df_feed_eval.apply( + lambda row: pd.Series(self.feed_tweet_type_statistics(df=df, name=row["username"])), axis=1) + #"Total" row + post_count = df_feed_eval["post_count"].sum() + quotes_count = df_feed_eval["quotes_count"].sum() + citoid_count = df_feed_eval["citoid_count"].sum() + if post_count: + quotes_ratio = quotes_count/post_count + citoid_ratio = citoid_count/post_count + else: + quotes_ratio = -1 + citoid_ratio = -1 + new_row = ["Summery","","",citoid_count,df_feed_eval["quoted_citoid_count"].sum(),quotes_count,post_count,quotes_ratio,citoid_ratio,[]] + new_row = pd.DataFrame([new_row], columns=['username', 'server', 'info', "citoid_count", "quoted_citoid_count", "quotes_count","post_count","quotes_ratio","citoid_ratio","Ref item types"]) + return df_feed_eval._append(new_row, ignore_index=True) + + + + + + + + +if __name__ == "__main__": + #post = scrape_post('https://x.com/rtk254/status/1741841607421263966') + llm_type="mistralai/mistral-7b-" + + config = init_multi_chain_parser_config( + llm_type=llm_type, + post_process_type="combined" + ) + Eval = TwitterEval(config=config) + #q = Eval.nested_quotes_citoid(post=post) + #print(q) + wandb.login() + + api = wandb.Api() + + #TODO move from testing + run = wandb.init(project="post_type_statistics", job_type="evaluation") + + # get artifact path + + dataset_artifact_id = ( + 'common-sense-makers/post_type_stat/non_labeled_tweets:v0' + ) + + # set artifact as input artifact + dataset_artifact = run.use_artifact(dataset_artifact_id) + + # initialize table path + # add the option to call table_path = arguments.get('--dataset') + + # download path to table + a_path = dataset_artifact.download() + print("The path is",a_path) + + # get dataset file name + + table_path = Path(f"{a_path}/non_labeled_data_table.table.json") + + + # return the pd df from the table + #remember to remove the head TODO + df = get_dataset(table_path) + + '''df1 = df[df["username"] == 'sbuckshum'] + + ref_posts= Eval.dataframe_to_ref_posts(df1) + + print(ref_posts)''' + + + table_path = Path(f"{a_path}/handles_chart.table.json") + + df_handles = get_dataset(table_path) + df_eval = Eval.build_post_type_chart(df_handles=df_handles,df=df) + fig1, fig2 = Eval.build_item_type_pie(df=df_eval) + + wandb.log({"Quote statistics per feed": wandb.Table(dataframe=df_eval)}) + #print(df['Ref item types']) + #fig1, fig2 = Eval.build_item_type_pie(df=df) + + wandb.log({"item_type_distribution": wandb.Image(fig1)}) + + wandb.log({"research_item_type_distribution": wandb.Image(fig2)}) + + #true_df = df[df["True Label"] == 'research'] + + #fig1 , fig2 = Eval.build_item_type_pie(true_df) + #wandb.log({"research_type_distribution": wandb.Image(fig)}) + config = obj_str_to_dict(config) + + run.config.update(config) + + + + artifact = wandb.Artifact("dataset_stat", type="stat") + + # Create a wandb.Table from the Pandas DataFrame + table = wandb.Table(dataframe=df) + + # Add the wandb.Table to the artifact + artifact.add(table, "post_stat") + + run.log_artifact(artifact) + + wandb.run.finish() + + + + """inputs = 
Eval.prepare_parser_input(df) + + results = Eval.nested_quotes_citoid_parallel(inputs)""" + """count = 0 + errors = [] + for p in inputs: + try: + ind, steps, item_types = Eval.nested_quotes_citoid(post=p) + count = count + ind + except Exception as e: + errors.append(e) + print("Count: ", count) + print("Errors count: ",len(errors)) + print("Errors: ",errors)""" + + + + #inputs, results = Eval.pred_labels(df=df) + + + """ + arguments = docopt.docopt(__doc__) + + config_path = arguments.get("--config") + dataset_path = arguments.get("--dataset") + dataset_file = arguments.get("--dataset_file") + handle_file = arguments.get("--handle_file") + + current_datetime = datetime.now() + time = current_datetime.strftime("%Y%m%d%H%M%S") + llm_type="mistralai/mistral-7b-" + + config = init_multi_chain_parser_config( + llm_type=llm_type, + post_process_type="combined" + ) + + wandb.login() + + api = wandb.Api() + run = wandb.init(project="testing", job_type="evaluation", name= llm_type+ str(time)) + + if dataset_path: + dataset_artifact_id = dataset_path + else: + dataset_artifact_id = 'common-sense-makers/filter_evaluation/labeled_tweets_no_threads:v1' + + dataset_artifact = run.use_artifact(dataset_artifact_id) + a_path = dataset_artifact.download() + + if dataset_file: + table_path = Path(f"{a_path}/{dataset_file}") + else: + table_path = Path(f"{a_path}/labeled_data_table_no_threads.table.json") + + df = get_dataset(table_path).head(10) + + if handle_file: + table_path = Path(f"{a_path}/{dataset_file}") + else: + table_path = Path(f"{a_path}/handles_chart.table.json") + + df_handles = get_dataset(table_path) + evaluator = Evaluation(config=config) + evaluator.pred_labels(df=df) + evaluator.normalize_df(df) + y_pred, y_true, labels = evaluator.binarize(y_pred=df["Predicted Label"], y_true=df["True Label"]) + precision, recall, f1_score, accuracy = evaluator.calculate_scores(y_pred=y_pred, y_true=y_true) + artifact = wandb.Artifact("prediction_evaluation-" + str(time), type="evaluation") + table = wandb.Table(dataframe=df) + artifact.add(table, "prediction_evaluation") + + try: + feed_chart = evaluator.constr_feed_chart(df=df, df_handles=df_handles) + wandb.log({"Scores per feed": wandb.Table(dataframe=feed_chart)}) + except Exception as e: + print("An exception was raised building the feed chart:", e) + + try: + matrix = confusion_matrix(y_true, y_pred) + labels_with_info = [f"(True) {label}" for label in labels] + predicted_labels_with_info = [f"(Pred) {label}" for label in labels] + wandb.log({ + "confusion_matrix": wandb.plots.HeatMap( + matrix_values=matrix, + y_labels=labels_with_info, + x_labels=predicted_labels_with_info, + show_text=True + ) + }) + except: + print("Not enough examples for constructing confusion matrix") + + meta_data = { + "dataset_size": len(df), + "precision": pd.Series(precision).mean(), + "recall": pd.Series(recall).mean(), + "f1_score": pd.Series(f1_score).mean(), + "accuracy": accuracy, + } + + artifact.metadata.update(meta_data) + trans_config = obj_str_to_dict(config) + run.config.update(trans_config) + run.summary.update(meta_data) + wandb.log_artifact(artifact, aliases=["latest"]) + wandb.run.finish() +""" \ No newline at end of file diff --git a/nlp/desci_sense/evaluation/item_type_stat_pie.py b/nlp/desci_sense/evaluation/item_type_stat_pie.py new file mode 100644 index 00000000..c10d6b81 --- /dev/null +++ b/nlp/desci_sense/evaluation/item_type_stat_pie.py @@ -0,0 +1,69 @@ +from datetime import datetime +import wandb +from pathlib import Path +import pandas as pd 
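+# item_type_stat_pie.py: loads a previously logged prediction table from a wandb
+# artifact, builds Zotero item-type distribution pie charts with
+# TwitterEval.build_item_type_pie, and logs the resulting figures back to wandb.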
+import numpy as np +import sys + +sys.path.append(str(Path(__file__).parents[2])) + +from desci_sense.evaluation.Evaluation_benchmark import TwitterEval +from desci_sense.evaluation.utils import obj_str_to_dict, get_dataset + +if __name__ == "__main__": + + wandb.login() + + api = wandb.Api() + + #TODO move from testing + run = wandb.init(project="testing", job_type="evaluation") + + # get artifact path + + dataset_artifact_id = ( + 'common-sense-makers/filter_evaluation/prediction_evaluation-20240521132713:v0' + ) + + # set artifact as input artifact + dataset_artifact = run.use_artifact(dataset_artifact_id) + + # initialize table path + # add the option to call table_path = arguments.get('--dataset') + + # download path to table + a_path = dataset_artifact.download() + print("The path is",a_path) + + # get dataset file name + + table_path = Path(f"{a_path}/prediction_evaluation.table.json") + + + # return the pd df from the table + #remember to remove the head TODO + df = get_dataset(table_path) + + dataset_run = dataset_artifact.logged_by() + + config = dataset_run.config + + Eval = TwitterEval(config=config) + + + + fig1, fig2 = Eval.build_item_type_pie(df=df) + + wandb.log({"item_type_distribution": wandb.Image(fig1)}) + + wandb.log({"allowlist_item_type_distribution": wandb.Image(fig2)}) + + true_df = df[df["True Label"] == 'research'] + + fig1 , fig2 = Eval.build_item_type_pie(true_df) + wandb.log({"research_type_distribution": wandb.Image(fig1)}) + config = obj_str_to_dict(config) + + run.config.update(config) + + wandb.run.finish() \ No newline at end of file diff --git a/nlp/desci_sense/evaluation/utils.py b/nlp/desci_sense/evaluation/utils.py index 74624335..fe4998ac 100644 --- a/nlp/desci_sense/evaluation/utils.py +++ b/nlp/desci_sense/evaluation/utils.py @@ -4,6 +4,7 @@ import re import pandas as pd import numpy as np +from collections import Counter import concurrent.futures from tqdm import tqdm from sklearn.preprocessing import MultiLabelBinarizer @@ -132,4 +133,21 @@ def create_custom_confusion_matrix(y_true, y_pred, labels): fp_j = ~y_true[:, j] & y_pred[:, j] matrix[i, j] = np.sum(fn_i & fp_j) - return pd.DataFrame(matrix, index=labels, columns=labels) \ No newline at end of file + return pd.DataFrame(matrix, index=labels, columns=labels) + + + +def autopct_format(pct, total_counts): + total = sum(total_counts) + count = int(round(pct * total / 100.0)) + return f'{pct:.1f}% ({count})' + +def projection_to_list(list2): + def project_to_list(list1): + #return list(set(list1) & set(list2)) + return [item for item in list1 if item in list2] + return project_to_list + +def flatten_list(lis:list): + return [item for sublist in lis for item in sublist] + diff --git a/nlp/desci_sense/schema/Nanopub_schema/semantic_post_auto_prov.md b/nlp/desci_sense/schema/Nanopub_schema/semantic_post_auto_prov.md index d5ee5659..329be2d6 100644 --- a/nlp/desci_sense/schema/Nanopub_schema/semantic_post_auto_prov.md +++ b/nlp/desci_sense/schema/Nanopub_schema/semantic_post_auto_prov.md @@ -25,6 +25,7 @@ In this doc I specify what triplets are to be present in our app auto-published sub:provenance { #Worked with Tobias on a more rebust prov, TODO cosmo: a prov:SoftwareAgent ; + rdfs:label "research_filter_v1" ; prov:actedOnBehalfOf x:xHandle . sub:activity a cosmo:nlpFacilitatedActivity ; prov:wasAssociatedWith cosmo:. 
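For reference, a minimal usage sketch of the helpers added to `nlp/desci_sense/evaluation/utils.py` above (`flatten_list`, `projection_to_list`, `autopct_format`), mirroring the counting pipeline used by `count_research_zotero_types` and `build_item_type_pie`; the sample `ref_item_types` data and the printed values are illustrative only, not taken from the evaluation dataset.

```python
from collections import Counter
import pandas as pd

# Helpers from this diff (assumes the nlp/ package root is on sys.path, as in the evaluation scripts).
from desci_sense.evaluation.utils import flatten_list, projection_to_list, autopct_format

# One row per post; each inner list holds the Zotero item types of one referenced URL.
ref_item_types = pd.Series([[["preprint"], ["webpage"]], [["journalArticle"]]])

project = projection_to_list(["journalArticle", "preprint"])  # keep only allow-listed types

counts = (
    ref_item_types.apply(flatten_list)  # [["preprint"], ["webpage"]] -> ["preprint", "webpage"]
    .apply(project)                     # -> ["preprint"]
    .apply(Counter)                     # per-post counts
    .apply(pd.Series)                   # one column per item type
    .fillna(0)
    .astype(int)
    .sum()                              # totals across posts
)
print(counts.to_dict())                 # {'preprint': 1, 'journalArticle': 1}

# autopct_format renders a pie-chart slice label as "percent (count)".
print(autopct_format(50.0, counts))     # '50.0% (1)'
```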
diff --git a/nlp/desci_sense/schema/Nanopub_schema/semantic_post_pubinfo_schema.md b/nlp/desci_sense/schema/Nanopub_schema/semantic_post_pubinfo_schema.md index b3065226..714ce150 100644 --- a/nlp/desci_sense/schema/Nanopub_schema/semantic_post_pubinfo_schema.md +++ b/nlp/desci_sense/schema/Nanopub_schema/semantic_post_pubinfo_schema.md @@ -15,7 +15,7 @@ ## Template Schema - +``` sub:pubinfo { x:xHandle foaf:name "{retractos name}" . @@ -34,4 +34,5 @@ sub:pubinfo { rdfs:label "CoSMO Semantic Post". this: cosmo:hasRootSinger "{eth address}" -} \ No newline at end of file +} +``` diff --git a/nlp/desci_sense/shared_functions/dataloaders/twitter/twitter_utils.py b/nlp/desci_sense/shared_functions/dataloaders/twitter/twitter_utils.py index 56103680..7702159d 100644 --- a/nlp/desci_sense/shared_functions/dataloaders/twitter/twitter_utils.py +++ b/nlp/desci_sense/shared_functions/dataloaders/twitter/twitter_utils.py @@ -5,12 +5,14 @@ import requests from datetime import datetime + from ...interface import AppPost, PlatformType from ...utils import ( extract_and_expand_urls, normalize_url, extract_twitter_status_id, remove_dups_ordered, + normalize_tweet_url, ) from ...schema.post import RefPost, QuoteRefPost @@ -189,22 +191,6 @@ def extract_status_id(url): return None -def normalize_tweet_url(url): - """ - Normalize Twitter post URLs to use the x.com domain. - - Parameters: - url (str): The original Twitter URL. - - Returns: - str: The normalized URL with x.com domain. - """ - if "twitter.com" in url: - return url.replace("twitter.com", "x.com") - else: - return url - - # TODO combine with method below def extract_external_ref_urls(tweet: dict, add_qrt_url: bool = True): """ diff --git a/nlp/desci_sense/shared_functions/interface.py b/nlp/desci_sense/shared_functions/interface.py index 1fd1d80b..db194781 100644 --- a/nlp/desci_sense/shared_functions/interface.py +++ b/nlp/desci_sense/shared_functions/interface.py @@ -15,6 +15,7 @@ from rdflib import URIRef, Literal, Graph from .prompting.jinja.topics_template import ALLOWED_TOPICS from .filters import SciFilterClassfication +from .utils import normalize_tweet_urls_in_text, normalize_tweet_url # for calculating thread length limits MAX_CHARS_PER_POST = 280 @@ -91,6 +92,39 @@ class TopicConceptDefinition(OntologyConceptDefinition): ) +class ZoteroItemTypeDefinition(OntologyConceptDefinition): + """ + Definition of the ZoteroItemType predicate which is used to represent a reference's + item type according to the Zotero ontology. + https://www.zotero.org/support/kb/item_types_and_fields + """ + + name: str = Field(default="zoteroItemType", description="Concept name.") + uri: str = Field( + default="https://sense-nets.xyz/hasZoteroItemType", + description="Linked data URI for this concept.", + ) + versions: List[str] = Field( + ["v0"], description="Which ontology versions is this item included in." + ) + + +class QuotedPostDefinition(OntologyConceptDefinition): + """ + Definition of quotedPost relation for a post that quotes another post + https://github.com/Common-SenseMakers/sensemakers/blob/nlp-dev/nlp/desci_sense/schema/Nanopub_schema/semantic_post_quote_schema.md + """ + + name: str = Field(default="zoteroItemType", description="Concept name.") + uri: str = Field( + default="https://sense-nets.xyz/quotesPost", + description="Linked data URI for this concept.", + ) + versions: List[str] = Field( + ["v0"], description="Which ontology versions is this item included in." 
+ ) + + class isAConceptDefintion(OntologyConceptDefinition): name: str = Field(default="isA", description="Concept name.") uri: str = Field( @@ -141,24 +175,6 @@ class OntologyInterface(BaseModel): ontology_config: NotionOntologyConfig = Field(default_factory=NotionOntologyConfig) -# TODO remove - changed to OntologyPredicateDefinition -class OntologyItem(TypedDict): - URI: str - display_name: str - Name: Optional[str] - label: Optional[str] - prompt: str - notes: Optional[str] - valid_subject_types: Optional[str] - valid_object_types: Optional[str] - versions: Optional[str] - - -# TODO remove - changed to KeywordPredicateDefinition -class KeywordsSupport(TypedDict): - keyWordsOntology: OntologyItem - - class RefMetadata(BaseModel): """ Schema representing extracted metadata of reference URLs @@ -248,7 +264,7 @@ def graph_serializer(graph: Graph): @field_validator( "semantics", mode="before" - ) # before needed since arbitrary types allowec + ) # before needed since arbitrary types allowed @classmethod def ensure_graph(cls, value: Any): if isinstance(value, Graph): @@ -271,17 +287,6 @@ def lower_case_platform_id(cls, v): return v.lower() if isinstance(v, str) else v -# class AppPostContent(BaseModel): - - -# class AppPost(BaseModel): -# content: str = Field(description="Post content") -# url: Optional[str] = Field(description="Post url", default=None) -# quoted_thread_url: Optional[str] = Field( -# description="Url of quoted thread", default=None -# ) - - class AppPost(BaseModel): content: str = Field(description="Post content") url: Optional[str] = Field(description="Post url", default="") @@ -290,6 +295,14 @@ class AppPost(BaseModel): default=None, ) + @validator("content", pre=True, always=True) + def normalize_twitter_urls(cls, v): + return normalize_tweet_urls_in_text(v) if isinstance(v, str) else v + + @validator("url", pre=True, always=True) + def normalize_twitter_url(cls, v): + return normalize_tweet_url(v) if isinstance(v, str) else v + class AppThread(BaseModel): author: Author @@ -299,6 +312,10 @@ class AppThread(BaseModel): default=None, ) + @validator("url", pre=True, always=True) + def normalize_twitter_url(cls, v): + return normalize_tweet_url(v) if isinstance(v, str) else v + @property def source_network(self) -> PlatformType: return self.author.platformId @@ -314,25 +331,3 @@ class ParsePostRequest(BaseModel): description="Additional params for parser (not used currently)", default_factory=dict, ) - - -# TODO remove - changed to RefMetadata -class RefMeta(TypedDict): - title: str - description: str - image: str - - -class ReflabelsSupport(TypedDict): - labelsOntology: List[OntologyItem] - refsMeta: Dict[str, RefMeta] - - -class ParsedSupport(TypedDict): - keywords: KeywordsSupport - refLabels: ReflabelsSupport - - -class ParserResultDto(TypedDict): - semantics: str - support: ParsedSupport diff --git a/nlp/desci_sense/shared_functions/postprocessing/__init__.py b/nlp/desci_sense/shared_functions/postprocessing/__init__.py index 40aa1781..ec663e23 100644 --- a/nlp/desci_sense/shared_functions/postprocessing/__init__.py +++ b/nlp/desci_sense/shared_functions/postprocessing/__init__.py @@ -12,6 +12,8 @@ ParserSupport, ParserResult, OntologyInterface, + ZoteroItemTypeDefinition, + QuotedPostDefinition, ) from ..configs import ParserChainType, PostProcessType @@ -134,6 +136,10 @@ class CombinedParserOutput(BaseModel): default_factory=list, description="List of extracted reference metadata returned by metadata extractor", ) + quoted_post_url: Optional[str] = Field( + 
default=None, + description="URL of quoted post, if processed post quotes another post.", + ) debug: Optional[Dict] = Field( default_factory=dict, description="Diagnostic information for debugging purposes.", @@ -374,6 +380,39 @@ def convert_keywords_to_rdf_triplets(keywords: List[str]) -> List[RDFTriplet]: return triplets +def create_quoted_post_triplet(quoted_post_url: str): + triplet = RDFTriplet( + predicate=URIRef(QuotedPostDefinition().uri), + object=URIRef(quoted_post_url), + ) + return triplet + + +def convert_item_types_to_rdf_triplets( + item_types: List[str], reference_urls: List[str] +) -> List[RDFTriplet]: + """ + Converts item type and reference url information into RDF triplets + using the ZoteroItemTypeDefinition predicate. + For example, + convert_item_types_to_rdf_triplets(['preprint'], ['https://arxiv.org/abs/2402.04607']) --> + `[RDFTriplet(subject=rdflib.term.URIRef('https://arxiv.org/abs/2402.04607'), predicate=rdflib.term.URIRef('https://sense-nets.xyz/hasZoteroItemType'), object=rdflib.term.Literal('preprint'))]` + + + """ + assert len(reference_urls) == len(item_types) + triplets = [ + RDFTriplet( + subject=URIRef(ref_url), + predicate=URIRef(ZoteroItemTypeDefinition().uri), + object=Literal(item_type), + ) + for ref_url, item_type in zip(reference_urls, item_types) + ] + + return triplets + + def convert_triplets_to_graph(triplets: List[RDFTriplet]) -> Graph: """Convert list of rdf triplets to rdf graph""" g = Graph() @@ -421,6 +460,9 @@ def combine_from_raw_results( combined = CombinedParserOutput(**combined_parser_output) + # add quoted post url + combined.quoted_post_url = post.quoted_url + if unprocessed_urls: # add unprocessed urls to result combined.reference_urls += unprocessed_urls @@ -487,6 +529,19 @@ def post_process_firebase( for t in kw_triplets: graph.add(t.to_tuple()) + # add item type triplets + item_type_triplets = convert_item_types_to_rdf_triplets( + combined_parser_output.item_types, + combined_parser_output.reference_urls, + ) + for t in item_type_triplets: + graph.add(t.to_tuple()) + + # add quotesPost triplet if present + if combined_parser_output.quoted_post_url: + triplet = create_quoted_post_triplet(combined_parser_output.quoted_post_url) + graph.add(triplet.to_tuple()) + # gather support info parser_support: ParserSupport = get_support_data( ontology_base.ontology_interface, diff --git a/nlp/desci_sense/shared_functions/preprocessing/__init__.py b/nlp/desci_sense/shared_functions/preprocessing/__init__.py index 96796ccb..0972682f 100644 --- a/nlp/desci_sense/shared_functions/preprocessing/__init__.py +++ b/nlp/desci_sense/shared_functions/preprocessing/__init__.py @@ -287,18 +287,25 @@ def preproc_parser_input(parser_input: ParserInput) -> PreprocParserInput: """ orig_thread = parser_input.thread_post new_thread = trim_thread_by_length(orig_thread, parser_input.max_chars) + included_urls = new_thread.md_ref_urls() # get reference urls from trimmed posts - # TODO handle urls possibly trimmed from trimmed post (currently will be ignored!) 
excluded_urls = [] num_posts_after_trim = len(new_thread.posts) - excluded_posts = orig_thread.posts[num_posts_after_trim:] + + # (num_posts_after_trim - 1) to handle urls possibly trimmed from trimmed post + excluded_posts = orig_thread.posts[(num_posts_after_trim - 1) :] for p in excluded_posts: - excluded_urls += p.md_ref_urls() + potential_excluded_urls = p.md_ref_urls() + excluded_urls += [ + url for url in potential_excluded_urls if url not in included_urls + ] # remove dups excluded_urls = remove_dups_ordered(excluded_urls) + assert set(included_urls + excluded_urls) == set(orig_thread.md_ref_urls()) + preprocessed_input = PreprocParserInput( post_to_parse=new_thread, unparsed_urls=excluded_urls, diff --git a/nlp/desci_sense/shared_functions/preprocessing/threads.py b/nlp/desci_sense/shared_functions/preprocessing/threads.py index 2f1a7806..5e05c708 100644 --- a/nlp/desci_sense/shared_functions/preprocessing/threads.py +++ b/nlp/desci_sense/shared_functions/preprocessing/threads.py @@ -26,6 +26,7 @@ def create_thread_from_posts(posts: List[QuoteRefPost]): author=author, content=content, url=posts[0].url, + quoted_url=posts[0].quoted_url, source_network=posts[0].source_network, ref_urls=all_ref_urls, posts=posts_copy, diff --git a/nlp/desci_sense/shared_functions/prompting/post_renderers/quote_ref_post_renderer.py b/nlp/desci_sense/shared_functions/prompting/post_renderers/quote_ref_post_renderer.py index 4a9761c0..92861bae 100644 --- a/nlp/desci_sense/shared_functions/prompting/post_renderers/quote_ref_post_renderer.py +++ b/nlp/desci_sense/shared_functions/prompting/post_renderers/quote_ref_post_renderer.py @@ -42,7 +42,7 @@ def render_quote_post_content( processed_content = post.content - if post.quoted_url: + if post.quoted_url and post.quoted_url in ordered_refs: # add quoted post url to end of quote post content if not present there if post.quoted_url not in processed_content: processed_content += f" {post.quoted_url}" diff --git a/nlp/desci_sense/shared_functions/utils.py b/nlp/desci_sense/shared_functions/utils.py index 3680bdd9..fa4399e7 100644 --- a/nlp/desci_sense/shared_functions/utils.py +++ b/nlp/desci_sense/shared_functions/utils.py @@ -448,3 +448,42 @@ def trim_parts_to_length(part_lengths: List[int], max_length: int) -> List[int]: break return trimmed_part_lengths + + +def normalize_tweet_url(url): + """ + Normalize Twitter post URLs to use the x.com domain. + + Parameters: + url (str): The original Twitter URL. + + Returns: + str: The normalized URL with x.com domain. + """ + if "twitter.com" in url: + return url.replace("twitter.com", "x.com") + else: + return url + + +def normalize_tweet_urls_in_text(text: str) -> str: + """ + Normalize all occurrences of Twitter URLs to uniform format (using x.com). + + Args: + text (str): Input string. + + Returns: + str: String after normalization. 
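+
+    Example (illustrative):
+        "see https://twitter.com/someuser/status/123" -> "see https://x.com/someuser/status/123"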
+ """ + extracted_urls, orig_urls = extract_and_expand_urls( + text, + return_orig_urls=True, + ) + normalized_urls = [normalize_tweet_url(url) for url in extracted_urls] + + # Replace all occurrences of orig_urls in text with normalized_urls + for orig_url, normalized_url in zip(orig_urls, normalized_urls): + text = text.replace(orig_url, normalized_url) + + return text diff --git a/nlp/notebooks/XdatasetLogEdit.ipynb b/nlp/notebooks/XdatasetLogEdit.ipynb index 3747ba2f..d4d3bf99 100644 --- a/nlp/notebooks/XdatasetLogEdit.ipynb +++ b/nlp/notebooks/XdatasetLogEdit.ipynb @@ -2,24 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 19, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mshahar-r-oriel\u001b[0m (\u001b[33mcommon-sense-makers\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" - ] - }, { "data": { "text/plain": [ "True" ] }, - "execution_count": 1, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -32,11 +24,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import sys\n", + "import pandas as pd\n", "sys.path.append(\"../\")\n", "\n", "from desci_sense.shared_functions.init import init_multi_chain_parser_config\n", @@ -46,366 +39,821 @@ "from desci_sense.shared_functions.dataloaders import (\n", " scrape_post,\n", " convert_text_to_ref_post,\n", - ")" + ")\n", + "from desci_sense.evaluation.Evaluation_benchmark import TwitterEval\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " id created_at account_id username \\\n", - "0 1780618156832109056 2024-04-17 15:23:45+00:00 14349894 mbauwens \n", - "1 1780617597089669120 2024-04-17 15:21:32+00:00 14349894 mbauwens \n", - "2 1780617135049322496 2024-04-17 15:19:41+00:00 14349894 mbauwens \n", - "3 1780616454812905984 2024-04-17 15:16:59+00:00 14349894 mbauwens \n", - "4 1780608301534789632 2024-04-17 14:44:35+00:00 14349894 mbauwens \n", - ".. ... ... ... ... \n", - "705 1778714568895729920 2024-04-12 09:19:34+00:00 17900290 1Br0wn \n", - "706 1778710540912562432 2024-04-12 09:03:34+00:00 17900290 1Br0wn \n", - "707 1778698871310348544 2024-04-12 08:17:12+00:00 17900290 1Br0wn \n", - "708 1778690751909470464 2024-04-12 07:44:56+00:00 17900290 1Br0wn \n", - "709 1778479139709710848 2024-04-11 17:44:04+00:00 17900290 1Br0wn \n", - "\n", - " urls \\\n", - "0 [https://en.wikipedia.org/wiki/Epic_of_evoluti... \n", - "1 [https://wiki.p2pfoundation.net/Category:Therm... \n", - "2 [https://lifehacker.com/tech/ai-is-running-out... \n", - "3 [https://www.journaloffreespeechlaw.org/] \n", - "4 [https://wiki.p2pfoundation.net/Andrew_Targows... \n", - ".. ... \n", - "705 [https://www.axios.com/2024/04/10/ai-artificia... \n", - "706 [https://www.euractiv.com/section/digital/news... \n", - "707 [https://www.theguardian.com/society/2024/apr/... \n", - "708 [https://twitter.com/HalSinger/status/17785538... \n", - "709 [https://twitter.com/gateklons/status/17784779... \n", - "\n", - " text server \\\n", - "0 A concept you should know about:\\n\\n* The epic... 
twitter.com \n", - "1 Quotation as selected by The Alternative:\\n\\n... twitter.com \n", - "2 AI is running out of internet:\\n\\n\"AI is runni... twitter.com \n", - "3 * Journal of Free Speech Law,\\n\\nhttps://t.co/... twitter.com \n", - "4 Articles from our 'Civilizational Analysis' va... twitter.com \n", - ".. ... ... \n", - "705 \"I think they know exactly what they do,\" @Ves... twitter.com \n", - "706 \"The Council of the EU is preparing a call to ... twitter.com \n", - "707 Whatever happened to de minimis non curat lex?... twitter.com \n", - "708 Deaton’s book 'concludes that “Joe Biden does ... twitter.com \n", - "709 Exactly as intended! 🥳 “EU firms decreased dat... twitter.com \n", - "\n", - " tootURL \n", - "0 https://twitter.com/mbauwens/status/1780618156... \n", - "1 https://twitter.com/mbauwens/status/1780617597... \n", - "2 https://twitter.com/mbauwens/status/1780617135... \n", - "3 https://twitter.com/mbauwens/status/1780616454... \n", - "4 https://twitter.com/mbauwens/status/1780608301... \n", - ".. ... \n", - "705 https://twitter.com/1Br0wn/status/177871456889... \n", - "706 https://twitter.com/1Br0wn/status/177871054091... \n", - "707 https://twitter.com/1Br0wn/status/177869887131... \n", - "708 https://twitter.com/1Br0wn/status/177869075190... \n", - "709 https://twitter.com/1Br0wn/status/177847913970... \n", - "\n", - "[710 rows x 8 columns]\n" + "0 1800616621209461094\n", + "1 1800616621209461094\n", + "2 1785773444275040720\n", + "3 1757738391414755395\n", + "4 1757129457742065841\n", + " ... \n", + "7063 1790849327692206266\n", + "7064 1790841655597101546\n", + "7065 1790823532877820098\n", + "7066 1790750738466877446\n", + "7067 1790489757144735855\n", + "Name: conversation_id, Length: 7068, dtype: object\n" ] } ], "source": [ - "df = pd.read_json(\"/Users/shaharorielkagan/sensemakers/nlp/notebooks/data/mappedTweets-2.json\")\n", - "print(df)" + "df = pd.read_json(\"/Users/shaharorielkagan/Downloads/allTweetsFlattened-2.json\",dtype={'conversation_id': str})\n", + "print(df['conversation_id'])" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " id created_at account_id username \\\n", - "0 1780618156832109056 2024-04-17 15:23:45+00:00 14349894 mbauwens \n", - "1 1780617597089669120 2024-04-17 15:21:32+00:00 14349894 mbauwens \n", - "2 1780617135049322496 2024-04-17 15:19:41+00:00 14349894 mbauwens \n", - "3 1780616454812905984 2024-04-17 15:16:59+00:00 14349894 mbauwens \n", - "4 1780608301534789632 2024-04-17 14:44:35+00:00 14349894 mbauwens \n", - ".. ... ... ... ... \n", - "705 1778714568895729920 2024-04-12 09:19:34+00:00 17900290 1Br0wn \n", - "706 1778710540912562432 2024-04-12 09:03:34+00:00 17900290 1Br0wn \n", - "707 1778698871310348544 2024-04-12 08:17:12+00:00 17900290 1Br0wn \n", - "708 1778690751909470464 2024-04-12 07:44:56+00:00 17900290 1Br0wn \n", - "709 1778479139709710848 2024-04-11 17:44:04+00:00 17900290 1Br0wn \n", - "\n", - " urls \\\n", - "0 [https://en.wikipedia.org/wiki/Epic_of_evoluti... \n", - "1 [https://wiki.p2pfoundation.net/Category:Therm... \n", - "2 [https://lifehacker.com/tech/ai-is-running-out... \n", - "3 [https://www.journaloffreespeechlaw.org/] \n", - "4 [https://wiki.p2pfoundation.net/Andrew_Targows... \n", - ".. ... \n", - "705 [https://www.axios.com/2024/04/10/ai-artificia... \n", - "706 [https://www.euractiv.com/section/digital/news... \n", - "707 [https://www.theguardian.com/society/2024/apr/... 
\n", - "708 [https://twitter.com/HalSinger/status/17785538... \n", - "709 [https://twitter.com/gateklons/status/17784779... \n", - "\n", - " Text server \\\n", - "0 A concept you should know about:\\n\\n* The epic... twitter.com \n", - "1 Quotation as selected by The Alternative:\\n\\n... twitter.com \n", - "2 AI is running out of internet:\\n\\n\"AI is runni... twitter.com \n", - "3 * Journal of Free Speech Law,\\n\\nhttps://t.co/... twitter.com \n", - "4 Articles from our 'Civilizational Analysis' va... twitter.com \n", - ".. ... ... \n", - "705 \"I think they know exactly what they do,\" @Ves... twitter.com \n", - "706 \"The Council of the EU is preparing a call to ... twitter.com \n", - "707 Whatever happened to de minimis non curat lex?... twitter.com \n", - "708 Deaton’s book 'concludes that “Joe Biden does ... twitter.com \n", - "709 Exactly as intended! 🥳 “EU firms decreased dat... twitter.com \n", - "\n", - " postURL \n", - "0 https://twitter.com/mbauwens/status/1780618156... \n", - "1 https://twitter.com/mbauwens/status/1780617597... \n", - "2 https://twitter.com/mbauwens/status/1780617135... \n", - "3 https://twitter.com/mbauwens/status/1780616454... \n", - "4 https://twitter.com/mbauwens/status/1780608301... \n", - ".. ... \n", - "705 https://twitter.com/1Br0wn/status/177871456889... \n", - "706 https://twitter.com/1Br0wn/status/177871054091... \n", - "707 https://twitter.com/1Br0wn/status/177869887131... \n", - "708 https://twitter.com/1Br0wn/status/177869075190... \n", - "709 https://twitter.com/1Br0wn/status/177847913970... \n", + "0 []\n", + "1 []\n", + "2 [https://twitter.com/BelTel/status/17857549561...\n", + "3 []\n", + "4 [https://twitter.com/jonny/status/175712945774...\n", + " ... \n", + "7063 [https://www.nature.com/articles/s41581-024-00...\n", + "7064 [https://twitter.com/mpshanahan/status/1790743...\n", + "7065 [https://twitter.com/mindthebrainICN/status/17...\n", + "7066 [https://twitter.com/UCLMentalHealth/status/17...\n", + "7067 [https://www.eventbrite.co.uk/e/insider-outsid...\n", + "Name: urls, Length: 7068, dtype: object\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/j3/1_zy_b4s2517g8j_rjjdhd4h0000gn/T/ipykernel_41416/2646320256.py:5: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", - "[710 rows x 8 columns]\n" + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df['urls'][n] = df['urls'][n] + [df['quoted_tweet'][n]['url']]\n" ] } ], + "source": [ + "for n in range(len(df['quoted_tweet'])):\n", + " \n", + " if type(df['quoted_tweet'][n]) != float:\n", + " \n", + " df['urls'][n] = df['urls'][n] + [df['quoted_tweet'][n]['url']] \n", + "print(df['urls'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], "source": [ "df.rename(columns={'text': 'Text'}, inplace=True)\n", - "print(df)\n" + "df['server'] = 'twitter.com'\n", + "df['ref_count'] = df['urls'].apply(len) \n", + "\n" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#We need to group by conversetion id and sort by creation time\n", + "df['created_at'] = pd.to_datetime(df['created_at'])\n", + "\n", + "# Step 3: Group by 'conversation_id' and sort by 'created_at'\n", + "df = df.sort_values(by=['conversation_id', 'created_at'])" + ] + }, + { + "cell_type": "code", + "execution_count": 
7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " id created_at account_id username \\\n", - "0 1780618156832109056 2024-04-17 15:23:45+00:00 14349894 mbauwens \n", - "1 1780617597089669120 2024-04-17 15:21:32+00:00 14349894 mbauwens \n", - "2 1780617135049322496 2024-04-17 15:19:41+00:00 14349894 mbauwens \n", - "3 1780616454812905984 2024-04-17 15:16:59+00:00 14349894 mbauwens \n", - "4 1780608301534789632 2024-04-17 14:44:35+00:00 14349894 mbauwens \n", - ".. ... ... ... ... \n", - "705 1778714568895729920 2024-04-12 09:19:34+00:00 17900290 1Br0wn \n", - "706 1778710540912562432 2024-04-12 09:03:34+00:00 17900290 1Br0wn \n", - "707 1778698871310348544 2024-04-12 08:17:12+00:00 17900290 1Br0wn \n", - "708 1778690751909470464 2024-04-12 07:44:56+00:00 17900290 1Br0wn \n", - "709 1778479139709710848 2024-04-11 17:44:04+00:00 17900290 1Br0wn \n", + " id created_at author_id \\\n", + "0 1651581145228668928 2023-04-27 13:36:44+00:00 64017773 \n", + "1 1095012246504292352 2019-02-11 17:30:38+00:00 714022989411586048 \n", + "2 1097758010863022080 2019-02-19 07:21:19+00:00 4757597112 \n", + "3 1778091831026446592 2024-04-10 16:05:02+00:00 4901659043 \n", + "4 1778263104142389248 2024-04-11 03:25:37+00:00 4901659043 \n", + "... ... ... ... \n", + "6390 1801318358261841920 2024-06-13 18:18:58+00:00 828361667918569472 \n", + "6391 1801320990917497088 2024-06-13 18:29:25+00:00 49270737 \n", + "6392 1801328344446681344 2024-06-13 18:58:39+00:00 828361667918569472 \n", + "6393 684753729799622656 2016-01-06 15:09:45+00:00 3238593536 \n", + "6394 881142725764145152 2017-07-01 13:29:31+00:00 4757597112 \n", "\n", - " urls \\\n", - "0 [https://en.wikipedia.org/wiki/Epic_of_evoluti... \n", - "1 [https://wiki.p2pfoundation.net/Category:Therm... \n", - "2 [https://lifehacker.com/tech/ai-is-running-out... \n", - "3 [https://www.journaloffreespeechlaw.org/] \n", - "4 [https://wiki.p2pfoundation.net/Andrew_Targows... \n", - ".. ... \n", - "705 [https://www.axios.com/2024/04/10/ai-artificia... \n", - "706 [https://www.euractiv.com/section/digital/news... \n", - "707 [https://www.theguardian.com/society/2024/apr/... \n", - "708 [https://twitter.com/HalSinger/status/17785538... \n", - "709 [https://twitter.com/gateklons/status/17784779... \n", + " conversation_id username \\\n", + "0 1039183605740785664 petersuber \n", + "1 1095012246504292352 DanKotliar \n", + "2 1097758010863022080 EconFeld \n", + "3 1152989885617385472 aj_boston \n", + "4 1199172884146601989 aj_boston \n", + "... ... ... \n", + "6390 1801318358261842036 AnnaCiaunica \n", + "6391 1801320990917496977 KamounLab \n", + "6392 1801328344446681269 AnnaCiaunica \n", + "6393 684753729799622656 robin \n", + "6394 881142725764145152 EconFeld \n", "\n", - " text server \\\n", - "0 A concept you should know about:\\n\\n* The epic... twitter.com \n", - "1 Quotation as selected by The Alternative:\\n\\n... twitter.com \n", - "2 AI is running out of internet:\\n\\n\"AI is runni... twitter.com \n", - "3 * Journal of Free Speech Law,\\n\\nhttps://t.co/... twitter.com \n", - "4 Articles from our 'Civilizational Analysis' va... twitter.com \n", - ".. ... ... \n", - "705 \"I think they know exactly what they do,\" @Ves... twitter.com \n", - "706 \"The Council of the EU is preparing a call to ... twitter.com \n", - "707 Whatever happened to de minimis non curat lex?... twitter.com \n", - "708 Deaton’s book 'concludes that “Joe Biden does ... twitter.com \n", - "709 Exactly as intended! 🥳 “EU firms decreased dat... 
twitter.com \n", + " name \\\n", + "0 Peter Suber (@petersuber@fediscience.org) \n", + "1 Dan Kotliar \n", + "2 Jan Feld \n", + "3 arthur j. boston \n", + "4 arthur j. boston \n", + "... ... \n", + "6390 Anna Ciaunica @annaciaunica.bsky.social \n", + "6391 Sophien Kamoun \n", + "6392 Anna Ciaunica @annaciaunica.bsky.social \n", + "6393 Robin heart \n", + "6394 Jan Feld \n", + "\n", + " urls \\\n", + "0 [https://researchintegrityjournal.biomedcentra... \n", + "1 [https://twitter.com/DmK121/status/10950122465... \n", + "2 [https://theconversation.com/research-shows-st... \n", + "3 [https://twitter.com/aj_boston/status/17780918... \n", + "4 [https://twitter.com/aj_boston/status/17782631... \n", + "... ... \n", + "6390 [https://twitter.com/AnnaCiaunica/status/18013... \n", + "6391 [https://twitter.com/alexandrepedro/status/180... \n", + "6392 [https://twitter.com/marianne_brkr/status/1801... \n", + "6393 [http://www.smule.com/p/381007451_243129440] \n", + "6394 [https://twitter.com/joshuasgoodman/status/880... \n", + "\n", + " Text \\\n", + "0 Update. \"The cost of peer review was estimated... \n", + "1 Super excited about my talk at @CIMethods this... \n", + "2 Do you wonder why in many universities profess... \n", + "3 FINALLY @criterionchannl https://twitter.com/a... \n", + "4 @hbomax Watched: \\nCurb Your Enthusiasm\\n📺 Sea... \n", + "... ... \n", + "6390 Off to Palermo 😎 https://twitter.com/AnnaCiaun... \n", + "6391 Always keep a spare key with someone else. Or ... \n", + "6392 Super cool paper here 👇🏼 https://twitter.com/m... \n", + "6393 Awesome cover of \"Let me be there\" via #Smule:... \n", + "6394 I agree, excellent title and paper Mr. @uZoeli... \n", "\n", - " tootURL \n", - "0 https://twitter.com/mbauwens/status/1780618156... \n", - "1 https://twitter.com/mbauwens/status/1780617597... \n", - "2 https://twitter.com/mbauwens/status/1780617135... \n", - "3 https://twitter.com/mbauwens/status/1780616454... \n", - "4 https://twitter.com/mbauwens/status/1780608301... \n", - ".. ... \n", - "705 https://twitter.com/1Br0wn/status/177871456889... \n", - "706 https://twitter.com/1Br0wn/status/177871054091... \n", - "707 https://twitter.com/1Br0wn/status/177869887131... \n", - "708 https://twitter.com/1Br0wn/status/177869075190... \n", - "709 https://twitter.com/1Br0wn/status/177847913970... \n", + " url \\\n", + "0 https://x.com/petersuber/status/16515811452286... \n", + "1 https://x.com/DanKotliar/status/10950122465042... \n", + "2 https://x.com/EconFeld/status/1097758010863022080 \n", + "3 https://x.com/aj_boston/status/177809183102644... \n", + "4 https://x.com/aj_boston/status/177826310414238... \n", + "... ... \n", + "6390 https://x.com/AnnaCiaunica/status/180131835826... \n", + "6391 https://x.com/KamounLab/status/180132099091749... \n", + "6392 https://x.com/AnnaCiaunica/status/180132834444... \n", + "6393 https://x.com/robin/status/684753729799622656 \n", + "6394 https://x.com/EconFeld/status/881142725764145152 \n", "\n", - "[710 rows x 8 columns]\n" + " quoted_tweet server \\\n", + "0 NaN twitter.com \n", + "1 NaN twitter.com \n", + "2 NaN twitter.com \n", + "3 NaN twitter.com \n", + "4 NaN twitter.com \n", + "... ... ... \n", + "6390 NaN twitter.com \n", + "6391 {'id': '1800911525278081311', 'created_at': '2... twitter.com \n", + "6392 {'id': '1801262094307766300', 'created_at': '2... twitter.com \n", + "6393 NaN twitter.com \n", + "6394 NaN twitter.com \n", + "\n", + " ref_count \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "... ... 
\n", + "6390 1 \n", + "6391 2 \n", + "6392 2 \n", + "6393 1 \n", + "6394 1 \n", + "\n", + "[6395 rows x 12 columns]\n" + ] + } + ], + "source": [ + "#Now filter conversations to 10min to approximate threads\n", + "def filter_by_time_difference(group):\n", + " earliest_time = group['created_at'].min()\n", + " return group[group['created_at'] - earliest_time <= pd.Timedelta(minutes=10)]\n", + "\n", + "filtered_df = df.groupby('conversation_id').apply(filter_by_time_difference).reset_index(drop=True)\n", + "\n", + "# Verify the result\n", + "print(filtered_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count\n", + "1 3940\n", + "2 460\n", + "3 125\n", + "4 43\n", + "5 36\n", + "6 26\n", + "7 15\n", + "8 14\n", + "9 10\n", + "10 6\n", + "11 8\n", + "12 2\n", + "13 3\n", + "14 3\n", + "16 1\n", + "17 3\n", + "25 1\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# Count the number of rows that share the same value in 'conversation_id'\n", + "conversation_counts = filtered_df['conversation_id'].value_counts()\n", + "\n", + "# Display the counts\n", + "\n", + "distribution_of_counts = conversation_counts.value_counts().sort_index()\n", + "\n", + "print(distribution_of_counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "wandb version 0.17.2 is available! To upgrade, please run:\n", + " $ pip install wandb --upgrade" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.16.6" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /Users/shaharorielkagan/sensemakers/nlp/notebooks/wandb/run-20240624_105928-3fvr9vjh" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run soft-snowball-2 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/common-sense-makers/post_type_statistics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/common-sense-makers/post_type_statistics/runs/3fvr9vjh" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e2b07cfe043a4360be38b716cad95fe8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\\r'), FloatProgress(value=1.0, max=1.0)))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run soft-snowball-2 at: https://wandb.ai/common-sense-makers/post_type_statistics/runs/3fvr9vjh
View project at: https://wandb.ai/common-sense-makers/post_type_statistics
Synced 4 W&B file(s), 2 media file(s), 2 artifact file(s) and 0 other file(s)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Find logs at: ./wandb/run-20240624_105928-3fvr9vjh/logs" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import wandb\n", + "# Initialize wandb\n", + "wandb.init(project=\"post_type_statistics\")\n", + "\n", + "\n", + "\n", + "# Prepare data for logging\n", + "data = [[count, num_conversations] for count, num_conversations in distribution_of_counts.items()]\n", + "\n", + "# Log data to wandb\n", + "wandb.log({\n", + " \"conversation_length_distribution\": wandb.Table(\n", + " columns=[\"Number of Rows per Conversation\", \"Number of Conversations\"],\n", + " data=data\n", + " )\n", + "})\n", + "\n", + "# Plot the distribution with wandb\n", + "wandb.log({\n", + " \"conversation_length_distribution_plot\": wandb.plot.bar(\n", + " wandb.Table(data=data, columns=[\"Number of Rows per Conversation\", \"Number of Conversations\"]),\n", + " \"Number of Rows per Conversation\",\n", + " \"Number of Conversations\",\n", + " title=\"Distribution of Conversation Lengths\"\n", + " )\n", + "})\n", + "\n", + "wandb.run.finish()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 0\n", + "1 0\n", + "2 0\n", + "3 0\n", + "4 0\n", + " ..\n", + "4691 0\n", + "4692 0\n", + "4693 0\n", + "4694 0\n", + "4695 0\n", + "Name: thread, Length: 4696, dtype: int64\n" + ] + } + ], + "source": [ + "# Function to concatenate text, merge urls, and set thread\n", + "def collapse_thread(group):\n", + " earliest_item = group.iloc[0].copy()\n", + " if len(group) > 1:\n", + " earliest_item['Text'] = ' /'.join(group['Text'])\n", + " earliest_item['urls'] = list(set(url for sublist in group['urls'] for url in sublist))\n", + " earliest_item['thread'] = 1\n", + " else:\n", + " earliest_item['thread'] = 0\n", + " return earliest_item\n", + "\n", + "# Apply the function to each group\n", + "collapsed_df = filtered_df.groupby('conversation_id').apply(collapse_thread).reset_index(drop=True)\n", + "\n", + "# Verify the result\n", + "print(collapsed_df['thread'])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "756" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "collapsed_df['thread'].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "df_handles = pd.read_json('/Users/shaharorielkagan/Downloads/userAccounts.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df_handles['server'] = 'twitter.com'\n", + "df_handles.rename(columns={'description': 'info'}, inplace=True)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "del df_handles['entities']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "llm_type=\"mistralai/mistral-7b-\"\n", + "\n", + "config = init_multi_chain_parser_config(\n", + " llm_type=llm_type,\n", + " post_process_type=\"combined\"\n", + " )\n", + "Eval = TwitterEval(config=config)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "df_eval = Eval.build_post_type_chart(df_handles=df_handles,df=collapsed_df)\n", + "fig1, fig2 = Eval.build_item_type_pie(df=df_eval)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n" ] } ], "source": [ - "df['server'] = 'twitter.com'\n", - "print(df)" + "import pandas as pd\n", + "\n", + "# Example dataframe\n", + "data = {\n", + " 'thread': [1, 0, 1, 1, 0],\n", + " 'cited_research': [1, 0, 0, 1, 1]\n", + "}\n", + "\n", + "df = pd.DataFrame(data)\n", + "count = df[(df['thread'] ==1) & (df['cited_research']==1)].shape[0]\n", + "# Display the shape of the dataframe\n", + "print(count) # Output: (5, 2)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Eval.feed_tweet_type_statistics(df=collapsed_df,name='sbuckshum')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "collapsed_df['citoid_research'] = None" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "df1 = collapsed_df[collapsed_df[\"username\"] == 'sbuckshum']" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.series.Series" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(df1.iloc[1])" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 14, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "\n" + "\u001b[32m2024-06-20 18:03:57.389\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m75\u001b[0m - \u001b[1mInitializing MultiChainParser. 
PostProcessType=combined\u001b[0m\n", + "\u001b[32m2024-06-20 18:03:57.394\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m82\u001b[0m - \u001b[1mInitializing post parsers...\u001b[0m\n", + "\u001b[32m2024-06-20 18:03:57.395\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'multi_refs_tagger' \u001b[0m\n", + "\u001b[32m2024-06-20 18:03:57.437\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'topics' \u001b[0m\n", + "\u001b[32m2024-06-20 18:03:57.448\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'keywords' \u001b[0m\n", + "\u001b[32m2024-06-20 18:03:57.461\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'hashtags' \u001b[0m\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'Series' object has no attribute 'md_ref_urls'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[14], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m Eval\u001b[39m.\u001b[39;49mnested_quotes_citoid(df1\u001b[39m.\u001b[39;49miloc[\u001b[39m1\u001b[39;49m])\n", + "File \u001b[0;32m~/sensemakers/nlp/notebooks/../desci_sense/evaluation/Evaluation_benchmark.py:246\u001b[0m, in \u001b[0;36mTwitterEval.nested_quotes_citoid\u001b[0;34m(self, post, steps, ind)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mnested_quotes_citoid\u001b[39m(\u001b[39mself\u001b[39m,post:RefPost,steps \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m, ind \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m):\n\u001b[1;32m 244\u001b[0m multi_chain_parser \u001b[39m=\u001b[39m MultiChainParser(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig)\n\u001b[0;32m--> 246\u001b[0m result \u001b[39m=\u001b[39m multi_chain_parser\u001b[39m.\u001b[39;49mprocess_ref_post(post\u001b[39m=\u001b[39;49mpost,active_list\u001b[39m=\u001b[39;49m[\u001b[39m\"\u001b[39;49m\u001b[39mhashtags\u001b[39;49m\u001b[39m\"\u001b[39;49m])\n\u001b[1;32m 247\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mpost urls\u001b[39m\u001b[39m\"\u001b[39m,result\u001b[39m.\u001b[39mreference_urls)\n\u001b[1;32m 248\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mItem types: \u001b[39m\u001b[39m\"\u001b[39m,result\u001b[39m.\u001b[39mitem_types)\n", + "File \u001b[0;32m~/sensemakers/nlp/notebooks/../desci_sense/shared_functions/parsers/multi_chain_parser.py:241\u001b[0m, in \u001b[0;36mMultiChainParser.process_ref_post\u001b[0;34m(self, post, active_list, unprocessed_urls)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[39mif\u001b[39;00m unprocessed_urls \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 239\u001b[0m unprocessed_urls \u001b[39m=\u001b[39m []\n\u001b[0;32m--> 241\u001b[0m md_dict \u001b[39m=\u001b[39m 
extract_posts_ref_metadata_dict(\n\u001b[1;32m 242\u001b[0m [post],\n\u001b[1;32m 243\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mconfig\u001b[39m.\u001b[39;49mmetadata_extract_config\u001b[39m.\u001b[39;49mextraction_method,\n\u001b[1;32m 244\u001b[0m )\n\u001b[1;32m 245\u001b[0m \u001b[39m# if no filter specified, run all chains\u001b[39;00m\n\u001b[1;32m 246\u001b[0m \u001b[39mif\u001b[39;00m active_list \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/sensemakers/nlp/notebooks/../desci_sense/shared_functions/web_extractors/metadata_extractors.py:167\u001b[0m, in \u001b[0;36mextract_posts_ref_metadata_dict\u001b[0;34m(posts, md_type)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mextract_posts_ref_metadata_dict\u001b[39m(\n\u001b[1;32m 160\u001b[0m posts: List[RefPost],\n\u001b[1;32m 161\u001b[0m md_type: MetadataExtractionType \u001b[39m=\u001b[39m MetadataExtractionType\u001b[39m.\u001b[39mCITOID,\n\u001b[1;32m 162\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Dict[\u001b[39mstr\u001b[39m, RefMetadata]:\n\u001b[1;32m 163\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 164\u001b[0m \u001b[39m Extract all reference urls from posts and fetch metadata for them.\u001b[39;00m\n\u001b[1;32m 165\u001b[0m \u001b[39m Return dict of metadata keyed by url.\u001b[39;00m\n\u001b[1;32m 166\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 167\u001b[0m all_ref_urls \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\u001b[39mset\u001b[39m(flatten([p\u001b[39m.\u001b[39;49mmd_ref_urls() \u001b[39mfor\u001b[39;49;00m p \u001b[39min\u001b[39;49;00m posts])))\n\u001b[1;32m 168\u001b[0m md_dict \u001b[39m=\u001b[39m extract_all_metadata_to_dict(\n\u001b[1;32m 169\u001b[0m all_ref_urls, md_type, max_summary_length\u001b[39m=\u001b[39m\u001b[39m500\u001b[39m\n\u001b[1;32m 170\u001b[0m )\n\u001b[1;32m 171\u001b[0m \u001b[39mreturn\u001b[39;00m md_dict\n", + "File \u001b[0;32m~/sensemakers/nlp/notebooks/../desci_sense/shared_functions/web_extractors/metadata_extractors.py:167\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mextract_posts_ref_metadata_dict\u001b[39m(\n\u001b[1;32m 160\u001b[0m posts: List[RefPost],\n\u001b[1;32m 161\u001b[0m md_type: MetadataExtractionType \u001b[39m=\u001b[39m MetadataExtractionType\u001b[39m.\u001b[39mCITOID,\n\u001b[1;32m 162\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Dict[\u001b[39mstr\u001b[39m, RefMetadata]:\n\u001b[1;32m 163\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 164\u001b[0m \u001b[39m Extract all reference urls from posts and fetch metadata for them.\u001b[39;00m\n\u001b[1;32m 165\u001b[0m \u001b[39m Return dict of metadata keyed by url.\u001b[39;00m\n\u001b[1;32m 166\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 167\u001b[0m all_ref_urls \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\u001b[39mset\u001b[39m(flatten([p\u001b[39m.\u001b[39;49mmd_ref_urls() \u001b[39mfor\u001b[39;00m p \u001b[39min\u001b[39;00m posts])))\n\u001b[1;32m 168\u001b[0m md_dict \u001b[39m=\u001b[39m extract_all_metadata_to_dict(\n\u001b[1;32m 169\u001b[0m all_ref_urls, md_type, max_summary_length\u001b[39m=\u001b[39m\u001b[39m500\u001b[39m\n\u001b[1;32m 170\u001b[0m )\n\u001b[1;32m 171\u001b[0m \u001b[39mreturn\u001b[39;00m md_dict\n", + "File 
\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py:5989\u001b[0m, in \u001b[0;36mNDFrame.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5982\u001b[0m \u001b[39mif\u001b[39;00m (\n\u001b[1;32m 5983\u001b[0m name \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_internal_names_set\n\u001b[1;32m 5984\u001b[0m \u001b[39mand\u001b[39;00m name \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_metadata\n\u001b[1;32m 5985\u001b[0m \u001b[39mand\u001b[39;00m name \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_accessors\n\u001b[1;32m 5986\u001b[0m \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_info_axis\u001b[39m.\u001b[39m_can_hold_identifiers_and_holds_name(name)\n\u001b[1;32m 5987\u001b[0m ):\n\u001b[1;32m 5988\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m[name]\n\u001b[0;32m-> 5989\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mobject\u001b[39;49m\u001b[39m.\u001b[39;49m\u001b[39m__getattribute__\u001b[39;49m(\u001b[39mself\u001b[39;49m, name)\n", + "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 'md_ref_urls'" ] } ], "source": [ - "print(type(df['urls'][0]))" + "Eval.nested_quotes_citoid(df1.iloc[1])" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "df_handles = df" + "inputs = Eval.dataframe_to_ref_posts(df1)\n", + "#Eval.nested_quotes_citoid_parallel(inputs)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0 twitter.com\n", - "1 twitter.com\n", - "2 twitter.com\n", - "3 twitter.com\n", - "4 twitter.com\n", - "5 twitter.com\n", - "6 twitter.com\n", - "7 twitter.com\n", - "8 twitter.com\n", - "9 twitter.com\n", - "10 twitter.com\n", - "11 twitter.com\n", - "12 twitter.com\n", - "13 twitter.com\n", - "14 twitter.com\n", - "Name: server, dtype: object\n" + "\n" ] } ], "source": [ - "df_handles['server'] = 'twitter.com'\n", - "print(df_handles['server'])" + "print(type(inputs[0]))" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 3, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " username id \\\n", - "0 mbauwens 14349894 \n", - "1 AnnaLeptikon 550845228 \n", - "2 nickmvincent 890345761690595328 \n", - "3 LChoshen 1006797311593377792 \n", - "4 yoginho 91583793 \n", - "5 CAdjovu 1364791141267251200 \n", - "6 EugeneVinitsky 1054895671164063744 \n", - "7 lizilla93 2787254215 \n", - "8 danwilliamsphil 217942278 \n", - "9 CriticalAI 1290793547122302976 \n", - "10 BrettButtliere 561750764 \n", - "11 jbakcoleman 1419907861 \n", - "12 yoavartzi 322460769 \n", - "13 ChombaBupe 1248939194 \n", - "14 1Br0wn 17900290 \n", - "\n", - " info \\\n", - "0 updates on p2p and commons developments; peer,... \n", - "1 nobody. Interested in cognitive science, ratio... \n", - "2 Assistant professor @SFU_CompSci, HCI, HCML, w... \n", - "3 🥇 #NLProc researcher\\n🥈 Opinionatedly Summariz... \n", - "4 I'm on Mastodon: \\n@yoginho@spore.social\\n\\nht... \n", - "5 Director @ledgerback | Curator @_distroid | In... \n", - "6 Anti-cynic. Artificial narrow intelligence. Au... \n", - "7 Scientist @ArcadiaScience working on computati... 
\n", - "8 Philosopher, University of Sussex. Tweets in p... \n", - "9 Critical AI's first issue out: https://read.du... \n", - "10 developing the future of science and society. ... \n", - "11 Associate Research Scientist @columbiajourn. A... \n", - "12 Research/prof @cs_cornell + @cornell_tech🚡 / h... \n", - "13 Tech entrepreneur | machine intelligence https... \n", - "14 💻 regulation/policy (🔑 #privacy 🗳) ⛷🚴🏻‍♂️🥾🗺 Vi... \n", - "\n", - " name server \n", - "0 Michel Bauwens twitter.com \n", - "1 Anna Riedl twitter.com \n", - "2 Nick Vincent twitter.com \n", - "3 ♻️ Leshem Choshen ♻️ twitter.com \n", - "4 Yogi Jaeger 💙 @yoginho@spore.social twitter.com \n", - "5 Charles Adjovu twitter.com \n", - "6 Eugene Vinitsky twitter.com \n", - "7 Elizabeth McDaniel twitter.com \n", - "8 Dan Williams twitter.com \n", - "9 Critical AI : first issue out! https://read.du... twitter.com \n", - "10 Zr. Nabu Kudurru twitter.com \n", - "11 Joe Bak-Coleman twitter.com \n", - "12 Yoav Artzi twitter.com \n", - "13 Chomba Bupe twitter.com \n", - "14 Ian Brown 🇮🇨 🦣 🦋 twitter.com \n" + "\u001b[32m2024-06-20 16:50:22.610\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mdesci_sense.shared_functions.utils\u001b[0m:\u001b[36munshorten_url\u001b[0m:\u001b[36m123\u001b[0m - \u001b[33m\u001b[1m[unshorten_url] RequestException for url https://www.teqsa.gov.au/About-us/engagement/consultation\u001b[0m\n" ] }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Python(12627) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.\n" + "author='TEQSA' content='With leading Australian experts, TEQSA has supported the development of assessment reform guiding principles to support the #HigherEd sector in responding to the opportunities and risks posed by #GenAI.\\n\\nFeedback on the principles closes 20 October.\\n\\n🤖 https://www.teqsa.gov.au/About-us/engagement/consultation https://twitter.com/TEQSAGov/status/1707292887883628989/photo/1' url='https://x.com/TEQSAGov/status/1707292887883628989' created_at=datetime.datetime(2023, 9, 28, 7, 15, 19, tzinfo=datetime.timezone.utc) metadata={'allSameType': True, 'combinedMediaUrl': None, 'communityNote': None, 'conversationID': '1707292887883628989', 'date': 'Thu Sep 28 07:15:19 +0000 2023', 'date_epoch': 1695885319, 'hasMedia': True, 'hashtags': ['HigherEd', 'GenAI'], 'likes': 49, 'mediaURLs': ['https://pbs.twimg.com/media/F7GE9z1aMAEU6k0.jpg'], 'media_extended': [{'altText': 'Consultation. Assessment reform for the age of artificial intelligence. Feedbackcloses Friday 20 October. 
teqsa.gov.au/consultation', 'size': {'height': 1080, 'width': 1080}, 'thumbnail_url': 'https://pbs.twimg.com/media/F7GE9z1aMAEU6k0.jpg', 'type': 'image', 'url': 'https://pbs.twimg.com/media/F7GE9z1aMAEU6k0.jpg'}], 'pollData': None, 'possibly_sensitive': False, 'qrt': None, 'qrtURL': None, 'replies': 0, 'retweets': 29, 'text': 'With leading Australian experts, TEQSA has supported the development of assessment reform guiding principles to support the #HigherEd sector in responding to the opportunities and risks posed by #GenAI.\\n\\nFeedback on the principles closes 20 October.\\n\\n🤖 https://www.teqsa.gov.au/About-us/engagement/consultation https://t.co/v9d6MeF5hi', 'tweetID': '1707292887883628989', 'tweetURL': 'https://twitter.com/TEQSAGov/status/1707292887883628989', 'user_name': 'TEQSA', 'user_profile_image_url': 'https://pbs.twimg.com/profile_images/765051369707311104/qSUt7iaC_normal.jpg', 'user_screen_name': 'TEQSAGov'} source_network='twitter' ref_urls=['https://www.teqsa.gov.au/About-us/engagement/consultation']\n" ] } ], "source": [ - "del df_handles['entities']\n", - "print(df_handles)" + "p=scrape_post('https://twitter.com/TEQSAGov/status/1707292887883628989')\n", + "print(p)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "fig1.savefig('example_figure.png')\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 15, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Python(12883) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.\n", - "Python(12884) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.\n", - "Python(12885) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.\n", - "Python(12886) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.\n" - ] + "data": { + "text/html": [ + "Finishing last run (ID:9h7tode8) before initializing another..." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "04cd33810e9144fab45e7485a9416413", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\\r'), FloatProgress(value=1.0, max=1.0)))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run desert-cherry-1 at: https://wandb.ai/common-sense-makers/post_type_stat/runs/9h7tode8
View project at: https://wandb.ai/common-sense-makers/post_type_stat
Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Find logs at: ./wandb/run-20240620_120255-9h7tode8/logs" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Successfully finished last run (ID:9h7tode8). Initializing new run:
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a08b3a80ec034374a0f22065f5968e8e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='Waiting for wandb.init()...\\r'), FloatProgress(value=0.011114446754153405, max=1.0…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "wandb version 0.17.2 is available! To upgrade, please run:\n", + " $ pip install wandb --upgrade" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { @@ -422,7 +870,7 @@ { "data": { "text/html": [ - "Run data is saved locally in /Users/shaharorielkagan/sensemakers/nlp/notebooks/wandb/run-20240418_145303-1h780x8q" + "Run data is saved locally in /Users/shaharorielkagan/sensemakers/nlp/notebooks/wandb/run-20240620_120330-kx4uq3n0" ], "text/plain": [ "" @@ -434,7 +882,7 @@ { "data": { "text/html": [ - "Syncing run easy-dew-8 to Weights & Biases (docs)
" + "Syncing run vague-deluge-2 to Weights & Biases (docs)
" ], "text/plain": [ "" @@ -446,7 +894,7 @@ { "data": { "text/html": [ - " View project at https://wandb.ai/common-sense-makers/filter_evaluation" + " View project at https://wandb.ai/common-sense-makers/post_type_stat" ], "text/plain": [ "" @@ -458,7 +906,7 @@ { "data": { "text/html": [ - " View run at https://wandb.ai/common-sense-makers/filter_evaluation/runs/1h780x8q" + " View run at https://wandb.ai/common-sense-makers/post_type_stat/runs/kx4uq3n0" ], "text/plain": [ "" @@ -470,12 +918,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "172f3e79de4d47e285937a321111e345", + "model_id": "d4d758ed427340de9a5e933b2db8643b", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "VBox(children=(Label(value='0.301 MB of 0.301 MB uploaded\\r'), FloatProgress(value=1.0, max=1.0)))" + "VBox(children=(Label(value='3.910 MB of 3.910 MB uploaded\\r'), FloatProgress(value=1.0, max=1.0)))" ] }, "metadata": {}, @@ -484,7 +932,7 @@ { "data": { "text/html": [ - " View run easy-dew-8 at: https://wandb.ai/common-sense-makers/filter_evaluation/runs/1h780x8q
View project at: https://wandb.ai/common-sense-makers/filter_evaluation
Synced 4 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)" + " View run vague-deluge-2 at: https://wandb.ai/common-sense-makers/post_type_stat/runs/kx4uq3n0
View project at: https://wandb.ai/common-sense-makers/post_type_stat
Synced 4 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)" ], "text/plain": [ "" @@ -496,7 +944,7 @@ { "data": { "text/html": [ - "Find logs at: ./wandb/run-20240418_145303-1h780x8q/logs" + "Find logs at: ./wandb/run-20240620_120330-kx4uq3n0/logs" ], "text/plain": [ "" @@ -507,11 +955,11 @@ } ], "source": [ - "wandb.init(project=\"filter_evaluation\")\n", + "wandb.init(project=\"post_type_stat\")\n", "artifact = wandb.Artifact(\"non_labeled_tweets\", type=\"dataset\")\n", "\n", "# Create a wandb.Table from the Pandas DataFrame\n", - "table1 = wandb.Table(dataframe=df)\n", + "table1 = wandb.Table(dataframe=collapsed_df)\n", "table2 = wandb.Table(dataframe=df_handles)\n", "\n", "\n", @@ -677,16 +1125,6 @@ "wandb.run.finish()" ] }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "#add a column that counts references\n", - "df['ref_count'] = df['urls'].apply(len)" - ] - }, { "cell_type": "code", "execution_count": 6, @@ -1935,6 +2373,21 @@ "print(df_labeled)" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Thread stat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 51, @@ -2077,69 +2530,6 @@ "wandb.run.finish()" ] }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
usernameidinfonameserver
0mbauwens14349894updates on p2p and commons developments; peer,...Michel Bauwenstwitter.com
\n", - "
" - ], - "text/plain": [ - " username id info \\\n", - "0 mbauwens 14349894 updates on p2p and commons developments; peer,... \n", - "\n", - " name server \n", - "0 Michel Bauwens twitter.com " - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_handles[df_handles['username']=='mbauwens']" - ] - }, { "cell_type": "code", "execution_count": 8, @@ -2369,129 +2759,6 @@ "\n", "wandb.run.finish()" ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'batch_size': 5, 'wandb_config': \"entity='common-sense-makers' project='st-demo-sandbox'\", 'parser_configs': [\"name='refs_tagger' type= llm_config=LLMConfig(llm_type='mistralai/mixtral-8x7b-instruct:nitro', temperature='0.6') use_metadata=True is_ref=True\", \"name='multi_refs_tagger' type= llm_config=LLMConfig(llm_type='mistralai/mixtral-8x7b-instruct:nitro', temperature='0.6') use_metadata=True is_multi_ref=True\", \"name='topics' type= llm_config=LLMConfig(llm_type='mistralai/mixtral-8x7b-instruct:nitro', temperature='0.6') use_metadata=True is_topic=True\", \"name='keywords' type= llm_config=LLMConfig(llm_type='mistralai/mixtral-8x7b-instruct:nitro', temperature='0.6') use_metadata=True max_keywords=6\", \"name='hashtags' type= llm_config=LLMConfig(llm_type='mistralai/mistral-7b-instruct', temperature='0.6') use_metadata=False max_hashtags=20\"], 'post_process_type': 'PostProcessType.COMBINED', 'openrouter_api_config': \"openrouter_api_base='https://openrouter.ai/api/v1' openrouter_api_key='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' openrouter_referer='http://localhost:3000'\", 'metadata_extract_config': \"extraction_method= max_summary_length=500\"}\n" - ] - } - ], - "source": [ - "print(dataset_run_id.config)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " A B C\n", - "0 1 4 7\n", - "1 2 5 8\n", - "2 3 6 9\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "data = {\n", - " 'A': [1, 2, 3],\n", - " 'B': [4, 5, 6],\n", - " 'C': [7, 8, 9]\n", - "}\n", - "df = pd.DataFrame(data)\n", - "print(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "6" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.iloc[2,1]" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "list = []\n", - "list.extend([1,2])" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1, 2]\n" - ] - } - ], - "source": [ - "print(list)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[1 2 3]\n", - " [4 5 6]\n", - " [7 8 9]]\n", - "Element at row 2, column 3: 6\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "\n", - "# Creating a NumPy matrix (2D array)\n", - "matrix = np.array([\n", - " [1, 2, 3],\n", - " [4, 5, 6],\n", - " [7, 8, 9]\n", - "])\n", - "print(matrix)\n", - "# Accessing the element at row 2, column 3 (1-based description)\n", - "element = matrix[1, 2] # Using zero-based indexing, so 1 is the second row, 2 is the third 
column\n", - "\n", - "print(\"Element at row 2, column 3:\", element)\n" - ] } ], "metadata": { diff --git a/nlp/notebooks/add_item_types_firebase.ipynb b/nlp/notebooks/add_item_types_firebase.ipynb new file mode 100644 index 00000000..b6aa4ddc --- /dev/null +++ b/nlp/notebooks/add_item_types_firebase.ipynb @@ -0,0 +1,208 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "from pathlib import Path\n", + "from datetime import datetime\n", + "import re\n", + "\n", + "import sys\n", + "sys.path.append(\"../\")\n", + "\n", + "from typing import List\n", + "from desci_sense.shared_functions.init import init_multi_chain_parser_config\n", + "from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser\n", + "from desci_sense.evaluation.utils import get_dataset, obj_to_json, obj_str_to_dict\n", + "from desci_sense.shared_functions.dataloaders import (\n", + " scrape_post,\n", + " convert_text_to_ref_post,\n", + ")\n", + "from desci_sense.shared_functions.configs import (\n", + " OpenrouterAPIConfig,\n", + " WandbConfig,\n", + " LLMConfig,\n", + " KeywordPParserChainConfig,\n", + " RefTaggerChainConfig,\n", + " TopicsPParserChainConfig,\n", + " validate_env_var,\n", + " MultiParserChainConfig,\n", + " ParserChainType,\n", + " PostProcessType,\n", + ") " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from desci_sense.shared_functions.interface import (\n", + " RDFTriplet,\n", + " isAConceptDefintion,\n", + " KeywordConceptDefinition,\n", + " ParserSupport,\n", + " ParserResult,\n", + " OntologyInterface,\n", + " ZoteroItemTypeDefinition,\n", + " )\n", + "from rdflib.namespace import RDF\n", + "from rdflib import URIRef, Literal, Graph" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "TEST_POST_TEXT_W_REF = \"\"\"\n", + "I really liked this paper!\n", + "https://arxiv.org/abs/2402.04607\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2024-07-08 15:57:35.197\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mInitializing MultiChainParser. 
PostProcessType=combined\u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.199\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m83\u001b[0m - \u001b[1mInitializing post parsers...\u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.199\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'multi_refs_tagger' \u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.241\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'topics' \u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.268\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'keywords' \u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.296\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'hashtags' \u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.411\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mdesci_sense.shared_functions.web_extractors.citoid\u001b[0m:\u001b[36mbefore_retry\u001b[0m:\u001b[36m84\u001b[0m - \u001b[33m\u001b[1mRetry attempt 1\u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.412\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mdesci_sense.shared_functions.web_extractors.citoid\u001b[0m:\u001b[36mfetch_citation\u001b[0m:\u001b[36m154\u001b[0m - \u001b[34m\u001b[1mfetching citoid data for: https://arxiv.org/abs/2402.04607\u001b[0m\n", + "\u001b[32m2024-07-08 15:57:37.558\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36mprocess_ref_post\u001b[0m:\u001b[36m265\u001b[0m - \u001b[34m\u001b[1mProcessing post with parsers: ['multi_refs_tagger', 'topics', 'keywords', 'hashtags']\u001b[0m\n", + "\u001b[32m2024-07-08 15:57:37.559\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36mprocess_ref_post\u001b[0m:\u001b[36m267\u001b[0m - \u001b[34m\u001b[1mInstantiating prompts...\u001b[0m\n", + "\u001b[32m2024-07-08 15:57:37.560\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36mprocess_ref_post\u001b[0m:\u001b[36m272\u001b[0m - \u001b[34m\u001b[1mInvoking parallel chain...\u001b[0m\n" + ] + } + ], + "source": [ + "multi_config = init_multi_chain_parser_config(llm_type='google/gemma-7b-it:free',\n", + " post_process_type=\"combined\")\n", + "multi_config.post_process_type = PostProcessType.COMBINED\n", + "mcp = MultiChainParser(multi_config)\n", + "res = mcp.process_text(TEST_POST_TEXT_W_REF)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['https://arxiv.org/abs/2402.04607']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res.reference_urls" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"['preprint']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res.item_types" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def convert_item_types_to_rdf_triplets(item_types: List[str], reference_urls: List[str]) -> List[RDFTriplet]:\n", + " assert len(res.reference_urls) == len(res.item_types)\n", + " triplets = [\n", + " RDFTriplet(\n", + " subject=URIRef(ref_url),\n", + " predicate=URIRef(ZoteroItemTypeDefinition().uri),\n", + " object=Literal(item_type),\n", + " )\n", + " for ref_url, item_type in zip(reference_urls, item_types)\n", + " ]\n", + "\n", + " return triplets" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "ename": "ValidationError", + "evalue": "2 validation errors for RDFTriplet\nsubject.is-instance[Literal]\n Input should be an instance of Literal [type=is_instance_of, input_value='https://arxiv.org/abs/2402.04607', input_type=str]\n For further information visit https://errors.pydantic.dev/2.6/v/is_instance_of\nsubject.is-instance[URIRef]\n Input should be an instance of URIRef [type=is_instance_of, input_value='https://arxiv.org/abs/2402.04607', input_type=str]\n For further information visit https://errors.pydantic.dev/2.6/v/is_instance_of", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mconvert_item_types_to_rdf_triplets\u001b[49m\u001b[43m(\u001b[49m\u001b[43mres\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mitem_types\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mres\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreference_urls\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[12], line 3\u001b[0m, in \u001b[0;36mconvert_item_types_to_rdf_triplets\u001b[0;34m(item_types, reference_urls)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mconvert_item_types_to_rdf_triplets\u001b[39m(item_types: List[\u001b[38;5;28mstr\u001b[39m], reference_urls: List[\u001b[38;5;28mstr\u001b[39m]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[RDFTriplet]:\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(res\u001b[38;5;241m.\u001b[39mreference_urls) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(res\u001b[38;5;241m.\u001b[39mitem_types)\n\u001b[0;32m----> 3\u001b[0m triplets \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mRDFTriplet\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43msubject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mref_url\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredicate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mURIRef\u001b[49m\u001b[43m(\u001b[49m\u001b[43mZoteroItemTypeDefinition\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muri\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;28;43mobject\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mLiteral\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem_type\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mref_url\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem_type\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mzip\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mreference_urls\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem_types\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m triplets\n", + "Cell \u001b[0;32mIn[12], line 4\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mconvert_item_types_to_rdf_triplets\u001b[39m(item_types: List[\u001b[38;5;28mstr\u001b[39m], reference_urls: List[\u001b[38;5;28mstr\u001b[39m]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[RDFTriplet]:\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(res\u001b[38;5;241m.\u001b[39mreference_urls) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(res\u001b[38;5;241m.\u001b[39mitem_types)\n\u001b[1;32m 3\u001b[0m triplets \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m----> 4\u001b[0m \u001b[43mRDFTriplet\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43msubject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mref_url\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredicate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mURIRef\u001b[49m\u001b[43m(\u001b[49m\u001b[43mZoteroItemTypeDefinition\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muri\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mobject\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mLiteral\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem_type\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ref_url, item_type \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(reference_urls, item_types)\n\u001b[1;32m 10\u001b[0m ]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m triplets\n", + "File \u001b[0;32m~/anaconda3/envs/asensebot/lib/python3.11/site-packages/pydantic/main.py:171\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 170\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mValidationError\u001b[0m: 2 validation errors for RDFTriplet\nsubject.is-instance[Literal]\n Input should be an instance of Literal [type=is_instance_of, input_value='https://arxiv.org/abs/2402.04607', input_type=str]\n For further information visit https://errors.pydantic.dev/2.6/v/is_instance_of\nsubject.is-instance[URIRef]\n Input should be an instance of URIRef [type=is_instance_of, input_value='https://arxiv.org/abs/2402.04607', input_type=str]\n For further information visit https://errors.pydantic.dev/2.6/v/is_instance_of" + ] + } + ], + "source": [ + "convert_item_types_to_rdf_triplets(res.item_types, res.reference_urls)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "asensebot", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nlp/tests/test_citoid.py b/nlp/tests/test_citoid.py index 1a6b5bb7..d21a1dbb 100644 --- a/nlp/tests/test_citoid.py +++ b/nlp/tests/test_citoid.py @@ -57,8 +57,6 @@ def test_i99(): from urllib.parse import urlparse -from url_normalize import url_normalize - def identify_social_media(url): """ diff --git a/nlp/tests/test_multi_chain_app_interface.py b/nlp/tests/test_multi_chain_app_interface.py index 0dbd83f5..2cdfdfb3 100644 --- a/nlp/tests/test_multi_chain_app_interface.py +++ b/nlp/tests/test_multi_chain_app_interface.py @@ -106,34 +106,34 @@ def test_thread_trim(): ] assert len(res.multi_reference_tagger) == 5 assert res.multi_reference_tagger[3:] == [["default"], ["default"]] - assert TARGET_THREAD_RENDER in res.debug["multi_reference_tagger"]["prompt"] + # assert TARGET_THREAD_RENDER in res.debug["multi_reference_tagger"]["prompt"] assert no_empty_lists(res.multi_reference_tagger) -def test_batch(): - multi_config = MultiParserChainConfig( - parser_configs=[ - MultiRefTaggerChainConfig( - name="multi_ref_tagger", - llm_config=LLMConfig(llm_type="google/gemma-7b-it"), - post_renderer=PostRendererType.THREAD_REF_POST, - ) - ], - post_process_type=PostProcessType.COMBINED, - metadata_extract_config=MetadataExtractionConfig(extraction_method="citoid"), - ) - mcp = MultiChainParser(multi_config) - thread = get_thread_1() - pi_1 = ParserInput(thread_post=thread, max_posts=1) - pi_2 = ParserInput(thread_post=thread, max_posts=3) - pi_3 = ParserInput(thread_post=thread, max_posts=4) - batch = [pi_1, pi_2, pi_3] - res = mcp.batch_process_parser_inputs(batch) - assert len(res[0].debug["multi_reference_tagger"]["reasoning"]) == 1 - assert len(res[1].debug["multi_reference_tagger"]["reasoning"]) == 3 - assert len(res[2].debug["multi_reference_tagger"]["reasoning"]) == 4 - for result in res: - assert no_empty_lists(result.multi_reference_tagger) +# def test_batch(): +# multi_config = MultiParserChainConfig( +# parser_configs=[ +# MultiRefTaggerChainConfig( +# name="multi_ref_tagger", +# llm_config=LLMConfig(llm_type="google/gemma-7b-it"), +# post_renderer=PostRendererType.THREAD_REF_POST, +# ) +# ], +# post_process_type=PostProcessType.COMBINED, +# metadata_extract_config=MetadataExtractionConfig(extraction_method="citoid"), +# ) +# mcp = MultiChainParser(multi_config) +# thread = 
get_thread_1() +# pi_1 = ParserInput(thread_post=thread, max_posts=1) +# pi_2 = ParserInput(thread_post=thread, max_posts=3) +# pi_3 = ParserInput(thread_post=thread, max_posts=4) +# batch = [pi_1, pi_2, pi_3] +# res = mcp.batch_process_parser_inputs(batch) +# assert len(res[0].debug["multi_reference_tagger"]["reasoning"]) == 1 +# assert len(res[1].debug["multi_reference_tagger"]["reasoning"]) == 3 +# assert len(res[2].debug["multi_reference_tagger"]["reasoning"]) == 4 +# for result in res: +# assert no_empty_lists(result.multi_reference_tagger) def test_citoid_unprocessed_urls(): @@ -163,8 +163,6 @@ def test_citoid_unprocessed_urls(): # "mistralai/mistral-7b-instruct:free" # "google/gemma-7b-it" if __name__ == "__main__": - thread = get_thread_1() - pi_1 = ParserInput(thread_post=thread, max_posts=1) multi_config = MultiParserChainConfig( parser_configs=[ MultiRefTaggerChainConfig( @@ -177,11 +175,20 @@ def test_citoid_unprocessed_urls(): metadata_extract_config=MetadataExtractionConfig(extraction_method="citoid"), ) mcp = MultiChainParser(multi_config) - res = mcp.process_parser_input(pi_1) - assert res.filter_classification == SciFilterClassfication.CITOID_DETECTED_RESEARCH - - - + thread = get_thread_1() + pi = ParserInput(thread_post=thread, max_posts=1) + res = mcp.process_parser_input(pi) + assert res.reference_urls == [ + "https://x.com/FDAadcomms/status/1798104612635070611", + "https://journals.sagepub.com/doi/10.1177/20451253231198466", + "https://www.youtube.com/watch?feature=youtu.be&si=kjMtNR1Hwe7NZ8as&v=WknlkmJee4E", + "https://x.com/eturnermd1/status/1798046087737180395", + "https://x.com/FDAadcomms/status/1798107142219796794", + ] + assert len(res.multi_reference_tagger) == 5 + assert res.multi_reference_tagger[3:] == [["default"], ["default"]] + assert TARGET_THREAD_RENDER in res.debug["multi_reference_tagger"]["prompt"] + assert no_empty_lists(res.multi_reference_tagger) # parse_request = create_post_request() # multi_config = MultiParserChainConfig( # parser_configs=[ diff --git a/nlp/tests/test_multi_chain_post_processing.py b/nlp/tests/test_multi_chain_post_processing.py index 43610f6e..70d8d900 100644 --- a/nlp/tests/test_multi_chain_post_processing.py +++ b/nlp/tests/test_multi_chain_post_processing.py @@ -11,7 +11,7 @@ import os import pytest from pydantic import ValidationError - +from rdflib import URIRef, Literal, Graph from utils import create_multi_chain_for_tests, create_multi_config_for_tests from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser from desci_sense.shared_functions.filters import SciFilterClassfication @@ -27,10 +27,17 @@ ParserChainType, PostProcessType, ) # Adjust the import as necessary +from desci_sense.shared_functions.postprocessing import ( + convert_item_types_to_rdf_triplets, +) from desci_sense.shared_functions.dataloaders import ( scrape_post, convert_text_to_ref_post, ) +from desci_sense.shared_functions.interface import ( + RDFTriplet, + ZoteroItemTypeDefinition, +) TEST_POST_TEXT_W_REF = """ I really liked this paper! 
@@ -56,6 +63,16 @@ def test_firebase_pp(): res = mcp.process_text(TEST_POST_TEXT_W_REF) len(res.support.refs_meta) == 1 assert res.filter_classification == SciFilterClassfication.CITOID_DETECTED_RESEARCH + # check item types + expected = [ + RDFTriplet( + subject=URIRef("https://arxiv.org/abs/2402.04607"), + predicate=URIRef(ZoteroItemTypeDefinition().uri), + object=Literal("preprint"), + ), + ] + for triplet in expected: + assert (triplet.subject, triplet.predicate, triplet.object) in res.semantics def test_multi_chain_batch_pp_simple(): @@ -110,18 +127,71 @@ def test_multi_chain_batch_pp_combined(): ) -if __name__ == "__main__": - # get a few posts for input - urls = [ - "https://mastodon.social/@psmaldino@qoto.org/111405098400404613", - "https://mastodon.social/@UlrikeHahn@fediscience.org/111732713776994953", - "https://mastodon.social/@ronent/111687038322549430", +def test_convert_item_types_to_rdf_triplets_single_entry(): + item_types = ["preprint"] + reference_urls = ["https://arxiv.org/abs/2402.04607"] + result = convert_item_types_to_rdf_triplets(item_types, reference_urls) + + expected = [ + RDFTriplet( + subject=URIRef("https://arxiv.org/abs/2402.04607"), + predicate=URIRef(ZoteroItemTypeDefinition().uri), + object=Literal("preprint"), + ) ] - posts = [scrape_post(url) for url in urls] - multi_config = create_multi_config_for_tests(llm_type="google/gemma-7b-it") - multi_chain_parser = MultiChainParser(multi_config) - multi_chain_parser.config.post_process_type = PostProcessType.FIREBASE - res = multi_chain_parser.batch_process_ref_posts(posts) + + assert len(result) == len(expected) + for res, exp in zip(result, expected): + assert res.subject == exp.subject + assert res.predicate == exp.predicate + assert res.object == exp.object + + +def test_convert_item_types_to_rdf_triplets_multiple_entries(): + item_types = ["journalArticle", "book"] + reference_urls = ["https://example.com/article1", "https://example.com/book1"] + result = convert_item_types_to_rdf_triplets(item_types, reference_urls) + + expected = [ + RDFTriplet( + subject=URIRef("https://example.com/article1"), + predicate=URIRef(ZoteroItemTypeDefinition().uri), + object=Literal("journalArticle"), + ), + RDFTriplet( + subject=URIRef("https://example.com/book1"), + predicate=URIRef(ZoteroItemTypeDefinition().uri), + object=Literal("book"), + ), + ] + + assert len(result) == len(expected) + for res, exp in zip(result, expected): + assert res.subject == exp.subject + assert res.predicate == exp.predicate + assert res.object == exp.object + + +def test_convert_item_types_to_rdf_triplets_empty(): + item_types = [] + reference_urls = [] + result = convert_item_types_to_rdf_triplets(item_types, reference_urls) + assert result == [] + + +def test_convert_item_types_to_rdf_triplets_mismatched_lengths(): + item_types = ["preprint", "book"] + reference_urls = ["https://arxiv.org/abs/2402.04607"] + + with pytest.raises(AssertionError): + convert_item_types_to_rdf_triplets(item_types, reference_urls) + + +if __name__ == "__main__": + multi_config = create_multi_config_for_tests() + multi_config.post_process_type = PostProcessType.COMBINED + mcp = MultiChainParser(multi_config) + res = mcp.process_text(TEST_POST_TEXT_W_REF) # len(res.support.refs_meta) == 1 # assert "test" in mcp.pparsers diff --git a/nlp/tests/test_post_app_interface.py b/nlp/tests/test_post_app_interface.py index b5db3242..872f0ee3 100644 --- a/nlp/tests/test_post_app_interface.py +++ b/nlp/tests/test_post_app_interface.py @@ -1,5 +1,6 @@ import sys from pathlib 
import Path +import pytest ROOT = Path(__file__).parents[1] sys.path.append(str(ROOT)) @@ -21,7 +22,12 @@ ThreadRefPost, QuoteRefPost, ) -from desci_sense.shared_functions.interface import AppThread, ParsePostRequest +from desci_sense.shared_functions.interface import ( + AppThread, + ParsePostRequest, + Author, + AppPost, +) from desci_sense.shared_functions.dataloaders import scrape_post from desci_sense.shared_functions.dataloaders.twitter.twitter_utils import ( extract_external_ref_urls, @@ -36,6 +42,40 @@ preproc_parser_input, ) +QUOTED_THREAD_I123 = { + "author": { + "platformId": "twitter", + "id": "author_123", + "username": "user123", + "name": "John Doe", + }, + "url": "https://x.com/fchollet/status/1810833882037825646", + "thread": [ + { + "url": "https://x.com/fchollet/status/1810833882037825646", + "content": "The fact is that tech bubbles have very little to do with the technical or even commercial merits of the technology they form around. They can happen with worthless narratives or with entirely grounded ones. They don't even require unrealistic revenue projections!", + } + ], +} + +TEST_POST_I123 = { + "author": { + "platformId": "twitter", + "id": "author_456", + "username": "user546", + "name": "Sarah Gore", + }, + "url": "https://example.com/post/2", + "thread": [ + { + "url": "https://example.com/post/2", + "content": "Something deep alluded to here: - In theory we have free markets as superhuman information processors (Hayek) - In practice, that information processing is limited by the bounded, fallible & biased cognition of individual investors wielding outsized influence https://twitter.com/fchollet/status/1810833882037825646", + "quotedThread": QUOTED_THREAD_I123, + } + ], +} + + TEST_THREAD_INTERFACE_2 = { "url": "https://example.com/post/2", "thread": [ @@ -278,10 +318,67 @@ def test_load_real_thread(): ] +def test_author_platform_id_lowercase(): + author = Author(id="123", name="John Doe", username="johndoe", platformId="Twitter") + assert author.platformId == "twitter" + + +def test_author_invalid_platform_id(): + with pytest.raises(ValidationError): + Author( + id="123", name="John Doe", username="johndoe", platformId=123 + ) # platformId should be a string + + +def test_app_post_normalize_content_urls(): + content = "Check out this tweet: https://twitter.com/user/status/1234567890" + expected_content = "Check out this tweet: https://x.com/user/status/1234567890" + post = AppPost(content=content) + assert post.content == expected_content + + +def test_app_post_normalize_url(): + url = "https://twitter.com/user/status/1234567890" + expected_url = "https://x.com/user/status/1234567890" + post = AppPost(content="Test post", url=url) + assert post.url == expected_url + + +def test_app_post_no_normalization_needed(): + content = "This is a test post with no Twitter URLs." 
+ url = "https://example.com" + post = AppPost(content=content, url=url) + assert post.content == content + assert post.url == url + + +def test_app_thread_normalize_url(): + url = "https://twitter.com/user/status/1234567890" + expected_url = "https://x.com/user/status/1234567890" + author = Author(id="123", name="John Doe", username="johndoe", platformId="twitter") + post = AppPost(content="Test post", url=url) + thread = AppThread(author=author, thread=[post], url=url) + assert thread.url == expected_url + + +def test_app_thread_author(): + author = Author(id="123", name="John Doe", username="johndoe", platformId="Twitter") + post = AppPost(content="Test post") + thread = AppThread(author=author, thread=[post]) + assert thread.author.name == "John Doe" + assert thread.author.platformId == "twitter" + + +def test_i123(): + thread = AppThread.model_validate(TEST_POST_I123) + thread_ref_post = convert_thread_interface_to_ref_post(thread) + assert thread_ref_post.md_ref_urls() == [ + "https://x.com/fchollet/status/1810833882037825646" + ] if __name__ == "__main__": - thread = AppThread.model_validate(TEST_OVERLENGTH_THREAD_INTERFACE) + thread = AppThread.model_validate(TEST_POST_I123) thread_ref_post = convert_thread_interface_to_ref_post(thread) - pi = ParserInput(thread_post=thread_ref_post, max_posts=30) - proc_pi = preproc_parser_input(pi) + # pi = ParserInput(thread_post=thread_ref_post, max_posts=30) + # proc_pi = preproc_parser_input(pi) diff --git a/nlp/tests/test_quoted_post_firebase.py b/nlp/tests/test_quoted_post_firebase.py new file mode 100644 index 00000000..29f4fc91 --- /dev/null +++ b/nlp/tests/test_quoted_post_firebase.py @@ -0,0 +1,150 @@ +import sys +from pathlib import Path + +# https://stackoverflow.com/a/63539722/2882125 +import nest_asyncio + +nest_asyncio.apply() + +ROOT = Path(__file__).parents[1] +sys.path.append(str(ROOT)) +import os +import pytest +from pydantic import ValidationError, BaseModel +from rdflib import URIRef, Literal, Graph +from utils import ( + create_multi_chain_for_tests, + create_multi_config_for_tests, + get_thread_1, + no_empty_lists, + create_post_request, +) +from desci_sense.shared_functions.schema.post import ThreadRefPost, QuoteRefPost +from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser +from desci_sense.shared_functions.configs import ( + OpenrouterAPIConfig, + WandbConfig, + LLMConfig, + MultiRefTaggerChainConfig, + KeywordPParserChainConfig, + RefTaggerChainConfig, + TopicsPParserChainConfig, + validate_env_var, + MultiParserChainConfig, + ParserChainType, + PostProcessType, + PostRendererType, + MetadataExtractionConfig, +) # Adjust the import as necessary +from desci_sense.shared_functions.dataloaders import ( + scrape_post, + convert_text_to_ref_post, +) +from desci_sense.shared_functions.interface import (RDFTriplet, QuotedPostDefinition,) +from desci_sense.shared_functions.preprocessing import ParserInput +from desci_sense.shared_functions.filters import SciFilterClassfication + +TEST_POST_TEXT_W_REF = """ +I really liked this paper! +https://arxiv.org/abs/2402.04607 +""" + +TARGET_THREAD_RENDER = """- Author: Eiko Fried +- Content: After careful consideration, the FDA advisory comission voted today 9:2 that MDMA has *not* been shown to be effective for treating PTSD, given massive concerns around validity threats in this literature. They also voted 10:1 that MDMA has *not* shown to be safe. 
diff --git a/nlp/tests/test_quoted_post_firebase.py b/nlp/tests/test_quoted_post_firebase.py
new file mode 100644
index 00000000..29f4fc91
--- /dev/null
+++ b/nlp/tests/test_quoted_post_firebase.py
@@ -0,0 +1,150 @@
+import sys
+from pathlib import Path
+
+# https://stackoverflow.com/a/63539722/2882125
+import nest_asyncio
+
+nest_asyncio.apply()
+
+ROOT = Path(__file__).parents[1]
+sys.path.append(str(ROOT))
+import os
+import pytest
+from pydantic import ValidationError, BaseModel
+from rdflib import URIRef, Literal, Graph
+from utils import (
+    create_multi_chain_for_tests,
+    create_multi_config_for_tests,
+    get_thread_1,
+    no_empty_lists,
+    create_post_request,
+)
+from desci_sense.shared_functions.schema.post import ThreadRefPost, QuoteRefPost
+from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser
+from desci_sense.shared_functions.configs import (
+    OpenrouterAPIConfig,
+    WandbConfig,
+    LLMConfig,
+    MultiRefTaggerChainConfig,
+    KeywordPParserChainConfig,
+    RefTaggerChainConfig,
+    TopicsPParserChainConfig,
+    validate_env_var,
+    MultiParserChainConfig,
+    ParserChainType,
+    PostProcessType,
+    PostRendererType,
+    MetadataExtractionConfig,
+)  # Adjust the import as necessary
+from desci_sense.shared_functions.dataloaders import (
+    scrape_post,
+    convert_text_to_ref_post,
+)
+from desci_sense.shared_functions.interface import (RDFTriplet, QuotedPostDefinition,)
+from desci_sense.shared_functions.preprocessing import ParserInput
+from desci_sense.shared_functions.filters import SciFilterClassfication
+
+TEST_POST_TEXT_W_REF = """
+I really liked this paper!
+https://arxiv.org/abs/2402.04607
+"""
+
+TARGET_THREAD_RENDER = """- Author: Eiko Fried
+- Content: After careful consideration, the FDA advisory comission voted today 9:2 that MDMA has *not* been shown to be effective for treating PTSD, given massive concerns around validity threats in this literature. They also voted 10:1 that MDMA has *not* shown to be safe.
+@eturnermd1 #MDMAadcomm VOTE 1/2: Do the available data show that the drug is effective in patients with posttraumatic
+stress disorder?
+2-Yes
+9-No
+0-Abstain https://twitter.com/FDAadcomms/status/1798104612635070611/photo/1
+---
+📄Many mentioned reasons overlap with those we summarized recently in our review paper:
+
+
+📺 I also summarize them for a lay audience in this YouTube video:
+
+- References:
+
+url: https://x.com/FDAadcomms/status/1798104612635070611
+item_type: forumPost
+title: Twitter post
+summary: None
+==========
+
+url: https://journals.sagepub.com/doi/10.1177/20451253231198466
+item_type: journalArticle
+title: History repeating: guidelines to address common problems in psychedelic science
+summary: None
+==========
+
+url: https://www.youtube.com/watch?feature=youtu.be&si=kjMtNR1Hwe7NZ8as&v=WknlkmJee4E
+item_type: videoRecording
+title: Psychedelic treatments for mental health problems: promises and pitfalls
+summary: In this lecture, I summarize promises and pitfalls of psychedelic treatments for mental health problems. No scientific background knowledge is required to vi...
+=========="""
+
+
+def test_thread_quoted_url():
+    thread = get_thread_1()
+    pi_1 = ParserInput(thread_post=thread, max_posts=30)
+    multi_config = MultiParserChainConfig(
+        parser_configs=[
+            MultiRefTaggerChainConfig(
+                name="multi_ref_tagger",
+                llm_config=LLMConfig(llm_type="mistralai/mistral-7b-instruct:free"),
+                post_renderer=PostRendererType.THREAD_REF_POST,
+            )
+        ],
+        post_process_type=PostProcessType.COMBINED,
+        metadata_extract_config=MetadataExtractionConfig(extraction_method="citoid"),
+    )
+    mcp = MultiChainParser(multi_config)
+    res = mcp.process_parser_input(pi_1)
+    assert res.quoted_post_url == thread.quoted_url
+    assert thread.quoted_url == 'https://x.com/FDAadcomms/status/1798104612635070611'
+
+
+def test_firebase_quoted_post_pp():
+    thread = get_thread_1()
+    pi_1 = ParserInput(thread_post=thread, max_posts=30)
+    multi_config = MultiParserChainConfig(
+        parser_configs=[
+            MultiRefTaggerChainConfig(
+                name="multi_ref_tagger",
+                llm_config=LLMConfig(llm_type="mistralai/mistral-7b-instruct:free"),
+                post_renderer=PostRendererType.THREAD_REF_POST,
+            )
+        ],
+        post_process_type=PostProcessType.FIREBASE,
+        metadata_extract_config=MetadataExtractionConfig(extraction_method="citoid"),
+    )
+    mcp = MultiChainParser(multi_config)
+    res = mcp.process_parser_input(pi_1)
+    expected = RDFTriplet(
+        predicate=URIRef(QuotedPostDefinition().uri),
+        object=URIRef("https://x.com/FDAadcomms/status/1798104612635070611"),
+    )
+    assert (expected.subject, expected.predicate, expected.object) in res.semantics
+
+
+# "mistralai/mixtral-8x7b-instruct"
+# "mistralai/mistral-7b-instruct:free"
+# "google/gemma-7b-it"
+if __name__ == "__main__":
+    thread = get_thread_1()
+    pi_1 = ParserInput(thread_post=thread, max_posts=30)
+    multi_config = MultiParserChainConfig(
+        parser_configs=[
+            MultiRefTaggerChainConfig(
+                name="multi_ref_tagger",
+                llm_config=LLMConfig(llm_type="mistralai/mistral-7b-instruct:free"),
+                post_renderer=PostRendererType.THREAD_REF_POST,
+            )
+        ],
+        post_process_type=PostProcessType.FIREBASE,
+        metadata_extract_config=MetadataExtractionConfig(extraction_method="citoid"),
+    )
+    mcp = MultiChainParser(multi_config)
+    res = mcp.process_parser_input(pi_1)
+
+
+
+
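
# NOTE: a small, self-contained illustration of the triple-membership check used in
# test_firebase_quoted_post_pp above. It assumes res.semantics behaves like an
# rdflib Graph; the subject and predicate URIs below are placeholders, not the real
# QuotedPostDefinition URI or the test thread's URL.
from rdflib import Graph, URIRef

graph = Graph()
quoting_post = URIRef("https://x.com/user546/status/1")  # placeholder subject
quotes_predicate = URIRef("https://example.org/quotesPost")  # placeholder predicate
quoted_post = URIRef("https://x.com/FDAadcomms/status/1798104612635070611")

graph.add((quoting_post, quotes_predicate, quoted_post))

# rdflib Graphs support `in` for (subject, predicate, object) tuples,
# which is what the test's final assertion relies on.
assert (quoting_post, quotes_predicate, quoted_post) in graph
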
diff --git a/nlp/tests/test_twitter.py b/nlp/tests/test_twitter.py
index 3b99a4ef..e07ecd8d 100644
--- a/nlp/tests/test_twitter.py
+++ b/nlp/tests/test_twitter.py
@@ -1,10 +1,13 @@
 import sys
 from pathlib import Path
+
 ROOT = Path(__file__).parents[1]
 sys.path.append(str(ROOT))

 import pytest
+from desci_sense.shared_functions.utils import normalize_tweet_urls_in_text
+
 from desci_sense.shared_functions.dataloaders import (
     PostScrapeError,
     UnknownSocialMediaTypeError,
@@ -112,6 +115,48 @@ def test_problem_tweet_i31():
     ), f"{case} has_refs? = {tweet.has_refs()} - mismatch with {label}"


+def test_normalize_single_twitter_url():
+    text = "Check out this tweet: https://twitter.com/user/status/1234567890"
+    expected = "Check out this tweet: https://x.com/user/status/1234567890"
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
+def test_normalize_multiple_twitter_urls():
+    text = "First tweet: https://twitter.com/user1/status/1234567890 and second tweet: https://twitter.com/user2/status/0987654321"
+    expected = "First tweet: https://x.com/user1/status/1234567890 and second tweet: https://x.com/user2/status/0987654321"
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
+def test_mixed_urls():
+    text = "Tweet: https://twitter.com/user/status/1234567890 and a non-Twitter URL: https://example.com/page"
+    expected = "Tweet: https://x.com/user/status/1234567890 and a non-Twitter URL: https://example.com/page"
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
+def test_no_twitter_url():
+    text = "This text contains no Twitter URLs, only this: https://example.com/page"
+    expected = "This text contains no Twitter URLs, only this: https://example.com/page"
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
+def test_empty_string():
+    text = ""
+    expected = ""
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
+def test_url_with_http():
+    text = "Check out this tweet: http://twitter.com/user/status/1234567890"
+    expected = "Check out this tweet: https://x.com/user/status/1234567890"
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
+def test_mixed_case_url():
+    text = "Check out this tweet: https://Twitter.com/user/status/1234567890"
+    expected = "Check out this tweet: https://x.com/user/status/1234567890"
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
 if __name__ == "__main__":
     post_url = "https://twitter.com/example/status/1234567890"
     tweet = scrape_post(post_url)
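
# NOTE: the tests above fully specify the contract of normalize_tweet_urls_in_text
# (http/https and any casing of twitter.com are rewritten to https://x.com/, other
# URLs and plain text are untouched, and the empty string maps to itself). A minimal
# regex-based sketch satisfying that contract, not necessarily the project's actual
# implementation, could look like this:
import re

TWITTER_URL_PATTERN = re.compile(r"https?://twitter\.com/", flags=re.IGNORECASE)


def normalize_tweet_urls_in_text_sketch(text: str) -> str:
    # Rewrite every twitter.com URL prefix to the canonical https://x.com/ form.
    return TWITTER_URL_PATTERN.sub("https://x.com/", text)


assert normalize_tweet_urls_in_text_sketch(
    "Check out this tweet: http://Twitter.com/user/status/1234567890"
) == "Check out this tweet: https://x.com/user/status/1234567890"
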