diff --git a/nlp/desci_sense/evaluation/Evaluation_benchmark.py b/nlp/desci_sense/evaluation/Evaluation_benchmark.py new file mode 100644 index 00000000..f3f08e1c --- /dev/null +++ b/nlp/desci_sense/evaluation/Evaluation_benchmark.py @@ -0,0 +1,585 @@ +#evaluation of the academic filter, it takes as a input a dataset and a handle table. +# the handle table holds information about the accounts that published the dataset posts + +"""Script to run evaluation of label prediction models. + +Usage: + filter_evaluation.py [--config=] [--dataset=] [--dataset_file=] [--handle_file=] + + +Options: +--config= Optional path to configuration file. +--dataset= Optional path to a wandb artifact. +--dataset_file= Optional dataset file name e.g. labeled_dataset.table.json indeed it should be a table.json format +--handle_file= Optional file name e.g. labeled_dataset.table.json indeed it should be a table.json format + +""" +from datetime import datetime +import wandb +from pathlib import Path +import pandas as pd +import numpy as np +import sys +import docopt +import re +from collections import Counter +import concurrent.futures +from tqdm import tqdm +from sklearn.preprocessing import LabelBinarizer +from sklearn.metrics import ( + precision_recall_fscore_support, + accuracy_score, + confusion_matrix +) +import matplotlib.pyplot as plt + + + +sys.path.append(str(Path(__file__).parents[2])) + +from desci_sense.evaluation.utils import ( + get_dataset, create_custom_confusion_matrix, posts_to_refPosts, obj_str_to_dict, autopct_format, + projection_to_list, flatten_list +) +from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser +from desci_sense.shared_functions.init import init_multi_chain_parser_config +from desci_sense.shared_functions.schema.post import RefPost +from desci_sense.shared_functions.dataloaders import ( + scrape_post, + convert_text_to_ref_post, +) + +class Evaluation: + def __init__(self, config): + self.config = config + + class CustomLabelBinarizer(LabelBinarizer): + def fit(self, y, order=None): + if order is not None: + self.classes_ = np.array(order) + else: + super().fit(y) + return self + + def check_topic(self, topics): + item_types_allowlist = [ + "bookSection", "journalArticle", "preprint", "book", "manuscript", + "thesis", "presentation", "conferencePaper", "report" + ] + return int(bool(set(topics).intersection(set(item_types_allowlist)))) + + def topic_eval(self, df, tp): + bool_topics = list(df.apply(lambda row: self.check_topic(row['Ref item types']), axis=1)) + try: + ratio = sum(bool_topics) / tp + except Exception as e: + ratio = 0 + print("Ratio exception:", e) + return ratio + + def row_to_post(self,row:pd.DataFrame): + try: + ref_post = RefPost( + author=row['username'],content=row['Text'],url='',ref_urls=row['urls'],source_network=row['server'] + ) + except Exception as e: + print('cannot convert to refpost: ',row) + print('With exception: ',e) + ref_post = None + return ref_post + + def dataframe_to_ref_posts(self, df: pd.DataFrame): + ref_posts = [] + for _, row in df.iterrows(): + ref_post = self.row_to_post(row) + ref_posts.append(ref_post) + return ref_posts + + def prepare_parser_input(self, df): + print('Converting posts to refPosts') + ref_posts = posts_to_refPosts(df['Text']) + return ref_posts + + def pred_labels(self, df,active_list = ["hashtags"] , batch_size=10): + model = MultiChainParser(self.config) + inputs = self.prepare_parser_input(df) + results = model.batch_process_ref_posts(inputs=inputs, 
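+            # active_list selects which parser chains to run (only the "hashtags" chain by default here);
+            # batch_size sets how many posts are sent to the parser per batch.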
active_list=active_list,batch_size=batch_size) + try: + df['Predicted Label'] = [x.filter_classification.value for x in results] + df['Reasoning Steps'] = ["Keywords: " + str(x.debug['topics']['reasoning']) + " Topics: " + str(x.debug['keywords']['reasoning']) for x in results] + df['Keywords'] = [x.keywords for x in results] + df['Topics'] = [x.topics for x in results] + df['Ref item types'] = [x.item_types for x in results] + df['academic_keyword'] = [x.research_keyword for x in results] + except Exception as e: + print("Parser error:", e) + return inputs, results + + def normalize_df(self, df): + if isinstance(df["True Label"].iloc[0], list): + df["True Label"] = df["True Label"].apply(lambda x: x[0]) + + def binarize(self, y_pred, y_true): + lb = self.CustomLabelBinarizer() + lb.fit(['research', 'not_research'], order=['research', 'not_research']) + y_true = lb.transform(y_true) + y_pred = lb.transform(y_pred) + return y_pred, y_true, lb.classes_ + + def calculate_scores(self, y_pred, y_true): + precision, recall, f1_score, support = precision_recall_fscore_support(y_true, y_pred, average=None) + accuracy = accuracy_score(y_pred=y_pred, y_true=y_true) + return precision[0], recall[0], f1_score[0], accuracy + + def calculate_feed_score(self, df, name): + df1 = df[df["username"] == name] + y_pred = df1["Predicted Label"] + y_true = df1["True Label"] + n = len(y_true) + try: + y_pred, y_true, labels = self.binarize(y_pred=y_pred, y_true=y_true) + precision, recall, f1_score, accuracy = self.calculate_scores(y_pred=y_pred, y_true=y_true) + try: + cm = confusion_matrix(y_pred=y_pred, y_true=y_true) + except: + print('No entries in feed of:', name) + tp = 0 + try: + tp = cm[0, 0] + fn = cm[1, 0] + except: + print("no FNs") + fn = 0 + try: + r_topics = self.topic_eval(df=df1, tp=tp) + except: + print("No citoids detection") + r_topics = 0 + return pd.Series([precision, recall, f1_score, accuracy, tp, fn, n, r_topics], + index=["precision", "recall", "f1_score", "accuracy", "TP", "FN", "posts count", 'citoid positive ratio']) + except Exception as e: + print(f"Exception was raised while calculating feed scores: {e}") + return pd.Series([0, 0, 0, 0, 0, 0, n, 0], index=["precision", "recall", "f1_score", "accuracy", "TP", "FN", "posts count", 'citoid positive ratio']) + + def weighted_average(self, column_name, df): + return (df[column_name] * df['posts count']).sum() / df['posts count'].sum() + + def constr_feed_chart(self, df, df_handles): + df_feed_eval = df_handles[["username", "server", "info"]] + for column in ["precision", "recall", "f1_score", "accuracy", "posts count"]: + df_feed_eval[column] = 0 + df_feed_eval[["precision", "recall", "f1_score", "accuracy", 'TP', 'FN', "posts count", 'citoid positive ratio']] = df_feed_eval.apply( + lambda row: self.calculate_feed_score(df=df, name=row["username"]), axis=1) + average_row = [self.weighted_average(column_name=x, df=df_feed_eval) for x in ["precision", "recall", "f1_score", "accuracy"]] + tp = df_feed_eval["TP"].sum() + r_topics = self.topic_eval(df=df, tp=tp) + average_row.extend([tp, df_feed_eval["FN"].sum(), df_feed_eval["posts count"].sum(), r_topics]) + new_row = ["Average", "", ""] + average_row + new_row = pd.DataFrame([new_row], columns=['username', 'server', 'info', 'precision', 'recall', 'f1_score', 'accuracy', 'TP', 'FN', 'posts count', 'citoid positive ratio']) + return df_feed_eval._append(new_row, ignore_index=True) + + # Zotero Item type analysis + def count_zotero_types(self,df : pd.DataFrame): + counts_df = df["Ref item 
types"].apply(flatten_list).apply(Counter).apply(pd.Series).fillna(0).astype(int) + total_row = counts_df.sum() + total_row.name = 'Total' + + return total_row + def count_research_zotero_types(self, df: pd.DataFrame, allow_list=[ + "bookSection", "journalArticle", "preprint", "book", "manuscript", + "thesis", "presentation", "conferencePaper", "report" + ]): + # Apply the check_topic method to each row and create a boolean mask + #mask = df['Ref item types'].apply(flatten_list).apply(lambda x: self.check_topic([item for item in x]) == 1) + + # Filter the DataFrame using the mask + #filtered_df = df[mask] + + project_to_allowlist = projection_to_list(allow_list) + #This is not good yet TODO, it will count each item only once + #filtered_df['Ref item types'] = [project_to_allowlist(x) for x in filtered_df["Ref item types"]] + # Count items in the filtered DataFrame + #counts_df = filtered_df["Ref item types"].apply(flatten_list).apply(project_to_allowlist).apply(Counter).apply(pd.Series).fillna(0).astype(int) + counts_df = df["Ref item types"].apply(flatten_list).apply(project_to_allowlist).apply(Counter).apply(pd.Series).fillna(0).astype(int) + total_row = counts_df.sum() + #total_row.name = 'Total' + return total_row + + + def build_item_type_pie(self,df:pd.DataFrame): + total_counts = self.count_zotero_types(df=df) + # Create a pie chart + fig1, ax = plt.subplots(figsize=(12, 12)) + ax.pie(total_counts, labels=total_counts.index, autopct=lambda pct: autopct_format(pct, total_counts), startangle=140) + ax.set_title('Distribution of Item Types') + + #plt.show() + + + + total_counts = self.count_research_zotero_types(df=df) + # Create a pie chart + fig2, ax = plt.subplots(figsize=(12, 12)) + ax.pie(total_counts, labels=total_counts.index, autopct=lambda pct: autopct_format(pct, total_counts), startangle=140) + ax.set_title('Distribution of research item types') + + #plt.show() + + return fig1, fig2 + +class TwitterEval(Evaluation): + def __init__(self, config): + super().__init__(config) + + @staticmethod + def check_quotes(urls): + quotes = [] + pattern = re.compile(r'^https://(?:twitter\.com|x\.com)/.+/[0-9]+$') + for url in urls: + if pattern.match(url): + quotes.append(url) + return quotes + + def nested_quotes_citoid(self,post:RefPost,steps = 0, ind = 0): + multi_chain_parser = MultiChainParser(self.config) + + result = multi_chain_parser.process_ref_post(post=post,active_list=["hashtags"]) + print("post urls",result.reference_urls) + print("Item types: ",result.item_types) + + item_types = result.item_types + + if self.check_topic(result.item_types): + print("Yay, citoid found topic") + ind = 1 + else: + quotes = self.check_quotes(result.reference_urls) + if quotes: + print("checking quotes") + for url in quotes: + quote = scrape_post(url) + ind, steps, t = self.nested_quotes_citoid(post = quote, steps=steps+1, ind = ind) + + else: + print("done") + return ind, steps, item_types + + def nested_quotes_citoid_parallel(self, inputs): + results = [] + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = {executor.submit(self.nested_quotes_citoid, post): post for post in inputs} + + for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)): + post = futures[future] + try: + result = future.result() + results.append(result) + except Exception as exc: + print(f'Post {post} generated an exception: {exc}') + # Append a default value to maintain length consistency + results.append((0, 0, ['citoid_error'])) + + return results + + def 
feed_tweet_type_statistics(self, df, name,update_df = 1,post_types =['quote','thread']): + print('Analyze tweets by',name) + df1 = df[df["username"] == name] + inputs = self.prepare_parser_input(df1) + post_count=len(inputs) + results = self.nested_quotes_citoid_parallel(inputs) + if update_df: + # Ensure 'citoid_research' column exists + if 'citoid_research' not in df.columns: + df['citoid_research'] = None + + # Extract the first element of each tuple in results and ensure lengths match + first_elements = [result[0] for result in results[:post_count]] + + '''# Debugging prints + print(f"Length of df1: {len(df1)}") + print(f"Length of inputs: {len(inputs)}") + print(f"Length of results: {len(results)}") + print(f"Length of first_elements: {len(first_elements)}")''' + + # Assign these elements to the corresponding rows in df1 + df.loc[df["username"] == name, 'citoid_research'] = first_elements + #TODO make more better - defining df1 twice + df1 = df[df["username"] == name] + citoid_count = 0 + quotes_count = 0 + quoted_citoid_count = 0 + item_type_list = [] + for ind, steps, item_type in results: + citoid_count = citoid_count +ind + item_type_list.append(item_type) + if steps: + quoted_citoid_count = quoted_citoid_count + ind + quotes_count = quotes_count + 1 + if post_count: + quotes_ratio = quotes_count/post_count + citoid_ratio = citoid_count/post_count + else: + quotes_ratio = -1 + citoid_ratio = -1 + #thread stat + if 'thread' in post_types: + thread_count = df1['thread'].sum() + citoid_threads = df1[(df1['thread'] ==1) & (df1['citoid_research']==1)].shape[0] + if post_count: + thread_ratio = thread_count/post_count + else: + thread_ratio = -1 + return citoid_count, quoted_citoid_count, quotes_count, post_count, quotes_ratio, citoid_ratio, thread_count, citoid_threads, thread_ratio, item_type_list + + + return citoid_count, quoted_citoid_count, quotes_count, post_count, quotes_ratio, citoid_ratio, item_type_list + + def build_post_type_chart(self,df_handles:pd.DataFrame,df:pd.DataFrame,parse_df = 1,post_types = ['quote','thread']): + df_feed_eval = df_handles[["username", "server", "info"]].copy() + + for column in ["citoid_count", "quoted_citoid_count", "quotes_count","post_count","quotes_ratio","citoid_ratio","thread_count","citoid_threads","thread_ratio"]: + df_feed_eval[column] = 0 + df_feed_eval[["citoid_count", "quoted_citoid_count", "quotes_count","post_count","quotes_ratio","citoid_ratio","thread_count","citoid_threads","thread_ratio","Ref item types"]] = df_feed_eval.apply( + lambda row: pd.Series(self.feed_tweet_type_statistics(df=df, name=row["username"])), axis=1) + + #"Total" row + post_count = df_feed_eval["post_count"].sum() + quotes_count = df_feed_eval["quotes_count"].sum() + citoid_count = df_feed_eval["citoid_count"].sum() + thread_count = df_feed_eval['thread_count'].sum() + citoid_threads = df_feed_eval['citoid_threads'].sum() + if post_count: + quotes_ratio = quotes_count/post_count + citoid_ratio = citoid_count/post_count + thread_ratio = thread_count/post_count + else: + quotes_ratio = -1 + citoid_ratio = -1 + thread_ratio = -1 + new_row = ["Summery","","",citoid_count,df_feed_eval["quoted_citoid_count"].sum(),quotes_count,post_count,quotes_ratio,citoid_ratio,thread_count,citoid_threads,thread_ratio,[]] + new_row = pd.DataFrame([new_row], columns=['username', 'server', 'info', "citoid_count", "quoted_citoid_count", "quotes_count","post_count","quotes_ratio","citoid_ratio","thread_count","citoid_threads","thread_ratio","Ref item types"]) + return 
df_feed_eval._append(new_row, ignore_index=True) + + def build_thread_chart(self,df_handles:pd.DataFrame,df:pd.DataFrame): + df_feed_eval = df_handles[["username", "server", "info"]].copy() + for column in ["citoid_count", "thread_citoid_count", "thread_count","post_count","thread_ratio","citoid_ratio"]: + df_feed_eval[column] = 0 + df_feed_eval[["citoid_count", "thread_citoid_count", "thread_count","post_count","thread_ratio","citoid_ratio","Ref item types"]] = df_feed_eval.apply( + lambda row: pd.Series(self.feed_tweet_type_statistics(df=df, name=row["username"])), axis=1) + #"Total" row + post_count = df_feed_eval["post_count"].sum() + quotes_count = df_feed_eval["quotes_count"].sum() + citoid_count = df_feed_eval["citoid_count"].sum() + if post_count: + quotes_ratio = quotes_count/post_count + citoid_ratio = citoid_count/post_count + else: + quotes_ratio = -1 + citoid_ratio = -1 + new_row = ["Summery","","",citoid_count,df_feed_eval["quoted_citoid_count"].sum(),quotes_count,post_count,quotes_ratio,citoid_ratio,[]] + new_row = pd.DataFrame([new_row], columns=['username', 'server', 'info', "citoid_count", "quoted_citoid_count", "quotes_count","post_count","quotes_ratio","citoid_ratio","Ref item types"]) + return df_feed_eval._append(new_row, ignore_index=True) + + + + + + + + +if __name__ == "__main__": + #post = scrape_post('https://x.com/rtk254/status/1741841607421263966') + llm_type="mistralai/mistral-7b-" + + config = init_multi_chain_parser_config( + llm_type=llm_type, + post_process_type="combined" + ) + Eval = TwitterEval(config=config) + #q = Eval.nested_quotes_citoid(post=post) + #print(q) + wandb.login() + + api = wandb.Api() + + #TODO move from testing + run = wandb.init(project="post_type_statistics", job_type="evaluation") + + # get artifact path + + dataset_artifact_id = ( + 'common-sense-makers/post_type_stat/non_labeled_tweets:v0' + ) + + # set artifact as input artifact + dataset_artifact = run.use_artifact(dataset_artifact_id) + + # initialize table path + # add the option to call table_path = arguments.get('--dataset') + + # download path to table + a_path = dataset_artifact.download() + print("The path is",a_path) + + # get dataset file name + + table_path = Path(f"{a_path}/non_labeled_data_table.table.json") + + + # return the pd df from the table + #remember to remove the head TODO + df = get_dataset(table_path) + + '''df1 = df[df["username"] == 'sbuckshum'] + + ref_posts= Eval.dataframe_to_ref_posts(df1) + + print(ref_posts)''' + + + table_path = Path(f"{a_path}/handles_chart.table.json") + + df_handles = get_dataset(table_path) + df_eval = Eval.build_post_type_chart(df_handles=df_handles,df=df) + fig1, fig2 = Eval.build_item_type_pie(df=df_eval) + + wandb.log({"Quote statistics per feed": wandb.Table(dataframe=df_eval)}) + #print(df['Ref item types']) + #fig1, fig2 = Eval.build_item_type_pie(df=df) + + wandb.log({"item_type_distribution": wandb.Image(fig1)}) + + wandb.log({"research_item_type_distribution": wandb.Image(fig2)}) + + #true_df = df[df["True Label"] == 'research'] + + #fig1 , fig2 = Eval.build_item_type_pie(true_df) + #wandb.log({"research_type_distribution": wandb.Image(fig)}) + config = obj_str_to_dict(config) + + run.config.update(config) + + + + artifact = wandb.Artifact("dataset_stat", type="stat") + + # Create a wandb.Table from the Pandas DataFrame + table = wandb.Table(dataframe=df) + + # Add the wandb.Table to the artifact + artifact.add(table, "post_stat") + + run.log_artifact(artifact) + + wandb.run.finish() + + + + """inputs = 
Eval.prepare_parser_input(df) + + results = Eval.nested_quotes_citoid_parallel(inputs)""" + """count = 0 + errors = [] + for p in inputs: + try: + ind, steps, item_types = Eval.nested_quotes_citoid(post=p) + count = count + ind + except Exception as e: + errors.append(e) + print("Count: ", count) + print("Errors count: ",len(errors)) + print("Errors: ",errors)""" + + + + #inputs, results = Eval.pred_labels(df=df) + + + """ + arguments = docopt.docopt(__doc__) + + config_path = arguments.get("--config") + dataset_path = arguments.get("--dataset") + dataset_file = arguments.get("--dataset_file") + handle_file = arguments.get("--handle_file") + + current_datetime = datetime.now() + time = current_datetime.strftime("%Y%m%d%H%M%S") + llm_type="mistralai/mistral-7b-" + + config = init_multi_chain_parser_config( + llm_type=llm_type, + post_process_type="combined" + ) + + wandb.login() + + api = wandb.Api() + run = wandb.init(project="testing", job_type="evaluation", name= llm_type+ str(time)) + + if dataset_path: + dataset_artifact_id = dataset_path + else: + dataset_artifact_id = 'common-sense-makers/filter_evaluation/labeled_tweets_no_threads:v1' + + dataset_artifact = run.use_artifact(dataset_artifact_id) + a_path = dataset_artifact.download() + + if dataset_file: + table_path = Path(f"{a_path}/{dataset_file}") + else: + table_path = Path(f"{a_path}/labeled_data_table_no_threads.table.json") + + df = get_dataset(table_path).head(10) + + if handle_file: + table_path = Path(f"{a_path}/{dataset_file}") + else: + table_path = Path(f"{a_path}/handles_chart.table.json") + + df_handles = get_dataset(table_path) + evaluator = Evaluation(config=config) + evaluator.pred_labels(df=df) + evaluator.normalize_df(df) + y_pred, y_true, labels = evaluator.binarize(y_pred=df["Predicted Label"], y_true=df["True Label"]) + precision, recall, f1_score, accuracy = evaluator.calculate_scores(y_pred=y_pred, y_true=y_true) + artifact = wandb.Artifact("prediction_evaluation-" + str(time), type="evaluation") + table = wandb.Table(dataframe=df) + artifact.add(table, "prediction_evaluation") + + try: + feed_chart = evaluator.constr_feed_chart(df=df, df_handles=df_handles) + wandb.log({"Scores per feed": wandb.Table(dataframe=feed_chart)}) + except Exception as e: + print("An exception was raised building the feed chart:", e) + + try: + matrix = confusion_matrix(y_true, y_pred) + labels_with_info = [f"(True) {label}" for label in labels] + predicted_labels_with_info = [f"(Pred) {label}" for label in labels] + wandb.log({ + "confusion_matrix": wandb.plots.HeatMap( + matrix_values=matrix, + y_labels=labels_with_info, + x_labels=predicted_labels_with_info, + show_text=True + ) + }) + except: + print("Not enough examples for constructing confusion matrix") + + meta_data = { + "dataset_size": len(df), + "precision": pd.Series(precision).mean(), + "recall": pd.Series(recall).mean(), + "f1_score": pd.Series(f1_score).mean(), + "accuracy": accuracy, + } + + artifact.metadata.update(meta_data) + trans_config = obj_str_to_dict(config) + run.config.update(trans_config) + run.summary.update(meta_data) + wandb.log_artifact(artifact, aliases=["latest"]) + wandb.run.finish() +""" \ No newline at end of file diff --git a/nlp/desci_sense/evaluation/item_type_stat_pie.py b/nlp/desci_sense/evaluation/item_type_stat_pie.py new file mode 100644 index 00000000..c10d6b81 --- /dev/null +++ b/nlp/desci_sense/evaluation/item_type_stat_pie.py @@ -0,0 +1,69 @@ +from datetime import datetime +import wandb +from pathlib import Path +import pandas as pd 
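+# item_type_stat_pie.py: loads a previously logged prediction table from a wandb
+# artifact, builds Zotero item-type distribution pie charts with
+# TwitterEval.build_item_type_pie, and logs the resulting figures back to wandb.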
+import numpy as np +import sys + +sys.path.append(str(Path(__file__).parents[2])) + +from desci_sense.evaluation.Evaluation_benchmark import TwitterEval +from desci_sense.evaluation.utils import obj_str_to_dict, get_dataset + +if __name__ == "__main__": + + wandb.login() + + api = wandb.Api() + + #TODO move from testing + run = wandb.init(project="testing", job_type="evaluation") + + # get artifact path + + dataset_artifact_id = ( + 'common-sense-makers/filter_evaluation/prediction_evaluation-20240521132713:v0' + ) + + # set artifact as input artifact + dataset_artifact = run.use_artifact(dataset_artifact_id) + + # initialize table path + # add the option to call table_path = arguments.get('--dataset') + + # download path to table + a_path = dataset_artifact.download() + print("The path is",a_path) + + # get dataset file name + + table_path = Path(f"{a_path}/prediction_evaluation.table.json") + + + # return the pd df from the table + #remember to remove the head TODO + df = get_dataset(table_path) + + dataset_run = dataset_artifact.logged_by() + + config = dataset_run.config + + Eval = TwitterEval(config=config) + + + + fig1, fig2 = Eval.build_item_type_pie(df=df) + + wandb.log({"item_type_distribution": wandb.Image(fig1)}) + + wandb.log({"allowlist_item_type_distribution": wandb.Image(fig2)}) + + true_df = df[df["True Label"] == 'research'] + + fig1 , fig2 = Eval.build_item_type_pie(true_df) + wandb.log({"research_type_distribution": wandb.Image(fig1)}) + config = obj_str_to_dict(config) + + run.config.update(config) + + wandb.run.finish() \ No newline at end of file diff --git a/nlp/desci_sense/evaluation/utils.py b/nlp/desci_sense/evaluation/utils.py index 74624335..fe4998ac 100644 --- a/nlp/desci_sense/evaluation/utils.py +++ b/nlp/desci_sense/evaluation/utils.py @@ -4,6 +4,7 @@ import re import pandas as pd import numpy as np +from collections import Counter import concurrent.futures from tqdm import tqdm from sklearn.preprocessing import MultiLabelBinarizer @@ -132,4 +133,21 @@ def create_custom_confusion_matrix(y_true, y_pred, labels): fp_j = ~y_true[:, j] & y_pred[:, j] matrix[i, j] = np.sum(fn_i & fp_j) - return pd.DataFrame(matrix, index=labels, columns=labels) \ No newline at end of file + return pd.DataFrame(matrix, index=labels, columns=labels) + + + +def autopct_format(pct, total_counts): + total = sum(total_counts) + count = int(round(pct * total / 100.0)) + return f'{pct:.1f}% ({count})' + +def projection_to_list(list2): + def project_to_list(list1): + #return list(set(list1) & set(list2)) + return [item for item in list1 if item in list2] + return project_to_list + +def flatten_list(lis:list): + return [item for sublist in lis for item in sublist] + diff --git a/nlp/desci_sense/schema/Nanopub_schema/semantic_post_auto_prov.md b/nlp/desci_sense/schema/Nanopub_schema/semantic_post_auto_prov.md index d5ee5659..329be2d6 100644 --- a/nlp/desci_sense/schema/Nanopub_schema/semantic_post_auto_prov.md +++ b/nlp/desci_sense/schema/Nanopub_schema/semantic_post_auto_prov.md @@ -25,6 +25,7 @@ In this doc I specify what triplets are to be present in our app auto-published sub:provenance { #Worked with Tobias on a more rebust prov, TODO cosmo: a prov:SoftwareAgent ; + rdfs:label "research_filter_v1" ; prov:actedOnBehalfOf x:xHandle . sub:activity a cosmo:nlpFacilitatedActivity ; prov:wasAssociatedWith cosmo:. 
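For reference, a minimal usage sketch of the helpers added to `nlp/desci_sense/evaluation/utils.py` above (`flatten_list`, `projection_to_list`, `autopct_format`), mirroring the counting pipeline used by `count_research_zotero_types` and `build_item_type_pie`; the sample `ref_item_types` data and the printed values are illustrative only, not taken from the evaluation dataset.

```python
from collections import Counter
import pandas as pd

# Helpers from this diff (assumes the nlp/ package root is on sys.path, as in the evaluation scripts).
from desci_sense.evaluation.utils import flatten_list, projection_to_list, autopct_format

# One row per post; each inner list holds the Zotero item types of one referenced URL.
ref_item_types = pd.Series([[["preprint"], ["webpage"]], [["journalArticle"]]])

project = projection_to_list(["journalArticle", "preprint"])  # keep only allow-listed types

counts = (
    ref_item_types.apply(flatten_list)  # [["preprint"], ["webpage"]] -> ["preprint", "webpage"]
    .apply(project)                     # -> ["preprint"]
    .apply(Counter)                     # per-post counts
    .apply(pd.Series)                   # one column per item type
    .fillna(0)
    .astype(int)
    .sum()                              # totals across posts
)
print(counts.to_dict())                 # {'preprint': 1, 'journalArticle': 1}

# autopct_format renders a pie-chart slice label as "percent (count)".
print(autopct_format(50.0, counts))     # '50.0% (1)'
```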
diff --git a/nlp/desci_sense/schema/Nanopub_schema/semantic_post_pubinfo_schema.md b/nlp/desci_sense/schema/Nanopub_schema/semantic_post_pubinfo_schema.md index b3065226..714ce150 100644 --- a/nlp/desci_sense/schema/Nanopub_schema/semantic_post_pubinfo_schema.md +++ b/nlp/desci_sense/schema/Nanopub_schema/semantic_post_pubinfo_schema.md @@ -15,7 +15,7 @@ ## Template Schema - +``` sub:pubinfo { x:xHandle foaf:name "{retractos name}" . @@ -34,4 +34,5 @@ sub:pubinfo { rdfs:label "CoSMO Semantic Post". this: cosmo:hasRootSinger "{eth address}" -} \ No newline at end of file +} +``` diff --git a/nlp/desci_sense/shared_functions/dataloaders/twitter/twitter_utils.py b/nlp/desci_sense/shared_functions/dataloaders/twitter/twitter_utils.py index 56103680..7702159d 100644 --- a/nlp/desci_sense/shared_functions/dataloaders/twitter/twitter_utils.py +++ b/nlp/desci_sense/shared_functions/dataloaders/twitter/twitter_utils.py @@ -5,12 +5,14 @@ import requests from datetime import datetime + from ...interface import AppPost, PlatformType from ...utils import ( extract_and_expand_urls, normalize_url, extract_twitter_status_id, remove_dups_ordered, + normalize_tweet_url, ) from ...schema.post import RefPost, QuoteRefPost @@ -189,22 +191,6 @@ def extract_status_id(url): return None -def normalize_tweet_url(url): - """ - Normalize Twitter post URLs to use the x.com domain. - - Parameters: - url (str): The original Twitter URL. - - Returns: - str: The normalized URL with x.com domain. - """ - if "twitter.com" in url: - return url.replace("twitter.com", "x.com") - else: - return url - - # TODO combine with method below def extract_external_ref_urls(tweet: dict, add_qrt_url: bool = True): """ diff --git a/nlp/desci_sense/shared_functions/interface.py b/nlp/desci_sense/shared_functions/interface.py index 1fd1d80b..db194781 100644 --- a/nlp/desci_sense/shared_functions/interface.py +++ b/nlp/desci_sense/shared_functions/interface.py @@ -15,6 +15,7 @@ from rdflib import URIRef, Literal, Graph from .prompting.jinja.topics_template import ALLOWED_TOPICS from .filters import SciFilterClassfication +from .utils import normalize_tweet_urls_in_text, normalize_tweet_url # for calculating thread length limits MAX_CHARS_PER_POST = 280 @@ -91,6 +92,39 @@ class TopicConceptDefinition(OntologyConceptDefinition): ) +class ZoteroItemTypeDefinition(OntologyConceptDefinition): + """ + Definition of the ZoteroItemType predicate which is used to represent a reference's + item type according to the Zotero ontology. + https://www.zotero.org/support/kb/item_types_and_fields + """ + + name: str = Field(default="zoteroItemType", description="Concept name.") + uri: str = Field( + default="https://sense-nets.xyz/hasZoteroItemType", + description="Linked data URI for this concept.", + ) + versions: List[str] = Field( + ["v0"], description="Which ontology versions is this item included in." + ) + + +class QuotedPostDefinition(OntologyConceptDefinition): + """ + Definition of quotedPost relation for a post that quotes another post + https://github.com/Common-SenseMakers/sensemakers/blob/nlp-dev/nlp/desci_sense/schema/Nanopub_schema/semantic_post_quote_schema.md + """ + + name: str = Field(default="zoteroItemType", description="Concept name.") + uri: str = Field( + default="https://sense-nets.xyz/quotesPost", + description="Linked data URI for this concept.", + ) + versions: List[str] = Field( + ["v0"], description="Which ontology versions is this item included in." 
+ ) + + class isAConceptDefintion(OntologyConceptDefinition): name: str = Field(default="isA", description="Concept name.") uri: str = Field( @@ -141,24 +175,6 @@ class OntologyInterface(BaseModel): ontology_config: NotionOntologyConfig = Field(default_factory=NotionOntologyConfig) -# TODO remove - changed to OntologyPredicateDefinition -class OntologyItem(TypedDict): - URI: str - display_name: str - Name: Optional[str] - label: Optional[str] - prompt: str - notes: Optional[str] - valid_subject_types: Optional[str] - valid_object_types: Optional[str] - versions: Optional[str] - - -# TODO remove - changed to KeywordPredicateDefinition -class KeywordsSupport(TypedDict): - keyWordsOntology: OntologyItem - - class RefMetadata(BaseModel): """ Schema representing extracted metadata of reference URLs @@ -248,7 +264,7 @@ def graph_serializer(graph: Graph): @field_validator( "semantics", mode="before" - ) # before needed since arbitrary types allowec + ) # before needed since arbitrary types allowed @classmethod def ensure_graph(cls, value: Any): if isinstance(value, Graph): @@ -271,17 +287,6 @@ def lower_case_platform_id(cls, v): return v.lower() if isinstance(v, str) else v -# class AppPostContent(BaseModel): - - -# class AppPost(BaseModel): -# content: str = Field(description="Post content") -# url: Optional[str] = Field(description="Post url", default=None) -# quoted_thread_url: Optional[str] = Field( -# description="Url of quoted thread", default=None -# ) - - class AppPost(BaseModel): content: str = Field(description="Post content") url: Optional[str] = Field(description="Post url", default="") @@ -290,6 +295,14 @@ class AppPost(BaseModel): default=None, ) + @validator("content", pre=True, always=True) + def normalize_twitter_urls(cls, v): + return normalize_tweet_urls_in_text(v) if isinstance(v, str) else v + + @validator("url", pre=True, always=True) + def normalize_twitter_url(cls, v): + return normalize_tweet_url(v) if isinstance(v, str) else v + class AppThread(BaseModel): author: Author @@ -299,6 +312,10 @@ class AppThread(BaseModel): default=None, ) + @validator("url", pre=True, always=True) + def normalize_twitter_url(cls, v): + return normalize_tweet_url(v) if isinstance(v, str) else v + @property def source_network(self) -> PlatformType: return self.author.platformId @@ -314,25 +331,3 @@ class ParsePostRequest(BaseModel): description="Additional params for parser (not used currently)", default_factory=dict, ) - - -# TODO remove - changed to RefMetadata -class RefMeta(TypedDict): - title: str - description: str - image: str - - -class ReflabelsSupport(TypedDict): - labelsOntology: List[OntologyItem] - refsMeta: Dict[str, RefMeta] - - -class ParsedSupport(TypedDict): - keywords: KeywordsSupport - refLabels: ReflabelsSupport - - -class ParserResultDto(TypedDict): - semantics: str - support: ParsedSupport diff --git a/nlp/desci_sense/shared_functions/postprocessing/__init__.py b/nlp/desci_sense/shared_functions/postprocessing/__init__.py index 40aa1781..ec663e23 100644 --- a/nlp/desci_sense/shared_functions/postprocessing/__init__.py +++ b/nlp/desci_sense/shared_functions/postprocessing/__init__.py @@ -12,6 +12,8 @@ ParserSupport, ParserResult, OntologyInterface, + ZoteroItemTypeDefinition, + QuotedPostDefinition, ) from ..configs import ParserChainType, PostProcessType @@ -134,6 +136,10 @@ class CombinedParserOutput(BaseModel): default_factory=list, description="List of extracted reference metadata returned by metadata extractor", ) + quoted_post_url: Optional[str] = Field( + 
default=None, + description="URL of quoted post, if processed post quotes another post.", + ) debug: Optional[Dict] = Field( default_factory=dict, description="Diagnostic information for debugging purposes.", @@ -374,6 +380,39 @@ def convert_keywords_to_rdf_triplets(keywords: List[str]) -> List[RDFTriplet]: return triplets +def create_quoted_post_triplet(quoted_post_url: str): + triplet = RDFTriplet( + predicate=URIRef(QuotedPostDefinition().uri), + object=URIRef(quoted_post_url), + ) + return triplet + + +def convert_item_types_to_rdf_triplets( + item_types: List[str], reference_urls: List[str] +) -> List[RDFTriplet]: + """ + Converts item type and reference url information into RDF triplets + using the ZoteroItemTypeDefinition predicate. + For example, + convert_item_types_to_rdf_triplets(['preprint'], ['https://arxiv.org/abs/2402.04607']) --> + `[RDFTriplet(subject=rdflib.term.URIRef('https://arxiv.org/abs/2402.04607'), predicate=rdflib.term.URIRef('https://sense-nets.xyz/hasZoteroItemType'), object=rdflib.term.Literal('preprint'))]` + + + """ + assert len(reference_urls) == len(item_types) + triplets = [ + RDFTriplet( + subject=URIRef(ref_url), + predicate=URIRef(ZoteroItemTypeDefinition().uri), + object=Literal(item_type), + ) + for ref_url, item_type in zip(reference_urls, item_types) + ] + + return triplets + + def convert_triplets_to_graph(triplets: List[RDFTriplet]) -> Graph: """Convert list of rdf triplets to rdf graph""" g = Graph() @@ -421,6 +460,9 @@ def combine_from_raw_results( combined = CombinedParserOutput(**combined_parser_output) + # add quoted post url + combined.quoted_post_url = post.quoted_url + if unprocessed_urls: # add unprocessed urls to result combined.reference_urls += unprocessed_urls @@ -487,6 +529,19 @@ def post_process_firebase( for t in kw_triplets: graph.add(t.to_tuple()) + # add item type triplets + item_type_triplets = convert_item_types_to_rdf_triplets( + combined_parser_output.item_types, + combined_parser_output.reference_urls, + ) + for t in item_type_triplets: + graph.add(t.to_tuple()) + + # add quotesPost triplet if present + if combined_parser_output.quoted_post_url: + triplet = create_quoted_post_triplet(combined_parser_output.quoted_post_url) + graph.add(triplet.to_tuple()) + # gather support info parser_support: ParserSupport = get_support_data( ontology_base.ontology_interface, diff --git a/nlp/desci_sense/shared_functions/preprocessing/__init__.py b/nlp/desci_sense/shared_functions/preprocessing/__init__.py index 96796ccb..0972682f 100644 --- a/nlp/desci_sense/shared_functions/preprocessing/__init__.py +++ b/nlp/desci_sense/shared_functions/preprocessing/__init__.py @@ -287,18 +287,25 @@ def preproc_parser_input(parser_input: ParserInput) -> PreprocParserInput: """ orig_thread = parser_input.thread_post new_thread = trim_thread_by_length(orig_thread, parser_input.max_chars) + included_urls = new_thread.md_ref_urls() # get reference urls from trimmed posts - # TODO handle urls possibly trimmed from trimmed post (currently will be ignored!) 
excluded_urls = [] num_posts_after_trim = len(new_thread.posts) - excluded_posts = orig_thread.posts[num_posts_after_trim:] + + # (num_posts_after_trim - 1) to handle urls possibly trimmed from trimmed post + excluded_posts = orig_thread.posts[(num_posts_after_trim - 1) :] for p in excluded_posts: - excluded_urls += p.md_ref_urls() + potential_excluded_urls = p.md_ref_urls() + excluded_urls += [ + url for url in potential_excluded_urls if url not in included_urls + ] # remove dups excluded_urls = remove_dups_ordered(excluded_urls) + assert set(included_urls + excluded_urls) == set(orig_thread.md_ref_urls()) + preprocessed_input = PreprocParserInput( post_to_parse=new_thread, unparsed_urls=excluded_urls, diff --git a/nlp/desci_sense/shared_functions/preprocessing/threads.py b/nlp/desci_sense/shared_functions/preprocessing/threads.py index 2f1a7806..5e05c708 100644 --- a/nlp/desci_sense/shared_functions/preprocessing/threads.py +++ b/nlp/desci_sense/shared_functions/preprocessing/threads.py @@ -26,6 +26,7 @@ def create_thread_from_posts(posts: List[QuoteRefPost]): author=author, content=content, url=posts[0].url, + quoted_url=posts[0].quoted_url, source_network=posts[0].source_network, ref_urls=all_ref_urls, posts=posts_copy, diff --git a/nlp/desci_sense/shared_functions/prompting/post_renderers/quote_ref_post_renderer.py b/nlp/desci_sense/shared_functions/prompting/post_renderers/quote_ref_post_renderer.py index 4a9761c0..92861bae 100644 --- a/nlp/desci_sense/shared_functions/prompting/post_renderers/quote_ref_post_renderer.py +++ b/nlp/desci_sense/shared_functions/prompting/post_renderers/quote_ref_post_renderer.py @@ -42,7 +42,7 @@ def render_quote_post_content( processed_content = post.content - if post.quoted_url: + if post.quoted_url and post.quoted_url in ordered_refs: # add quoted post url to end of quote post content if not present there if post.quoted_url not in processed_content: processed_content += f" {post.quoted_url}" diff --git a/nlp/desci_sense/shared_functions/utils.py b/nlp/desci_sense/shared_functions/utils.py index 3680bdd9..fa4399e7 100644 --- a/nlp/desci_sense/shared_functions/utils.py +++ b/nlp/desci_sense/shared_functions/utils.py @@ -448,3 +448,42 @@ def trim_parts_to_length(part_lengths: List[int], max_length: int) -> List[int]: break return trimmed_part_lengths + + +def normalize_tweet_url(url): + """ + Normalize Twitter post URLs to use the x.com domain. + + Parameters: + url (str): The original Twitter URL. + + Returns: + str: The normalized URL with x.com domain. + """ + if "twitter.com" in url: + return url.replace("twitter.com", "x.com") + else: + return url + + +def normalize_tweet_urls_in_text(text: str) -> str: + """ + Normalize all occurrences of Twitter URLs to uniform format (using x.com). + + Args: + text (str): Input string. + + Returns: + str: String after normalization. 
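+
+    Example (illustrative):
+        "see https://twitter.com/someuser/status/123" -> "see https://x.com/someuser/status/123"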
+ """ + extracted_urls, orig_urls = extract_and_expand_urls( + text, + return_orig_urls=True, + ) + normalized_urls = [normalize_tweet_url(url) for url in extracted_urls] + + # Replace all occurrences of orig_urls in text with normalized_urls + for orig_url, normalized_url in zip(orig_urls, normalized_urls): + text = text.replace(orig_url, normalized_url) + + return text diff --git a/nlp/notebooks/XdatasetLogEdit.ipynb b/nlp/notebooks/XdatasetLogEdit.ipynb index 3747ba2f..d4d3bf99 100644 --- a/nlp/notebooks/XdatasetLogEdit.ipynb +++ b/nlp/notebooks/XdatasetLogEdit.ipynb @@ -2,24 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 19, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n", - "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mshahar-r-oriel\u001b[0m (\u001b[33mcommon-sense-makers\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" - ] - }, { "data": { "text/plain": [ "True" ] }, - "execution_count": 1, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -32,11 +24,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import sys\n", + "import pandas as pd\n", "sys.path.append(\"../\")\n", "\n", "from desci_sense.shared_functions.init import init_multi_chain_parser_config\n", @@ -46,366 +39,821 @@ "from desci_sense.shared_functions.dataloaders import (\n", " scrape_post,\n", " convert_text_to_ref_post,\n", - ")" + ")\n", + "from desci_sense.evaluation.Evaluation_benchmark import TwitterEval\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " id created_at account_id username \\\n", - "0 1780618156832109056 2024-04-17 15:23:45+00:00 14349894 mbauwens \n", - "1 1780617597089669120 2024-04-17 15:21:32+00:00 14349894 mbauwens \n", - "2 1780617135049322496 2024-04-17 15:19:41+00:00 14349894 mbauwens \n", - "3 1780616454812905984 2024-04-17 15:16:59+00:00 14349894 mbauwens \n", - "4 1780608301534789632 2024-04-17 14:44:35+00:00 14349894 mbauwens \n", - ".. ... ... ... ... \n", - "705 1778714568895729920 2024-04-12 09:19:34+00:00 17900290 1Br0wn \n", - "706 1778710540912562432 2024-04-12 09:03:34+00:00 17900290 1Br0wn \n", - "707 1778698871310348544 2024-04-12 08:17:12+00:00 17900290 1Br0wn \n", - "708 1778690751909470464 2024-04-12 07:44:56+00:00 17900290 1Br0wn \n", - "709 1778479139709710848 2024-04-11 17:44:04+00:00 17900290 1Br0wn \n", - "\n", - " urls \\\n", - "0 [https://en.wikipedia.org/wiki/Epic_of_evoluti... \n", - "1 [https://wiki.p2pfoundation.net/Category:Therm... \n", - "2 [https://lifehacker.com/tech/ai-is-running-out... \n", - "3 [https://www.journaloffreespeechlaw.org/] \n", - "4 [https://wiki.p2pfoundation.net/Andrew_Targows... \n", - ".. ... \n", - "705 [https://www.axios.com/2024/04/10/ai-artificia... \n", - "706 [https://www.euractiv.com/section/digital/news... \n", - "707 [https://www.theguardian.com/society/2024/apr/... \n", - "708 [https://twitter.com/HalSinger/status/17785538... \n", - "709 [https://twitter.com/gateklons/status/17784779... \n", - "\n", - " text server \\\n", - "0 A concept you should know about:\\n\\n* The epic... 
twitter.com \n", - "1 Quotation as selected by The Alternative:\\n\\n... twitter.com \n", - "2 AI is running out of internet:\\n\\n\"AI is runni... twitter.com \n", - "3 * Journal of Free Speech Law,\\n\\nhttps://t.co/... twitter.com \n", - "4 Articles from our 'Civilizational Analysis' va... twitter.com \n", - ".. ... ... \n", - "705 \"I think they know exactly what they do,\" @Ves... twitter.com \n", - "706 \"The Council of the EU is preparing a call to ... twitter.com \n", - "707 Whatever happened to de minimis non curat lex?... twitter.com \n", - "708 Deaton’s book 'concludes that “Joe Biden does ... twitter.com \n", - "709 Exactly as intended! 🥳 “EU firms decreased dat... twitter.com \n", - "\n", - " tootURL \n", - "0 https://twitter.com/mbauwens/status/1780618156... \n", - "1 https://twitter.com/mbauwens/status/1780617597... \n", - "2 https://twitter.com/mbauwens/status/1780617135... \n", - "3 https://twitter.com/mbauwens/status/1780616454... \n", - "4 https://twitter.com/mbauwens/status/1780608301... \n", - ".. ... \n", - "705 https://twitter.com/1Br0wn/status/177871456889... \n", - "706 https://twitter.com/1Br0wn/status/177871054091... \n", - "707 https://twitter.com/1Br0wn/status/177869887131... \n", - "708 https://twitter.com/1Br0wn/status/177869075190... \n", - "709 https://twitter.com/1Br0wn/status/177847913970... \n", - "\n", - "[710 rows x 8 columns]\n" + "0 1800616621209461094\n", + "1 1800616621209461094\n", + "2 1785773444275040720\n", + "3 1757738391414755395\n", + "4 1757129457742065841\n", + " ... \n", + "7063 1790849327692206266\n", + "7064 1790841655597101546\n", + "7065 1790823532877820098\n", + "7066 1790750738466877446\n", + "7067 1790489757144735855\n", + "Name: conversation_id, Length: 7068, dtype: object\n" ] } ], "source": [ - "df = pd.read_json(\"/Users/shaharorielkagan/sensemakers/nlp/notebooks/data/mappedTweets-2.json\")\n", - "print(df)" + "df = pd.read_json(\"/Users/shaharorielkagan/Downloads/allTweetsFlattened-2.json\",dtype={'conversation_id': str})\n", + "print(df['conversation_id'])" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " id created_at account_id username \\\n", - "0 1780618156832109056 2024-04-17 15:23:45+00:00 14349894 mbauwens \n", - "1 1780617597089669120 2024-04-17 15:21:32+00:00 14349894 mbauwens \n", - "2 1780617135049322496 2024-04-17 15:19:41+00:00 14349894 mbauwens \n", - "3 1780616454812905984 2024-04-17 15:16:59+00:00 14349894 mbauwens \n", - "4 1780608301534789632 2024-04-17 14:44:35+00:00 14349894 mbauwens \n", - ".. ... ... ... ... \n", - "705 1778714568895729920 2024-04-12 09:19:34+00:00 17900290 1Br0wn \n", - "706 1778710540912562432 2024-04-12 09:03:34+00:00 17900290 1Br0wn \n", - "707 1778698871310348544 2024-04-12 08:17:12+00:00 17900290 1Br0wn \n", - "708 1778690751909470464 2024-04-12 07:44:56+00:00 17900290 1Br0wn \n", - "709 1778479139709710848 2024-04-11 17:44:04+00:00 17900290 1Br0wn \n", - "\n", - " urls \\\n", - "0 [https://en.wikipedia.org/wiki/Epic_of_evoluti... \n", - "1 [https://wiki.p2pfoundation.net/Category:Therm... \n", - "2 [https://lifehacker.com/tech/ai-is-running-out... \n", - "3 [https://www.journaloffreespeechlaw.org/] \n", - "4 [https://wiki.p2pfoundation.net/Andrew_Targows... \n", - ".. ... \n", - "705 [https://www.axios.com/2024/04/10/ai-artificia... \n", - "706 [https://www.euractiv.com/section/digital/news... \n", - "707 [https://www.theguardian.com/society/2024/apr/... 
\n", - "708 [https://twitter.com/HalSinger/status/17785538... \n", - "709 [https://twitter.com/gateklons/status/17784779... \n", - "\n", - " Text server \\\n", - "0 A concept you should know about:\\n\\n* The epic... twitter.com \n", - "1 Quotation as selected by The Alternative:\\n\\n... twitter.com \n", - "2 AI is running out of internet:\\n\\n\"AI is runni... twitter.com \n", - "3 * Journal of Free Speech Law,\\n\\nhttps://t.co/... twitter.com \n", - "4 Articles from our 'Civilizational Analysis' va... twitter.com \n", - ".. ... ... \n", - "705 \"I think they know exactly what they do,\" @Ves... twitter.com \n", - "706 \"The Council of the EU is preparing a call to ... twitter.com \n", - "707 Whatever happened to de minimis non curat lex?... twitter.com \n", - "708 Deaton’s book 'concludes that “Joe Biden does ... twitter.com \n", - "709 Exactly as intended! 🥳 “EU firms decreased dat... twitter.com \n", - "\n", - " postURL \n", - "0 https://twitter.com/mbauwens/status/1780618156... \n", - "1 https://twitter.com/mbauwens/status/1780617597... \n", - "2 https://twitter.com/mbauwens/status/1780617135... \n", - "3 https://twitter.com/mbauwens/status/1780616454... \n", - "4 https://twitter.com/mbauwens/status/1780608301... \n", - ".. ... \n", - "705 https://twitter.com/1Br0wn/status/177871456889... \n", - "706 https://twitter.com/1Br0wn/status/177871054091... \n", - "707 https://twitter.com/1Br0wn/status/177869887131... \n", - "708 https://twitter.com/1Br0wn/status/177869075190... \n", - "709 https://twitter.com/1Br0wn/status/177847913970... \n", + "0 []\n", + "1 []\n", + "2 [https://twitter.com/BelTel/status/17857549561...\n", + "3 []\n", + "4 [https://twitter.com/jonny/status/175712945774...\n", + " ... \n", + "7063 [https://www.nature.com/articles/s41581-024-00...\n", + "7064 [https://twitter.com/mpshanahan/status/1790743...\n", + "7065 [https://twitter.com/mindthebrainICN/status/17...\n", + "7066 [https://twitter.com/UCLMentalHealth/status/17...\n", + "7067 [https://www.eventbrite.co.uk/e/insider-outsid...\n", + "Name: urls, Length: 7068, dtype: object\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/j3/1_zy_b4s2517g8j_rjjdhd4h0000gn/T/ipykernel_41416/2646320256.py:5: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", - "[710 rows x 8 columns]\n" + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df['urls'][n] = df['urls'][n] + [df['quoted_tweet'][n]['url']]\n" ] } ], + "source": [ + "for n in range(len(df['quoted_tweet'])):\n", + " \n", + " if type(df['quoted_tweet'][n]) != float:\n", + " \n", + " df['urls'][n] = df['urls'][n] + [df['quoted_tweet'][n]['url']] \n", + "print(df['urls'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], "source": [ "df.rename(columns={'text': 'Text'}, inplace=True)\n", - "print(df)\n" + "df['server'] = 'twitter.com'\n", + "df['ref_count'] = df['urls'].apply(len) \n", + "\n" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#We need to group by conversetion id and sort by creation time\n", + "df['created_at'] = pd.to_datetime(df['created_at'])\n", + "\n", + "# Step 3: Group by 'conversation_id' and sort by 'created_at'\n", + "df = df.sort_values(by=['conversation_id', 'created_at'])" + ] + }, + { + "cell_type": "code", + "execution_count": 
7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " id created_at account_id username \\\n", - "0 1780618156832109056 2024-04-17 15:23:45+00:00 14349894 mbauwens \n", - "1 1780617597089669120 2024-04-17 15:21:32+00:00 14349894 mbauwens \n", - "2 1780617135049322496 2024-04-17 15:19:41+00:00 14349894 mbauwens \n", - "3 1780616454812905984 2024-04-17 15:16:59+00:00 14349894 mbauwens \n", - "4 1780608301534789632 2024-04-17 14:44:35+00:00 14349894 mbauwens \n", - ".. ... ... ... ... \n", - "705 1778714568895729920 2024-04-12 09:19:34+00:00 17900290 1Br0wn \n", - "706 1778710540912562432 2024-04-12 09:03:34+00:00 17900290 1Br0wn \n", - "707 1778698871310348544 2024-04-12 08:17:12+00:00 17900290 1Br0wn \n", - "708 1778690751909470464 2024-04-12 07:44:56+00:00 17900290 1Br0wn \n", - "709 1778479139709710848 2024-04-11 17:44:04+00:00 17900290 1Br0wn \n", + " id created_at author_id \\\n", + "0 1651581145228668928 2023-04-27 13:36:44+00:00 64017773 \n", + "1 1095012246504292352 2019-02-11 17:30:38+00:00 714022989411586048 \n", + "2 1097758010863022080 2019-02-19 07:21:19+00:00 4757597112 \n", + "3 1778091831026446592 2024-04-10 16:05:02+00:00 4901659043 \n", + "4 1778263104142389248 2024-04-11 03:25:37+00:00 4901659043 \n", + "... ... ... ... \n", + "6390 1801318358261841920 2024-06-13 18:18:58+00:00 828361667918569472 \n", + "6391 1801320990917497088 2024-06-13 18:29:25+00:00 49270737 \n", + "6392 1801328344446681344 2024-06-13 18:58:39+00:00 828361667918569472 \n", + "6393 684753729799622656 2016-01-06 15:09:45+00:00 3238593536 \n", + "6394 881142725764145152 2017-07-01 13:29:31+00:00 4757597112 \n", "\n", - " urls \\\n", - "0 [https://en.wikipedia.org/wiki/Epic_of_evoluti... \n", - "1 [https://wiki.p2pfoundation.net/Category:Therm... \n", - "2 [https://lifehacker.com/tech/ai-is-running-out... \n", - "3 [https://www.journaloffreespeechlaw.org/] \n", - "4 [https://wiki.p2pfoundation.net/Andrew_Targows... \n", - ".. ... \n", - "705 [https://www.axios.com/2024/04/10/ai-artificia... \n", - "706 [https://www.euractiv.com/section/digital/news... \n", - "707 [https://www.theguardian.com/society/2024/apr/... \n", - "708 [https://twitter.com/HalSinger/status/17785538... \n", - "709 [https://twitter.com/gateklons/status/17784779... \n", + " conversation_id username \\\n", + "0 1039183605740785664 petersuber \n", + "1 1095012246504292352 DanKotliar \n", + "2 1097758010863022080 EconFeld \n", + "3 1152989885617385472 aj_boston \n", + "4 1199172884146601989 aj_boston \n", + "... ... ... \n", + "6390 1801318358261842036 AnnaCiaunica \n", + "6391 1801320990917496977 KamounLab \n", + "6392 1801328344446681269 AnnaCiaunica \n", + "6393 684753729799622656 robin \n", + "6394 881142725764145152 EconFeld \n", "\n", - " text server \\\n", - "0 A concept you should know about:\\n\\n* The epic... twitter.com \n", - "1 Quotation as selected by The Alternative:\\n\\n... twitter.com \n", - "2 AI is running out of internet:\\n\\n\"AI is runni... twitter.com \n", - "3 * Journal of Free Speech Law,\\n\\nhttps://t.co/... twitter.com \n", - "4 Articles from our 'Civilizational Analysis' va... twitter.com \n", - ".. ... ... \n", - "705 \"I think they know exactly what they do,\" @Ves... twitter.com \n", - "706 \"The Council of the EU is preparing a call to ... twitter.com \n", - "707 Whatever happened to de minimis non curat lex?... twitter.com \n", - "708 Deaton’s book 'concludes that “Joe Biden does ... twitter.com \n", - "709 Exactly as intended! 🥳 “EU firms decreased dat... 
twitter.com \n", + " name \\\n", + "0 Peter Suber (@petersuber@fediscience.org) \n", + "1 Dan Kotliar \n", + "2 Jan Feld \n", + "3 arthur j. boston \n", + "4 arthur j. boston \n", + "... ... \n", + "6390 Anna Ciaunica @annaciaunica.bsky.social \n", + "6391 Sophien Kamoun \n", + "6392 Anna Ciaunica @annaciaunica.bsky.social \n", + "6393 Robin heart \n", + "6394 Jan Feld \n", + "\n", + " urls \\\n", + "0 [https://researchintegrityjournal.biomedcentra... \n", + "1 [https://twitter.com/DmK121/status/10950122465... \n", + "2 [https://theconversation.com/research-shows-st... \n", + "3 [https://twitter.com/aj_boston/status/17780918... \n", + "4 [https://twitter.com/aj_boston/status/17782631... \n", + "... ... \n", + "6390 [https://twitter.com/AnnaCiaunica/status/18013... \n", + "6391 [https://twitter.com/alexandrepedro/status/180... \n", + "6392 [https://twitter.com/marianne_brkr/status/1801... \n", + "6393 [http://www.smule.com/p/381007451_243129440] \n", + "6394 [https://twitter.com/joshuasgoodman/status/880... \n", + "\n", + " Text \\\n", + "0 Update. \"The cost of peer review was estimated... \n", + "1 Super excited about my talk at @CIMethods this... \n", + "2 Do you wonder why in many universities profess... \n", + "3 FINALLY @criterionchannl https://twitter.com/a... \n", + "4 @hbomax Watched: \\nCurb Your Enthusiasm\\n📺 Sea... \n", + "... ... \n", + "6390 Off to Palermo 😎 https://twitter.com/AnnaCiaun... \n", + "6391 Always keep a spare key with someone else. Or ... \n", + "6392 Super cool paper here 👇🏼 https://twitter.com/m... \n", + "6393 Awesome cover of \"Let me be there\" via #Smule:... \n", + "6394 I agree, excellent title and paper Mr. @uZoeli... \n", "\n", - " tootURL \n", - "0 https://twitter.com/mbauwens/status/1780618156... \n", - "1 https://twitter.com/mbauwens/status/1780617597... \n", - "2 https://twitter.com/mbauwens/status/1780617135... \n", - "3 https://twitter.com/mbauwens/status/1780616454... \n", - "4 https://twitter.com/mbauwens/status/1780608301... \n", - ".. ... \n", - "705 https://twitter.com/1Br0wn/status/177871456889... \n", - "706 https://twitter.com/1Br0wn/status/177871054091... \n", - "707 https://twitter.com/1Br0wn/status/177869887131... \n", - "708 https://twitter.com/1Br0wn/status/177869075190... \n", - "709 https://twitter.com/1Br0wn/status/177847913970... \n", + " url \\\n", + "0 https://x.com/petersuber/status/16515811452286... \n", + "1 https://x.com/DanKotliar/status/10950122465042... \n", + "2 https://x.com/EconFeld/status/1097758010863022080 \n", + "3 https://x.com/aj_boston/status/177809183102644... \n", + "4 https://x.com/aj_boston/status/177826310414238... \n", + "... ... \n", + "6390 https://x.com/AnnaCiaunica/status/180131835826... \n", + "6391 https://x.com/KamounLab/status/180132099091749... \n", + "6392 https://x.com/AnnaCiaunica/status/180132834444... \n", + "6393 https://x.com/robin/status/684753729799622656 \n", + "6394 https://x.com/EconFeld/status/881142725764145152 \n", "\n", - "[710 rows x 8 columns]\n" + " quoted_tweet server \\\n", + "0 NaN twitter.com \n", + "1 NaN twitter.com \n", + "2 NaN twitter.com \n", + "3 NaN twitter.com \n", + "4 NaN twitter.com \n", + "... ... ... \n", + "6390 NaN twitter.com \n", + "6391 {'id': '1800911525278081311', 'created_at': '2... twitter.com \n", + "6392 {'id': '1801262094307766300', 'created_at': '2... twitter.com \n", + "6393 NaN twitter.com \n", + "6394 NaN twitter.com \n", + "\n", + " ref_count \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "... ... 
\n", + "6390 1 \n", + "6391 2 \n", + "6392 2 \n", + "6393 1 \n", + "6394 1 \n", + "\n", + "[6395 rows x 12 columns]\n" + ] + } + ], + "source": [ + "#Now filter conversations to 10min to approximate threads\n", + "def filter_by_time_difference(group):\n", + " earliest_time = group['created_at'].min()\n", + " return group[group['created_at'] - earliest_time <= pd.Timedelta(minutes=10)]\n", + "\n", + "filtered_df = df.groupby('conversation_id').apply(filter_by_time_difference).reset_index(drop=True)\n", + "\n", + "# Verify the result\n", + "print(filtered_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count\n", + "1 3940\n", + "2 460\n", + "3 125\n", + "4 43\n", + "5 36\n", + "6 26\n", + "7 15\n", + "8 14\n", + "9 10\n", + "10 6\n", + "11 8\n", + "12 2\n", + "13 3\n", + "14 3\n", + "16 1\n", + "17 3\n", + "25 1\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# Count the number of rows that share the same value in 'conversation_id'\n", + "conversation_counts = filtered_df['conversation_id'].value_counts()\n", + "\n", + "# Display the counts\n", + "\n", + "distribution_of_counts = conversation_counts.value_counts().sort_index()\n", + "\n", + "print(distribution_of_counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "wandb version 0.17.2 is available! To upgrade, please run:\n", + " $ pip install wandb --upgrade" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.16.6" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /Users/shaharorielkagan/sensemakers/nlp/notebooks/wandb/run-20240624_105928-3fvr9vjh" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run soft-snowball-2 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/common-sense-makers/post_type_statistics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/common-sense-makers/post_type_statistics/runs/3fvr9vjh" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e2b07cfe043a4360be38b716cad95fe8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\\r'), FloatProgress(value=1.0, max=1.0)))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run soft-snowball-2 at: https://wandb.ai/common-sense-makers/post_type_statistics/runs/3fvr9vjh
View project at: https://wandb.ai/common-sense-makers/post_type_statistics
Synced 4 W&B file(s), 2 media file(s), 2 artifact file(s) and 0 other file(s)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Find logs at: ./wandb/run-20240624_105928-3fvr9vjh/logs" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import wandb\n", + "# Initialize wandb\n", + "wandb.init(project=\"post_type_statistics\")\n", + "\n", + "\n", + "\n", + "# Prepare data for logging\n", + "data = [[count, num_conversations] for count, num_conversations in distribution_of_counts.items()]\n", + "\n", + "# Log data to wandb\n", + "wandb.log({\n", + " \"conversation_length_distribution\": wandb.Table(\n", + " columns=[\"Number of Rows per Conversation\", \"Number of Conversations\"],\n", + " data=data\n", + " )\n", + "})\n", + "\n", + "# Plot the distribution with wandb\n", + "wandb.log({\n", + " \"conversation_length_distribution_plot\": wandb.plot.bar(\n", + " wandb.Table(data=data, columns=[\"Number of Rows per Conversation\", \"Number of Conversations\"]),\n", + " \"Number of Rows per Conversation\",\n", + " \"Number of Conversations\",\n", + " title=\"Distribution of Conversation Lengths\"\n", + " )\n", + "})\n", + "\n", + "wandb.run.finish()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 0\n", + "1 0\n", + "2 0\n", + "3 0\n", + "4 0\n", + " ..\n", + "4691 0\n", + "4692 0\n", + "4693 0\n", + "4694 0\n", + "4695 0\n", + "Name: thread, Length: 4696, dtype: int64\n" + ] + } + ], + "source": [ + "# Function to concatenate text, merge urls, and set thread\n", + "def collapse_thread(group):\n", + " earliest_item = group.iloc[0].copy()\n", + " if len(group) > 1:\n", + " earliest_item['Text'] = ' /'.join(group['Text'])\n", + " earliest_item['urls'] = list(set(url for sublist in group['urls'] for url in sublist))\n", + " earliest_item['thread'] = 1\n", + " else:\n", + " earliest_item['thread'] = 0\n", + " return earliest_item\n", + "\n", + "# Apply the function to each group\n", + "collapsed_df = filtered_df.groupby('conversation_id').apply(collapse_thread).reset_index(drop=True)\n", + "\n", + "# Verify the result\n", + "print(collapsed_df['thread'])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "756" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "collapsed_df['thread'].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "df_handles = pd.read_json('/Users/shaharorielkagan/Downloads/userAccounts.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df_handles['server'] = 'twitter.com'\n", + "df_handles.rename(columns={'description': 'info'}, inplace=True)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "del df_handles['entities']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "llm_type=\"mistralai/mistral-7b-\"\n", + "\n", + "config = init_multi_chain_parser_config(\n", + " llm_type=llm_type,\n", + " post_process_type=\"combined\"\n", + " )\n", + "Eval = TwitterEval(config=config)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "df_eval = Eval.build_post_type_chart(df_handles=df_handles,df=collapsed_df)\n", + "fig1, fig2 = Eval.build_item_type_pie(df=df_eval)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n" ] } ], "source": [ - "df['server'] = 'twitter.com'\n", - "print(df)" + "import pandas as pd\n", + "\n", + "# Example dataframe\n", + "data = {\n", + " 'thread': [1, 0, 1, 1, 0],\n", + " 'cited_research': [1, 0, 0, 1, 1]\n", + "}\n", + "\n", + "df = pd.DataFrame(data)\n", + "count = df[(df['thread'] ==1) & (df['cited_research']==1)].shape[0]\n", + "# Display the shape of the dataframe\n", + "print(count) # Output: (5, 2)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Eval.feed_tweet_type_statistics(df=collapsed_df,name='sbuckshum')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "collapsed_df['citoid_research'] = None" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "df1 = collapsed_df[collapsed_df[\"username\"] == 'sbuckshum']" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.series.Series" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(df1.iloc[1])" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 14, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "\n" + "\u001b[32m2024-06-20 18:03:57.389\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m75\u001b[0m - \u001b[1mInitializing MultiChainParser. 
PostProcessType=combined\u001b[0m\n", + "\u001b[32m2024-06-20 18:03:57.394\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m82\u001b[0m - \u001b[1mInitializing post parsers...\u001b[0m\n", + "\u001b[32m2024-06-20 18:03:57.395\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'multi_refs_tagger' \u001b[0m\n", + "\u001b[32m2024-06-20 18:03:57.437\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'topics' \u001b[0m\n", + "\u001b[32m2024-06-20 18:03:57.448\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'keywords' \u001b[0m\n", + "\u001b[32m2024-06-20 18:03:57.461\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'hashtags' \u001b[0m\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'Series' object has no attribute 'md_ref_urls'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[14], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m Eval\u001b[39m.\u001b[39;49mnested_quotes_citoid(df1\u001b[39m.\u001b[39;49miloc[\u001b[39m1\u001b[39;49m])\n", + "File \u001b[0;32m~/sensemakers/nlp/notebooks/../desci_sense/evaluation/Evaluation_benchmark.py:246\u001b[0m, in \u001b[0;36mTwitterEval.nested_quotes_citoid\u001b[0;34m(self, post, steps, ind)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mnested_quotes_citoid\u001b[39m(\u001b[39mself\u001b[39m,post:RefPost,steps \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m, ind \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m):\n\u001b[1;32m 244\u001b[0m multi_chain_parser \u001b[39m=\u001b[39m MultiChainParser(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconfig)\n\u001b[0;32m--> 246\u001b[0m result \u001b[39m=\u001b[39m multi_chain_parser\u001b[39m.\u001b[39;49mprocess_ref_post(post\u001b[39m=\u001b[39;49mpost,active_list\u001b[39m=\u001b[39;49m[\u001b[39m\"\u001b[39;49m\u001b[39mhashtags\u001b[39;49m\u001b[39m\"\u001b[39;49m])\n\u001b[1;32m 247\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mpost urls\u001b[39m\u001b[39m\"\u001b[39m,result\u001b[39m.\u001b[39mreference_urls)\n\u001b[1;32m 248\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mItem types: \u001b[39m\u001b[39m\"\u001b[39m,result\u001b[39m.\u001b[39mitem_types)\n", + "File \u001b[0;32m~/sensemakers/nlp/notebooks/../desci_sense/shared_functions/parsers/multi_chain_parser.py:241\u001b[0m, in \u001b[0;36mMultiChainParser.process_ref_post\u001b[0;34m(self, post, active_list, unprocessed_urls)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[39mif\u001b[39;00m unprocessed_urls \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 239\u001b[0m unprocessed_urls \u001b[39m=\u001b[39m []\n\u001b[0;32m--> 241\u001b[0m md_dict \u001b[39m=\u001b[39m 
extract_posts_ref_metadata_dict(\n\u001b[1;32m 242\u001b[0m [post],\n\u001b[1;32m 243\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mconfig\u001b[39m.\u001b[39;49mmetadata_extract_config\u001b[39m.\u001b[39;49mextraction_method,\n\u001b[1;32m 244\u001b[0m )\n\u001b[1;32m 245\u001b[0m \u001b[39m# if no filter specified, run all chains\u001b[39;00m\n\u001b[1;32m 246\u001b[0m \u001b[39mif\u001b[39;00m active_list \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/sensemakers/nlp/notebooks/../desci_sense/shared_functions/web_extractors/metadata_extractors.py:167\u001b[0m, in \u001b[0;36mextract_posts_ref_metadata_dict\u001b[0;34m(posts, md_type)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mextract_posts_ref_metadata_dict\u001b[39m(\n\u001b[1;32m 160\u001b[0m posts: List[RefPost],\n\u001b[1;32m 161\u001b[0m md_type: MetadataExtractionType \u001b[39m=\u001b[39m MetadataExtractionType\u001b[39m.\u001b[39mCITOID,\n\u001b[1;32m 162\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Dict[\u001b[39mstr\u001b[39m, RefMetadata]:\n\u001b[1;32m 163\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 164\u001b[0m \u001b[39m Extract all reference urls from posts and fetch metadata for them.\u001b[39;00m\n\u001b[1;32m 165\u001b[0m \u001b[39m Return dict of metadata keyed by url.\u001b[39;00m\n\u001b[1;32m 166\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 167\u001b[0m all_ref_urls \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\u001b[39mset\u001b[39m(flatten([p\u001b[39m.\u001b[39;49mmd_ref_urls() \u001b[39mfor\u001b[39;49;00m p \u001b[39min\u001b[39;49;00m posts])))\n\u001b[1;32m 168\u001b[0m md_dict \u001b[39m=\u001b[39m extract_all_metadata_to_dict(\n\u001b[1;32m 169\u001b[0m all_ref_urls, md_type, max_summary_length\u001b[39m=\u001b[39m\u001b[39m500\u001b[39m\n\u001b[1;32m 170\u001b[0m )\n\u001b[1;32m 171\u001b[0m \u001b[39mreturn\u001b[39;00m md_dict\n", + "File \u001b[0;32m~/sensemakers/nlp/notebooks/../desci_sense/shared_functions/web_extractors/metadata_extractors.py:167\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mextract_posts_ref_metadata_dict\u001b[39m(\n\u001b[1;32m 160\u001b[0m posts: List[RefPost],\n\u001b[1;32m 161\u001b[0m md_type: MetadataExtractionType \u001b[39m=\u001b[39m MetadataExtractionType\u001b[39m.\u001b[39mCITOID,\n\u001b[1;32m 162\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Dict[\u001b[39mstr\u001b[39m, RefMetadata]:\n\u001b[1;32m 163\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 164\u001b[0m \u001b[39m Extract all reference urls from posts and fetch metadata for them.\u001b[39;00m\n\u001b[1;32m 165\u001b[0m \u001b[39m Return dict of metadata keyed by url.\u001b[39;00m\n\u001b[1;32m 166\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 167\u001b[0m all_ref_urls \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(\u001b[39mset\u001b[39m(flatten([p\u001b[39m.\u001b[39;49mmd_ref_urls() \u001b[39mfor\u001b[39;00m p \u001b[39min\u001b[39;00m posts])))\n\u001b[1;32m 168\u001b[0m md_dict \u001b[39m=\u001b[39m extract_all_metadata_to_dict(\n\u001b[1;32m 169\u001b[0m all_ref_urls, md_type, max_summary_length\u001b[39m=\u001b[39m\u001b[39m500\u001b[39m\n\u001b[1;32m 170\u001b[0m )\n\u001b[1;32m 171\u001b[0m \u001b[39mreturn\u001b[39;00m md_dict\n", + "File 
\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/core/generic.py:5989\u001b[0m, in \u001b[0;36mNDFrame.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 5982\u001b[0m \u001b[39mif\u001b[39;00m (\n\u001b[1;32m 5983\u001b[0m name \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_internal_names_set\n\u001b[1;32m 5984\u001b[0m \u001b[39mand\u001b[39;00m name \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_metadata\n\u001b[1;32m 5985\u001b[0m \u001b[39mand\u001b[39;00m name \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_accessors\n\u001b[1;32m 5986\u001b[0m \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_info_axis\u001b[39m.\u001b[39m_can_hold_identifiers_and_holds_name(name)\n\u001b[1;32m 5987\u001b[0m ):\n\u001b[1;32m 5988\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m[name]\n\u001b[0;32m-> 5989\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mobject\u001b[39;49m\u001b[39m.\u001b[39;49m\u001b[39m__getattribute__\u001b[39;49m(\u001b[39mself\u001b[39;49m, name)\n", + "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 'md_ref_urls'" ] } ], "source": [ - "print(type(df['urls'][0]))" + "Eval.nested_quotes_citoid(df1.iloc[1])" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "df_handles = df" + "inputs = Eval.dataframe_to_ref_posts(df1)\n", + "#Eval.nested_quotes_citoid_parallel(inputs)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0 twitter.com\n", - "1 twitter.com\n", - "2 twitter.com\n", - "3 twitter.com\n", - "4 twitter.com\n", - "5 twitter.com\n", - "6 twitter.com\n", - "7 twitter.com\n", - "8 twitter.com\n", - "9 twitter.com\n", - "10 twitter.com\n", - "11 twitter.com\n", - "12 twitter.com\n", - "13 twitter.com\n", - "14 twitter.com\n", - "Name: server, dtype: object\n" + "\n" ] } ], "source": [ - "df_handles['server'] = 'twitter.com'\n", - "print(df_handles['server'])" + "print(type(inputs[0]))" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 3, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " username id \\\n", - "0 mbauwens 14349894 \n", - "1 AnnaLeptikon 550845228 \n", - "2 nickmvincent 890345761690595328 \n", - "3 LChoshen 1006797311593377792 \n", - "4 yoginho 91583793 \n", - "5 CAdjovu 1364791141267251200 \n", - "6 EugeneVinitsky 1054895671164063744 \n", - "7 lizilla93 2787254215 \n", - "8 danwilliamsphil 217942278 \n", - "9 CriticalAI 1290793547122302976 \n", - "10 BrettButtliere 561750764 \n", - "11 jbakcoleman 1419907861 \n", - "12 yoavartzi 322460769 \n", - "13 ChombaBupe 1248939194 \n", - "14 1Br0wn 17900290 \n", - "\n", - " info \\\n", - "0 updates on p2p and commons developments; peer,... \n", - "1 nobody. Interested in cognitive science, ratio... \n", - "2 Assistant professor @SFU_CompSci, HCI, HCML, w... \n", - "3 🥇 #NLProc researcher\\n🥈 Opinionatedly Summariz... \n", - "4 I'm on Mastodon: \\n@yoginho@spore.social\\n\\nht... \n", - "5 Director @ledgerback | Curator @_distroid | In... \n", - "6 Anti-cynic. Artificial narrow intelligence. Au... \n", - "7 Scientist @ArcadiaScience working on computati... 
\n", - "8 Philosopher, University of Sussex. Tweets in p... \n", - "9 Critical AI's first issue out: https://read.du... \n", - "10 developing the future of science and society. ... \n", - "11 Associate Research Scientist @columbiajourn. A... \n", - "12 Research/prof @cs_cornell + @cornell_tech🚡 / h... \n", - "13 Tech entrepreneur | machine intelligence https... \n", - "14 💻 regulation/policy (🔑 #privacy 🗳) ⛷🚴🏻‍♂️🥾🗺 Vi... \n", - "\n", - " name server \n", - "0 Michel Bauwens twitter.com \n", - "1 Anna Riedl twitter.com \n", - "2 Nick Vincent twitter.com \n", - "3 ♻️ Leshem Choshen ♻️ twitter.com \n", - "4 Yogi Jaeger 💙 @yoginho@spore.social twitter.com \n", - "5 Charles Adjovu twitter.com \n", - "6 Eugene Vinitsky twitter.com \n", - "7 Elizabeth McDaniel twitter.com \n", - "8 Dan Williams twitter.com \n", - "9 Critical AI : first issue out! https://read.du... twitter.com \n", - "10 Zr. Nabu Kudurru twitter.com \n", - "11 Joe Bak-Coleman twitter.com \n", - "12 Yoav Artzi twitter.com \n", - "13 Chomba Bupe twitter.com \n", - "14 Ian Brown 🇮🇨 🦣 🦋 twitter.com \n" + "\u001b[32m2024-06-20 16:50:22.610\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mdesci_sense.shared_functions.utils\u001b[0m:\u001b[36munshorten_url\u001b[0m:\u001b[36m123\u001b[0m - \u001b[33m\u001b[1m[unshorten_url] RequestException for url https://www.teqsa.gov.au/About-us/engagement/consultation\u001b[0m\n" ] }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Python(12627) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.\n" + "author='TEQSA' content='With leading Australian experts, TEQSA has supported the development of assessment reform guiding principles to support the #HigherEd sector in responding to the opportunities and risks posed by #GenAI.\\n\\nFeedback on the principles closes 20 October.\\n\\n🤖 https://www.teqsa.gov.au/About-us/engagement/consultation https://twitter.com/TEQSAGov/status/1707292887883628989/photo/1' url='https://x.com/TEQSAGov/status/1707292887883628989' created_at=datetime.datetime(2023, 9, 28, 7, 15, 19, tzinfo=datetime.timezone.utc) metadata={'allSameType': True, 'combinedMediaUrl': None, 'communityNote': None, 'conversationID': '1707292887883628989', 'date': 'Thu Sep 28 07:15:19 +0000 2023', 'date_epoch': 1695885319, 'hasMedia': True, 'hashtags': ['HigherEd', 'GenAI'], 'likes': 49, 'mediaURLs': ['https://pbs.twimg.com/media/F7GE9z1aMAEU6k0.jpg'], 'media_extended': [{'altText': 'Consultation. Assessment reform for the age of artificial intelligence. Feedbackcloses Friday 20 October. 
teqsa.gov.au/consultation', 'size': {'height': 1080, 'width': 1080}, 'thumbnail_url': 'https://pbs.twimg.com/media/F7GE9z1aMAEU6k0.jpg', 'type': 'image', 'url': 'https://pbs.twimg.com/media/F7GE9z1aMAEU6k0.jpg'}], 'pollData': None, 'possibly_sensitive': False, 'qrt': None, 'qrtURL': None, 'replies': 0, 'retweets': 29, 'text': 'With leading Australian experts, TEQSA has supported the development of assessment reform guiding principles to support the #HigherEd sector in responding to the opportunities and risks posed by #GenAI.\\n\\nFeedback on the principles closes 20 October.\\n\\n🤖 https://www.teqsa.gov.au/About-us/engagement/consultation https://t.co/v9d6MeF5hi', 'tweetID': '1707292887883628989', 'tweetURL': 'https://twitter.com/TEQSAGov/status/1707292887883628989', 'user_name': 'TEQSA', 'user_profile_image_url': 'https://pbs.twimg.com/profile_images/765051369707311104/qSUt7iaC_normal.jpg', 'user_screen_name': 'TEQSAGov'} source_network='twitter' ref_urls=['https://www.teqsa.gov.au/About-us/engagement/consultation']\n" ] } ], "source": [ - "del df_handles['entities']\n", - "print(df_handles)" + "p=scrape_post('https://twitter.com/TEQSAGov/status/1707292887883628989')\n", + "print(p)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "fig1.savefig('example_figure.png')\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 15, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Python(12883) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.\n", - "Python(12884) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.\n", - "Python(12885) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.\n", - "Python(12886) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.\n" - ] + "data": { + "text/html": [ + "Finishing last run (ID:9h7tode8) before initializing another..." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "04cd33810e9144fab45e7485a9416413", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\\r'), FloatProgress(value=1.0, max=1.0)))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run desert-cherry-1 at: https://wandb.ai/common-sense-makers/post_type_stat/runs/9h7tode8
View project at: https://wandb.ai/common-sense-makers/post_type_stat
Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Find logs at: ./wandb/run-20240620_120255-9h7tode8/logs" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Successfully finished last run (ID:9h7tode8). Initializing new run:
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a08b3a80ec034374a0f22065f5968e8e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Label(value='Waiting for wandb.init()...\\r'), FloatProgress(value=0.011114446754153405, max=1.0…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "wandb version 0.17.2 is available! To upgrade, please run:\n", + " $ pip install wandb --upgrade" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { @@ -422,7 +870,7 @@ { "data": { "text/html": [ - "Run data is saved locally in /Users/shaharorielkagan/sensemakers/nlp/notebooks/wandb/run-20240418_145303-1h780x8q" + "Run data is saved locally in /Users/shaharorielkagan/sensemakers/nlp/notebooks/wandb/run-20240620_120330-kx4uq3n0" ], "text/plain": [ "" @@ -434,7 +882,7 @@ { "data": { "text/html": [ - "Syncing run easy-dew-8 to Weights & Biases (docs)
" + "Syncing run vague-deluge-2 to Weights & Biases (docs)
" ], "text/plain": [ "" @@ -446,7 +894,7 @@ { "data": { "text/html": [ - " View project at https://wandb.ai/common-sense-makers/filter_evaluation" + " View project at https://wandb.ai/common-sense-makers/post_type_stat" ], "text/plain": [ "" @@ -458,7 +906,7 @@ { "data": { "text/html": [ - " View run at https://wandb.ai/common-sense-makers/filter_evaluation/runs/1h780x8q" + " View run at https://wandb.ai/common-sense-makers/post_type_stat/runs/kx4uq3n0" ], "text/plain": [ "" @@ -470,12 +918,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "172f3e79de4d47e285937a321111e345", + "model_id": "d4d758ed427340de9a5e933b2db8643b", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "VBox(children=(Label(value='0.301 MB of 0.301 MB uploaded\\r'), FloatProgress(value=1.0, max=1.0)))" + "VBox(children=(Label(value='3.910 MB of 3.910 MB uploaded\\r'), FloatProgress(value=1.0, max=1.0)))" ] }, "metadata": {}, @@ -484,7 +932,7 @@ { "data": { "text/html": [ - " View run easy-dew-8 at: https://wandb.ai/common-sense-makers/filter_evaluation/runs/1h780x8q
View project at: https://wandb.ai/common-sense-makers/filter_evaluation
Synced 4 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)" + " View run vague-deluge-2 at: https://wandb.ai/common-sense-makers/post_type_stat/runs/kx4uq3n0
View project at: https://wandb.ai/common-sense-makers/post_type_stat
Synced 4 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)" ], "text/plain": [ "" @@ -496,7 +944,7 @@ { "data": { "text/html": [ - "Find logs at: ./wandb/run-20240418_145303-1h780x8q/logs" + "Find logs at: ./wandb/run-20240620_120330-kx4uq3n0/logs" ], "text/plain": [ "" @@ -507,11 +955,11 @@ } ], "source": [ - "wandb.init(project=\"filter_evaluation\")\n", + "wandb.init(project=\"post_type_stat\")\n", "artifact = wandb.Artifact(\"non_labeled_tweets\", type=\"dataset\")\n", "\n", "# Create a wandb.Table from the Pandas DataFrame\n", - "table1 = wandb.Table(dataframe=df)\n", + "table1 = wandb.Table(dataframe=collapsed_df)\n", "table2 = wandb.Table(dataframe=df_handles)\n", "\n", "\n", @@ -677,16 +1125,6 @@ "wandb.run.finish()" ] }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "#add a column that counts references\n", - "df['ref_count'] = df['urls'].apply(len)" - ] - }, { "cell_type": "code", "execution_count": 6, @@ -1935,6 +2373,21 @@ "print(df_labeled)" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Thread stat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 51, @@ -2077,69 +2530,6 @@ "wandb.run.finish()" ] }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
usernameidinfonameserver
0mbauwens14349894updates on p2p and commons developments; peer,...Michel Bauwenstwitter.com
\n", - "
" - ], - "text/plain": [ - " username id info \\\n", - "0 mbauwens 14349894 updates on p2p and commons developments; peer,... \n", - "\n", - " name server \n", - "0 Michel Bauwens twitter.com " - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_handles[df_handles['username']=='mbauwens']" - ] - }, { "cell_type": "code", "execution_count": 8, @@ -2369,129 +2759,6 @@ "\n", "wandb.run.finish()" ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'batch_size': 5, 'wandb_config': \"entity='common-sense-makers' project='st-demo-sandbox'\", 'parser_configs': [\"name='refs_tagger' type= llm_config=LLMConfig(llm_type='mistralai/mixtral-8x7b-instruct:nitro', temperature='0.6') use_metadata=True is_ref=True\", \"name='multi_refs_tagger' type= llm_config=LLMConfig(llm_type='mistralai/mixtral-8x7b-instruct:nitro', temperature='0.6') use_metadata=True is_multi_ref=True\", \"name='topics' type= llm_config=LLMConfig(llm_type='mistralai/mixtral-8x7b-instruct:nitro', temperature='0.6') use_metadata=True is_topic=True\", \"name='keywords' type= llm_config=LLMConfig(llm_type='mistralai/mixtral-8x7b-instruct:nitro', temperature='0.6') use_metadata=True max_keywords=6\", \"name='hashtags' type= llm_config=LLMConfig(llm_type='mistralai/mistral-7b-instruct', temperature='0.6') use_metadata=False max_hashtags=20\"], 'post_process_type': 'PostProcessType.COMBINED', 'openrouter_api_config': \"openrouter_api_base='https://openrouter.ai/api/v1' openrouter_api_key='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' openrouter_referer='http://localhost:3000'\", 'metadata_extract_config': \"extraction_method= max_summary_length=500\"}\n" - ] - } - ], - "source": [ - "print(dataset_run_id.config)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " A B C\n", - "0 1 4 7\n", - "1 2 5 8\n", - "2 3 6 9\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "data = {\n", - " 'A': [1, 2, 3],\n", - " 'B': [4, 5, 6],\n", - " 'C': [7, 8, 9]\n", - "}\n", - "df = pd.DataFrame(data)\n", - "print(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "6" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.iloc[2,1]" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "list = []\n", - "list.extend([1,2])" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1, 2]\n" - ] - } - ], - "source": [ - "print(list)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[1 2 3]\n", - " [4 5 6]\n", - " [7 8 9]]\n", - "Element at row 2, column 3: 6\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "\n", - "# Creating a NumPy matrix (2D array)\n", - "matrix = np.array([\n", - " [1, 2, 3],\n", - " [4, 5, 6],\n", - " [7, 8, 9]\n", - "])\n", - "print(matrix)\n", - "# Accessing the element at row 2, column 3 (1-based description)\n", - "element = matrix[1, 2] # Using zero-based indexing, so 1 is the second row, 2 is the third 
column\n", - "\n", - "print(\"Element at row 2, column 3:\", element)\n" - ] } ], "metadata": { diff --git a/nlp/notebooks/add_item_types_firebase.ipynb b/nlp/notebooks/add_item_types_firebase.ipynb new file mode 100644 index 00000000..b6aa4ddc --- /dev/null +++ b/nlp/notebooks/add_item_types_firebase.ipynb @@ -0,0 +1,208 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "from pathlib import Path\n", + "from datetime import datetime\n", + "import re\n", + "\n", + "import sys\n", + "sys.path.append(\"../\")\n", + "\n", + "from typing import List\n", + "from desci_sense.shared_functions.init import init_multi_chain_parser_config\n", + "from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser\n", + "from desci_sense.evaluation.utils import get_dataset, obj_to_json, obj_str_to_dict\n", + "from desci_sense.shared_functions.dataloaders import (\n", + " scrape_post,\n", + " convert_text_to_ref_post,\n", + ")\n", + "from desci_sense.shared_functions.configs import (\n", + " OpenrouterAPIConfig,\n", + " WandbConfig,\n", + " LLMConfig,\n", + " KeywordPParserChainConfig,\n", + " RefTaggerChainConfig,\n", + " TopicsPParserChainConfig,\n", + " validate_env_var,\n", + " MultiParserChainConfig,\n", + " ParserChainType,\n", + " PostProcessType,\n", + ") " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from desci_sense.shared_functions.interface import (\n", + " RDFTriplet,\n", + " isAConceptDefintion,\n", + " KeywordConceptDefinition,\n", + " ParserSupport,\n", + " ParserResult,\n", + " OntologyInterface,\n", + " ZoteroItemTypeDefinition,\n", + " )\n", + "from rdflib.namespace import RDF\n", + "from rdflib import URIRef, Literal, Graph" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "TEST_POST_TEXT_W_REF = \"\"\"\n", + "I really liked this paper!\n", + "https://arxiv.org/abs/2402.04607\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2024-07-08 15:57:35.197\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mInitializing MultiChainParser. 
PostProcessType=combined\u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.199\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m83\u001b[0m - \u001b[1mInitializing post parsers...\u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.199\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'multi_refs_tagger' \u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.241\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'topics' \u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.268\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'keywords' \u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.296\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.post_parser_chain\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m27\u001b[0m - \u001b[1mInitializing parser chain 'hashtags' \u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.411\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mdesci_sense.shared_functions.web_extractors.citoid\u001b[0m:\u001b[36mbefore_retry\u001b[0m:\u001b[36m84\u001b[0m - \u001b[33m\u001b[1mRetry attempt 1\u001b[0m\n", + "\u001b[32m2024-07-08 15:57:35.412\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mdesci_sense.shared_functions.web_extractors.citoid\u001b[0m:\u001b[36mfetch_citation\u001b[0m:\u001b[36m154\u001b[0m - \u001b[34m\u001b[1mfetching citoid data for: https://arxiv.org/abs/2402.04607\u001b[0m\n", + "\u001b[32m2024-07-08 15:57:37.558\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36mprocess_ref_post\u001b[0m:\u001b[36m265\u001b[0m - \u001b[34m\u001b[1mProcessing post with parsers: ['multi_refs_tagger', 'topics', 'keywords', 'hashtags']\u001b[0m\n", + "\u001b[32m2024-07-08 15:57:37.559\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36mprocess_ref_post\u001b[0m:\u001b[36m267\u001b[0m - \u001b[34m\u001b[1mInstantiating prompts...\u001b[0m\n", + "\u001b[32m2024-07-08 15:57:37.560\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mdesci_sense.shared_functions.parsers.multi_chain_parser\u001b[0m:\u001b[36mprocess_ref_post\u001b[0m:\u001b[36m272\u001b[0m - \u001b[34m\u001b[1mInvoking parallel chain...\u001b[0m\n" + ] + } + ], + "source": [ + "multi_config = init_multi_chain_parser_config(llm_type='google/gemma-7b-it:free',\n", + " post_process_type=\"combined\")\n", + "multi_config.post_process_type = PostProcessType.COMBINED\n", + "mcp = MultiChainParser(multi_config)\n", + "res = mcp.process_text(TEST_POST_TEXT_W_REF)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['https://arxiv.org/abs/2402.04607']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res.reference_urls" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"['preprint']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res.item_types" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def convert_item_types_to_rdf_triplets(item_types: List[str], reference_urls: List[str]) -> List[RDFTriplet]:\n", + " assert len(res.reference_urls) == len(res.item_types)\n", + " triplets = [\n", + " RDFTriplet(\n", + " subject=URIRef(ref_url),\n", + " predicate=URIRef(ZoteroItemTypeDefinition().uri),\n", + " object=Literal(item_type),\n", + " )\n", + " for ref_url, item_type in zip(reference_urls, item_types)\n", + " ]\n", + "\n", + " return triplets" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "ename": "ValidationError", + "evalue": "2 validation errors for RDFTriplet\nsubject.is-instance[Literal]\n Input should be an instance of Literal [type=is_instance_of, input_value='https://arxiv.org/abs/2402.04607', input_type=str]\n For further information visit https://errors.pydantic.dev/2.6/v/is_instance_of\nsubject.is-instance[URIRef]\n Input should be an instance of URIRef [type=is_instance_of, input_value='https://arxiv.org/abs/2402.04607', input_type=str]\n For further information visit https://errors.pydantic.dev/2.6/v/is_instance_of", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mconvert_item_types_to_rdf_triplets\u001b[49m\u001b[43m(\u001b[49m\u001b[43mres\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mitem_types\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mres\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreference_urls\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[12], line 3\u001b[0m, in \u001b[0;36mconvert_item_types_to_rdf_triplets\u001b[0;34m(item_types, reference_urls)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mconvert_item_types_to_rdf_triplets\u001b[39m(item_types: List[\u001b[38;5;28mstr\u001b[39m], reference_urls: List[\u001b[38;5;28mstr\u001b[39m]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[RDFTriplet]:\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(res\u001b[38;5;241m.\u001b[39mreference_urls) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(res\u001b[38;5;241m.\u001b[39mitem_types)\n\u001b[0;32m----> 3\u001b[0m triplets \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mRDFTriplet\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43msubject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mref_url\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredicate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mURIRef\u001b[49m\u001b[43m(\u001b[49m\u001b[43mZoteroItemTypeDefinition\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muri\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;28;43mobject\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mLiteral\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem_type\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mref_url\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem_type\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mzip\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mreference_urls\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mitem_types\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m triplets\n", + "Cell \u001b[0;32mIn[12], line 4\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mconvert_item_types_to_rdf_triplets\u001b[39m(item_types: List[\u001b[38;5;28mstr\u001b[39m], reference_urls: List[\u001b[38;5;28mstr\u001b[39m]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[RDFTriplet]:\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(res\u001b[38;5;241m.\u001b[39mreference_urls) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(res\u001b[38;5;241m.\u001b[39mitem_types)\n\u001b[1;32m 3\u001b[0m triplets \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m----> 4\u001b[0m \u001b[43mRDFTriplet\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43msubject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mref_url\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredicate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mURIRef\u001b[49m\u001b[43m(\u001b[49m\u001b[43mZoteroItemTypeDefinition\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muri\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mobject\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mLiteral\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem_type\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ref_url, item_type \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(reference_urls, item_types)\n\u001b[1;32m 10\u001b[0m ]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m triplets\n", + "File \u001b[0;32m~/anaconda3/envs/asensebot/lib/python3.11/site-packages/pydantic/main.py:171\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 170\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mValidationError\u001b[0m: 2 validation errors for RDFTriplet\nsubject.is-instance[Literal]\n Input should be an instance of Literal [type=is_instance_of, input_value='https://arxiv.org/abs/2402.04607', input_type=str]\n For further information visit https://errors.pydantic.dev/2.6/v/is_instance_of\nsubject.is-instance[URIRef]\n Input should be an instance of URIRef [type=is_instance_of, input_value='https://arxiv.org/abs/2402.04607', input_type=str]\n For further information visit https://errors.pydantic.dev/2.6/v/is_instance_of" + ] + } + ], + "source": [ + "convert_item_types_to_rdf_triplets(res.item_types, res.reference_urls)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "asensebot", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nlp/tests/test_citoid.py b/nlp/tests/test_citoid.py index 1a6b5bb7..d21a1dbb 100644 --- a/nlp/tests/test_citoid.py +++ b/nlp/tests/test_citoid.py @@ -57,8 +57,6 @@ def test_i99(): from urllib.parse import urlparse -from url_normalize import url_normalize - def identify_social_media(url): """ diff --git a/nlp/tests/test_multi_chain_app_interface.py b/nlp/tests/test_multi_chain_app_interface.py index 0dbd83f5..2cdfdfb3 100644 --- a/nlp/tests/test_multi_chain_app_interface.py +++ b/nlp/tests/test_multi_chain_app_interface.py @@ -106,34 +106,34 @@ def test_thread_trim(): ] assert len(res.multi_reference_tagger) == 5 assert res.multi_reference_tagger[3:] == [["default"], ["default"]] - assert TARGET_THREAD_RENDER in res.debug["multi_reference_tagger"]["prompt"] + # assert TARGET_THREAD_RENDER in res.debug["multi_reference_tagger"]["prompt"] assert no_empty_lists(res.multi_reference_tagger) -def test_batch(): - multi_config = MultiParserChainConfig( - parser_configs=[ - MultiRefTaggerChainConfig( - name="multi_ref_tagger", - llm_config=LLMConfig(llm_type="google/gemma-7b-it"), - post_renderer=PostRendererType.THREAD_REF_POST, - ) - ], - post_process_type=PostProcessType.COMBINED, - metadata_extract_config=MetadataExtractionConfig(extraction_method="citoid"), - ) - mcp = MultiChainParser(multi_config) - thread = get_thread_1() - pi_1 = ParserInput(thread_post=thread, max_posts=1) - pi_2 = ParserInput(thread_post=thread, max_posts=3) - pi_3 = ParserInput(thread_post=thread, max_posts=4) - batch = [pi_1, pi_2, pi_3] - res = mcp.batch_process_parser_inputs(batch) - assert len(res[0].debug["multi_reference_tagger"]["reasoning"]) == 1 - assert len(res[1].debug["multi_reference_tagger"]["reasoning"]) == 3 - assert len(res[2].debug["multi_reference_tagger"]["reasoning"]) == 4 - for result in res: - assert no_empty_lists(result.multi_reference_tagger) +# def test_batch(): +# multi_config = MultiParserChainConfig( +# parser_configs=[ +# MultiRefTaggerChainConfig( +# name="multi_ref_tagger", +# llm_config=LLMConfig(llm_type="google/gemma-7b-it"), +# post_renderer=PostRendererType.THREAD_REF_POST, +# ) +# ], +# post_process_type=PostProcessType.COMBINED, +# metadata_extract_config=MetadataExtractionConfig(extraction_method="citoid"), +# ) +# mcp = MultiChainParser(multi_config) +# thread = 
get_thread_1() +# pi_1 = ParserInput(thread_post=thread, max_posts=1) +# pi_2 = ParserInput(thread_post=thread, max_posts=3) +# pi_3 = ParserInput(thread_post=thread, max_posts=4) +# batch = [pi_1, pi_2, pi_3] +# res = mcp.batch_process_parser_inputs(batch) +# assert len(res[0].debug["multi_reference_tagger"]["reasoning"]) == 1 +# assert len(res[1].debug["multi_reference_tagger"]["reasoning"]) == 3 +# assert len(res[2].debug["multi_reference_tagger"]["reasoning"]) == 4 +# for result in res: +# assert no_empty_lists(result.multi_reference_tagger) def test_citoid_unprocessed_urls(): @@ -163,8 +163,6 @@ def test_citoid_unprocessed_urls(): # "mistralai/mistral-7b-instruct:free" # "google/gemma-7b-it" if __name__ == "__main__": - thread = get_thread_1() - pi_1 = ParserInput(thread_post=thread, max_posts=1) multi_config = MultiParserChainConfig( parser_configs=[ MultiRefTaggerChainConfig( @@ -177,11 +175,20 @@ def test_citoid_unprocessed_urls(): metadata_extract_config=MetadataExtractionConfig(extraction_method="citoid"), ) mcp = MultiChainParser(multi_config) - res = mcp.process_parser_input(pi_1) - assert res.filter_classification == SciFilterClassfication.CITOID_DETECTED_RESEARCH - - - + thread = get_thread_1() + pi = ParserInput(thread_post=thread, max_posts=1) + res = mcp.process_parser_input(pi) + assert res.reference_urls == [ + "https://x.com/FDAadcomms/status/1798104612635070611", + "https://journals.sagepub.com/doi/10.1177/20451253231198466", + "https://www.youtube.com/watch?feature=youtu.be&si=kjMtNR1Hwe7NZ8as&v=WknlkmJee4E", + "https://x.com/eturnermd1/status/1798046087737180395", + "https://x.com/FDAadcomms/status/1798107142219796794", + ] + assert len(res.multi_reference_tagger) == 5 + assert res.multi_reference_tagger[3:] == [["default"], ["default"]] + assert TARGET_THREAD_RENDER in res.debug["multi_reference_tagger"]["prompt"] + assert no_empty_lists(res.multi_reference_tagger) # parse_request = create_post_request() # multi_config = MultiParserChainConfig( # parser_configs=[ diff --git a/nlp/tests/test_multi_chain_post_processing.py b/nlp/tests/test_multi_chain_post_processing.py index 43610f6e..70d8d900 100644 --- a/nlp/tests/test_multi_chain_post_processing.py +++ b/nlp/tests/test_multi_chain_post_processing.py @@ -11,7 +11,7 @@ import os import pytest from pydantic import ValidationError - +from rdflib import URIRef, Literal, Graph from utils import create_multi_chain_for_tests, create_multi_config_for_tests from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser from desci_sense.shared_functions.filters import SciFilterClassfication @@ -27,10 +27,17 @@ ParserChainType, PostProcessType, ) # Adjust the import as necessary +from desci_sense.shared_functions.postprocessing import ( + convert_item_types_to_rdf_triplets, +) from desci_sense.shared_functions.dataloaders import ( scrape_post, convert_text_to_ref_post, ) +from desci_sense.shared_functions.interface import ( + RDFTriplet, + ZoteroItemTypeDefinition, +) TEST_POST_TEXT_W_REF = """ I really liked this paper! 
@@ -56,6 +63,16 @@ def test_firebase_pp(): res = mcp.process_text(TEST_POST_TEXT_W_REF) len(res.support.refs_meta) == 1 assert res.filter_classification == SciFilterClassfication.CITOID_DETECTED_RESEARCH + # check item types + expected = [ + RDFTriplet( + subject=URIRef("https://arxiv.org/abs/2402.04607"), + predicate=URIRef(ZoteroItemTypeDefinition().uri), + object=Literal("preprint"), + ), + ] + for triplet in expected: + assert (triplet.subject, triplet.predicate, triplet.object) in res.semantics def test_multi_chain_batch_pp_simple(): @@ -110,18 +127,71 @@ def test_multi_chain_batch_pp_combined(): ) -if __name__ == "__main__": - # get a few posts for input - urls = [ - "https://mastodon.social/@psmaldino@qoto.org/111405098400404613", - "https://mastodon.social/@UlrikeHahn@fediscience.org/111732713776994953", - "https://mastodon.social/@ronent/111687038322549430", +def test_convert_item_types_to_rdf_triplets_single_entry(): + item_types = ["preprint"] + reference_urls = ["https://arxiv.org/abs/2402.04607"] + result = convert_item_types_to_rdf_triplets(item_types, reference_urls) + + expected = [ + RDFTriplet( + subject=URIRef("https://arxiv.org/abs/2402.04607"), + predicate=URIRef(ZoteroItemTypeDefinition().uri), + object=Literal("preprint"), + ) ] - posts = [scrape_post(url) for url in urls] - multi_config = create_multi_config_for_tests(llm_type="google/gemma-7b-it") - multi_chain_parser = MultiChainParser(multi_config) - multi_chain_parser.config.post_process_type = PostProcessType.FIREBASE - res = multi_chain_parser.batch_process_ref_posts(posts) + + assert len(result) == len(expected) + for res, exp in zip(result, expected): + assert res.subject == exp.subject + assert res.predicate == exp.predicate + assert res.object == exp.object + + +def test_convert_item_types_to_rdf_triplets_multiple_entries(): + item_types = ["journalArticle", "book"] + reference_urls = ["https://example.com/article1", "https://example.com/book1"] + result = convert_item_types_to_rdf_triplets(item_types, reference_urls) + + expected = [ + RDFTriplet( + subject=URIRef("https://example.com/article1"), + predicate=URIRef(ZoteroItemTypeDefinition().uri), + object=Literal("journalArticle"), + ), + RDFTriplet( + subject=URIRef("https://example.com/book1"), + predicate=URIRef(ZoteroItemTypeDefinition().uri), + object=Literal("book"), + ), + ] + + assert len(result) == len(expected) + for res, exp in zip(result, expected): + assert res.subject == exp.subject + assert res.predicate == exp.predicate + assert res.object == exp.object + + +def test_convert_item_types_to_rdf_triplets_empty(): + item_types = [] + reference_urls = [] + result = convert_item_types_to_rdf_triplets(item_types, reference_urls) + assert result == [] + + +def test_convert_item_types_to_rdf_triplets_mismatched_lengths(): + item_types = ["preprint", "book"] + reference_urls = ["https://arxiv.org/abs/2402.04607"] + + with pytest.raises(AssertionError): + convert_item_types_to_rdf_triplets(item_types, reference_urls) + + +if __name__ == "__main__": + multi_config = create_multi_config_for_tests() + multi_config.post_process_type = PostProcessType.COMBINED + mcp = MultiChainParser(multi_config) + res = mcp.process_text(TEST_POST_TEXT_W_REF) # len(res.support.refs_meta) == 1 # assert "test" in mcp.pparsers diff --git a/nlp/tests/test_post_app_interface.py b/nlp/tests/test_post_app_interface.py index b5db3242..872f0ee3 100644 --- a/nlp/tests/test_post_app_interface.py +++ b/nlp/tests/test_post_app_interface.py @@ -1,5 +1,6 @@ import sys from pathlib 
import Path +import pytest ROOT = Path(__file__).parents[1] sys.path.append(str(ROOT)) @@ -21,7 +22,12 @@ ThreadRefPost, QuoteRefPost, ) -from desci_sense.shared_functions.interface import AppThread, ParsePostRequest +from desci_sense.shared_functions.interface import ( + AppThread, + ParsePostRequest, + Author, + AppPost, +) from desci_sense.shared_functions.dataloaders import scrape_post from desci_sense.shared_functions.dataloaders.twitter.twitter_utils import ( extract_external_ref_urls, @@ -36,6 +42,40 @@ preproc_parser_input, ) +QUOTED_THREAD_I123 = { + "author": { + "platformId": "twitter", + "id": "author_123", + "username": "user123", + "name": "John Doe", + }, + "url": "https://x.com/fchollet/status/1810833882037825646", + "thread": [ + { + "url": "https://x.com/fchollet/status/1810833882037825646", + "content": "The fact is that tech bubbles have very little to do with the technical or even commercial merits of the technology they form around. They can happen with worthless narratives or with entirely grounded ones. They don't even require unrealistic revenue projections!", + } + ], +} + +TEST_POST_I123 = { + "author": { + "platformId": "twitter", + "id": "author_456", + "username": "user546", + "name": "Sarah Gore", + }, + "url": "https://example.com/post/2", + "thread": [ + { + "url": "https://example.com/post/2", + "content": "Something deep alluded to here: - In theory we have free markets as superhuman information processors (Hayek) - In practice, that information processing is limited by the bounded, fallible & biased cognition of individual investors wielding outsized influence https://twitter.com/fchollet/status/1810833882037825646", + "quotedThread": QUOTED_THREAD_I123, + } + ], +} + + TEST_THREAD_INTERFACE_2 = { "url": "https://example.com/post/2", "thread": [ @@ -278,10 +318,67 @@ def test_load_real_thread(): ] +def test_author_platform_id_lowercase(): + author = Author(id="123", name="John Doe", username="johndoe", platformId="Twitter") + assert author.platformId == "twitter" + + +def test_author_invalid_platform_id(): + with pytest.raises(ValidationError): + Author( + id="123", name="John Doe", username="johndoe", platformId=123 + ) # platformId should be a string + + +def test_app_post_normalize_content_urls(): + content = "Check out this tweet: https://twitter.com/user/status/1234567890" + expected_content = "Check out this tweet: https://x.com/user/status/1234567890" + post = AppPost(content=content) + assert post.content == expected_content + + +def test_app_post_normalize_url(): + url = "https://twitter.com/user/status/1234567890" + expected_url = "https://x.com/user/status/1234567890" + post = AppPost(content="Test post", url=url) + assert post.url == expected_url + + +def test_app_post_no_normalization_needed(): + content = "This is a test post with no Twitter URLs." 
+ url = "https://example.com" + post = AppPost(content=content, url=url) + assert post.content == content + assert post.url == url + + +def test_app_thread_normalize_url(): + url = "https://twitter.com/user/status/1234567890" + expected_url = "https://x.com/user/status/1234567890" + author = Author(id="123", name="John Doe", username="johndoe", platformId="twitter") + post = AppPost(content="Test post", url=url) + thread = AppThread(author=author, thread=[post], url=url) + assert thread.url == expected_url + + +def test_app_thread_author(): + author = Author(id="123", name="John Doe", username="johndoe", platformId="Twitter") + post = AppPost(content="Test post") + thread = AppThread(author=author, thread=[post]) + assert thread.author.name == "John Doe" + assert thread.author.platformId == "twitter" + + +def test_i123(): + thread = AppThread.model_validate(TEST_POST_I123) + thread_ref_post = convert_thread_interface_to_ref_post(thread) + assert thread_ref_post.md_ref_urls() == [ + "https://x.com/fchollet/status/1810833882037825646" + ] if __name__ == "__main__": - thread = AppThread.model_validate(TEST_OVERLENGTH_THREAD_INTERFACE) + thread = AppThread.model_validate(TEST_POST_I123) thread_ref_post = convert_thread_interface_to_ref_post(thread) - pi = ParserInput(thread_post=thread_ref_post, max_posts=30) - proc_pi = preproc_parser_input(pi) + # pi = ParserInput(thread_post=thread_ref_post, max_posts=30) + # proc_pi = preproc_parser_input(pi) diff --git a/nlp/tests/test_quoted_post_firebase.py b/nlp/tests/test_quoted_post_firebase.py new file mode 100644 index 00000000..29f4fc91 --- /dev/null +++ b/nlp/tests/test_quoted_post_firebase.py @@ -0,0 +1,150 @@ +import sys +from pathlib import Path + +# https://stackoverflow.com/a/63539722/2882125 +import nest_asyncio + +nest_asyncio.apply() + +ROOT = Path(__file__).parents[1] +sys.path.append(str(ROOT)) +import os +import pytest +from pydantic import ValidationError, BaseModel +from rdflib import URIRef, Literal, Graph +from utils import ( + create_multi_chain_for_tests, + create_multi_config_for_tests, + get_thread_1, + no_empty_lists, + create_post_request, +) +from desci_sense.shared_functions.schema.post import ThreadRefPost, QuoteRefPost +from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser +from desci_sense.shared_functions.configs import ( + OpenrouterAPIConfig, + WandbConfig, + LLMConfig, + MultiRefTaggerChainConfig, + KeywordPParserChainConfig, + RefTaggerChainConfig, + TopicsPParserChainConfig, + validate_env_var, + MultiParserChainConfig, + ParserChainType, + PostProcessType, + PostRendererType, + MetadataExtractionConfig, +) # Adjust the import as necessary +from desci_sense.shared_functions.dataloaders import ( + scrape_post, + convert_text_to_ref_post, +) +from desci_sense.shared_functions.interface import (RDFTriplet, QuotedPostDefinition,) +from desci_sense.shared_functions.preprocessing import ParserInput +from desci_sense.shared_functions.filters import SciFilterClassfication + +TEST_POST_TEXT_W_REF = """ +I really liked this paper! +https://arxiv.org/abs/2402.04607 +""" + +TARGET_THREAD_RENDER = """- Author: Eiko Fried +- Content: After careful consideration, the FDA advisory comission voted today 9:2 that MDMA has *not* been shown to be effective for treating PTSD, given massive concerns around validity threats in this literature. They also voted 10:1 that MDMA has *not* shown to be safe. 
diff --git a/nlp/tests/test_quoted_post_firebase.py b/nlp/tests/test_quoted_post_firebase.py
new file mode 100644
index 00000000..29f4fc91
--- /dev/null
+++ b/nlp/tests/test_quoted_post_firebase.py
@@ -0,0 +1,150 @@
+import sys
+from pathlib import Path
+
+# https://stackoverflow.com/a/63539722/2882125
+import nest_asyncio
+
+nest_asyncio.apply()
+
+ROOT = Path(__file__).parents[1]
+sys.path.append(str(ROOT))
+import os
+import pytest
+from pydantic import ValidationError, BaseModel
+from rdflib import URIRef, Literal, Graph
+from utils import (
+    create_multi_chain_for_tests,
+    create_multi_config_for_tests,
+    get_thread_1,
+    no_empty_lists,
+    create_post_request,
+)
+from desci_sense.shared_functions.schema.post import ThreadRefPost, QuoteRefPost
+from desci_sense.shared_functions.parsers.multi_chain_parser import MultiChainParser
+from desci_sense.shared_functions.configs import (
+    OpenrouterAPIConfig,
+    WandbConfig,
+    LLMConfig,
+    MultiRefTaggerChainConfig,
+    KeywordPParserChainConfig,
+    RefTaggerChainConfig,
+    TopicsPParserChainConfig,
+    validate_env_var,
+    MultiParserChainConfig,
+    ParserChainType,
+    PostProcessType,
+    PostRendererType,
+    MetadataExtractionConfig,
+)  # Adjust the import as necessary
+from desci_sense.shared_functions.dataloaders import (
+    scrape_post,
+    convert_text_to_ref_post,
+)
+from desci_sense.shared_functions.interface import (RDFTriplet, QuotedPostDefinition,)
+from desci_sense.shared_functions.preprocessing import ParserInput
+from desci_sense.shared_functions.filters import SciFilterClassfication
+
+TEST_POST_TEXT_W_REF = """
+I really liked this paper!
+https://arxiv.org/abs/2402.04607
+"""
+
+TARGET_THREAD_RENDER = """- Author: Eiko Fried
+- Content: After careful consideration, the FDA advisory comission voted today 9:2 that MDMA has *not* been shown to be effective for treating PTSD, given massive concerns around validity threats in this literature. They also voted 10:1 that MDMA has *not* shown to be safe.
+@eturnermd1 #MDMAadcomm VOTE 1/2: Do the available data show that the drug is effective in patients with posttraumatic
+stress disorder?
+2-Yes
+9-No
+0-Abstain https://twitter.com/FDAadcomms/status/1798104612635070611/photo/1
+---
+📄Many mentioned reasons overlap with those we summarized recently in our review paper:
+
+
+📺 I also summarize them for a lay audience in this YouTube video:
+
+- References:
+
+url: https://x.com/FDAadcomms/status/1798104612635070611
+item_type: forumPost
+title: Twitter post
+summary: None
+==========
+
+url: https://journals.sagepub.com/doi/10.1177/20451253231198466
+item_type: journalArticle
+title: History repeating: guidelines to address common problems in psychedelic science
+summary: None
+==========
+
+url: https://www.youtube.com/watch?feature=youtu.be&si=kjMtNR1Hwe7NZ8as&v=WknlkmJee4E
+item_type: videoRecording
+title: Psychedelic treatments for mental health problems: promises and pitfalls
+summary: In this lecture, I summarize promises and pitfalls of psychedelic treatments for mental health problems. No scientific background knowledge is required to vi...
+=========="""
+
+
+def test_thread_quoted_url():
+    thread = get_thread_1()
+    pi_1 = ParserInput(thread_post=thread, max_posts=30)
+    multi_config = MultiParserChainConfig(
+        parser_configs=[
+            MultiRefTaggerChainConfig(
+                name="multi_ref_tagger",
+                llm_config=LLMConfig(llm_type="mistralai/mistral-7b-instruct:free"),
+                post_renderer=PostRendererType.THREAD_REF_POST,
+            )
+        ],
+        post_process_type=PostProcessType.COMBINED,
+        metadata_extract_config=MetadataExtractionConfig(extraction_method="citoid"),
+    )
+    mcp = MultiChainParser(multi_config)
+    res = mcp.process_parser_input(pi_1)
+    assert res.quoted_post_url == thread.quoted_url
+    assert thread.quoted_url == 'https://x.com/FDAadcomms/status/1798104612635070611'
+
+
+def test_firebase_quoted_post_pp():
+    thread = get_thread_1()
+    pi_1 = ParserInput(thread_post=thread, max_posts=30)
+    multi_config = MultiParserChainConfig(
+        parser_configs=[
+            MultiRefTaggerChainConfig(
+                name="multi_ref_tagger",
+                llm_config=LLMConfig(llm_type="mistralai/mistral-7b-instruct:free"),
+                post_renderer=PostRendererType.THREAD_REF_POST,
+            )
+        ],
+        post_process_type=PostProcessType.FIREBASE,
+        metadata_extract_config=MetadataExtractionConfig(extraction_method="citoid"),
+    )
+    mcp = MultiChainParser(multi_config)
+    res = mcp.process_parser_input(pi_1)
+    expected = RDFTriplet(
+        predicate=URIRef(QuotedPostDefinition().uri),
+        object=URIRef("https://x.com/FDAadcomms/status/1798104612635070611"),
+    )
+    assert (expected.subject, expected.predicate, expected.object) in res.semantics
+
+
+# "mistralai/mixtral-8x7b-instruct"
+# "mistralai/mistral-7b-instruct:free"
+# "google/gemma-7b-it"
+if __name__ == "__main__":
+    thread = get_thread_1()
+    pi_1 = ParserInput(thread_post=thread, max_posts=30)
+    multi_config = MultiParserChainConfig(
+        parser_configs=[
+            MultiRefTaggerChainConfig(
+                name="multi_ref_tagger",
+                llm_config=LLMConfig(llm_type="mistralai/mistral-7b-instruct:free"),
+                post_renderer=PostRendererType.THREAD_REF_POST,
+            )
+        ],
+        post_process_type=PostProcessType.FIREBASE,
+        metadata_extract_config=MetadataExtractionConfig(extraction_method="citoid"),
+    )
+    mcp = MultiChainParser(multi_config)
+    res = mcp.process_parser_input(pi_1)
+
+
+
+
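
# NOTE: a small, self-contained illustration of the triple-membership check used in
# test_firebase_quoted_post_pp above. It assumes res.semantics behaves like an
# rdflib Graph; the subject and predicate URIs below are placeholders, not the real
# QuotedPostDefinition URI or the test thread's URL.
from rdflib import Graph, URIRef

graph = Graph()
quoting_post = URIRef("https://x.com/user546/status/1")  # placeholder subject
quotes_predicate = URIRef("https://example.org/quotesPost")  # placeholder predicate
quoted_post = URIRef("https://x.com/FDAadcomms/status/1798104612635070611")

graph.add((quoting_post, quotes_predicate, quoted_post))

# rdflib Graphs support `in` for (subject, predicate, object) tuples,
# which is what the test's final assertion relies on.
assert (quoting_post, quotes_predicate, quoted_post) in graph
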
diff --git a/nlp/tests/test_twitter.py b/nlp/tests/test_twitter.py
index 3b99a4ef..e07ecd8d 100644
--- a/nlp/tests/test_twitter.py
+++ b/nlp/tests/test_twitter.py
@@ -1,10 +1,13 @@
 import sys
 from pathlib import Path
+
 ROOT = Path(__file__).parents[1]
 sys.path.append(str(ROOT))

 import pytest
+from desci_sense.shared_functions.utils import normalize_tweet_urls_in_text
+
 from desci_sense.shared_functions.dataloaders import (
     PostScrapeError,
     UnknownSocialMediaTypeError,
@@ -112,6 +115,48 @@ def test_problem_tweet_i31():
     ), f"{case} has_refs? = {tweet.has_refs()} - mismatch with {label}"


+def test_normalize_single_twitter_url():
+    text = "Check out this tweet: https://twitter.com/user/status/1234567890"
+    expected = "Check out this tweet: https://x.com/user/status/1234567890"
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
+def test_normalize_multiple_twitter_urls():
+    text = "First tweet: https://twitter.com/user1/status/1234567890 and second tweet: https://twitter.com/user2/status/0987654321"
+    expected = "First tweet: https://x.com/user1/status/1234567890 and second tweet: https://x.com/user2/status/0987654321"
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
+def test_mixed_urls():
+    text = "Tweet: https://twitter.com/user/status/1234567890 and a non-Twitter URL: https://example.com/page"
+    expected = "Tweet: https://x.com/user/status/1234567890 and a non-Twitter URL: https://example.com/page"
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
+def test_no_twitter_url():
+    text = "This text contains no Twitter URLs, only this: https://example.com/page"
+    expected = "This text contains no Twitter URLs, only this: https://example.com/page"
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
+def test_empty_string():
+    text = ""
+    expected = ""
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
+def test_url_with_http():
+    text = "Check out this tweet: http://twitter.com/user/status/1234567890"
+    expected = "Check out this tweet: https://x.com/user/status/1234567890"
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
+def test_mixed_case_url():
+    text = "Check out this tweet: https://Twitter.com/user/status/1234567890"
+    expected = "Check out this tweet: https://x.com/user/status/1234567890"
+    assert normalize_tweet_urls_in_text(text) == expected
+
+
 if __name__ == "__main__":
     post_url = "https://twitter.com/example/status/1234567890"
     tweet = scrape_post(post_url)
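
# NOTE: the tests above fully specify the contract of normalize_tweet_urls_in_text
# (http/https and any casing of twitter.com are rewritten to https://x.com/, other
# URLs and plain text are untouched, and the empty string maps to itself). A minimal
# regex-based sketch satisfying that contract, not necessarily the project's actual
# implementation, could look like this:
import re

TWITTER_URL_PATTERN = re.compile(r"https?://twitter\.com/", flags=re.IGNORECASE)


def normalize_tweet_urls_in_text_sketch(text: str) -> str:
    # Rewrite every twitter.com URL prefix to the canonical https://x.com/ form.
    return TWITTER_URL_PATTERN.sub("https://x.com/", text)


assert normalize_tweet_urls_in_text_sketch(
    "Check out this tweet: http://Twitter.com/user/status/1234567890"
) == "Check out this tweet: https://x.com/user/status/1234567890"
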