From 5e444d43f9e3a9cadc80184ee78247c16a89d497 Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Sat, 2 Nov 2024 09:01:15 -0700 Subject: [PATCH 1/5] Cleaned up connector --- .../danswer/connectors/freshdesk/connector.py | 138 ++++++++++-------- 1 file changed, 75 insertions(+), 63 deletions(-) diff --git a/backend/danswer/connectors/freshdesk/connector.py b/backend/danswer/connectors/freshdesk/connector.py index b1f1dc1d2df..448c3d95cd9 100644 --- a/backend/danswer/connectors/freshdesk/connector.py +++ b/backend/danswer/connectors/freshdesk/connector.py @@ -21,76 +21,84 @@ logger = setup_logger() +_TICKET_FIELDS_TO_INCLUDE = { + "fr_escalated", + "spam", + "priority", + "source", + "status", + "type", + "is_escalated", + "tags", + "nr_due_by", + "nr_escalated", + "cc_emails", + "fwd_emails", + "reply_cc_emails", + "ticket_cc_emails", + "support_email", + "to_emails", +} + +_SOURCE_TYPES = { + "1": "Email", + "2": "Portal", + "3": "Phone", + "7": "Chat", + "9": "Feedback Widget", + "10": "Outbound Email", +} + +_PRIORITY_TYPES = {"1": "low", "2": "medium", "3": "high", "4": "urgent"} + +_STATUS_TYPES = {"2": "open", "3": "pending", "4": "resolved", "5": "closed"} + + def _create_metadata_from_ticket(ticket: dict) -> dict: - included_fields = { - "fr_escalated", - "spam", - "priority", - "source", - "status", - "type", - "is_escalated", - "tags", - "nr_due_by", - "nr_escalated", - "cc_emails", - "fwd_emails", - "reply_cc_emails", - "ticket_cc_emails", - "support_email", - "to_emails", - } - - metadata = {} - email_data = {} + metadata: dict[str, str | list[str]] = {} + # Combine all emails into a list so there are no repeated emails + email_data: set[str] = set() for key, value in ticket.items(): - if ( - key in included_fields - and value is not None - and value != [] - and value != {} - and value != "[]" - and value != "" - ): - value_to_str = ( - [str(item) for item in value] if isinstance(value, List) else str(value) - ) - if "email" in key: - email_data[key] = 
value_to_str + # Skip fields that aren't useful for embedding + if key not in _TICKET_FIELDS_TO_INCLUDE: + continue + + # Skip empty fields + if not value or value == "[]": + continue + + # Convert strings or lists to strings + stringified_value: str | list[str] + if isinstance(value, list): + stringified_value = [str(item) for item in value] + else: + stringified_value = str(value) + + if "email" in key: + if isinstance(stringified_value, list): + email_data.update(stringified_value) else: - metadata[key] = value_to_str + email_data.add(stringified_value) + else: + metadata[key] = stringified_value if email_data: - metadata["email_data"] = str(email_data) - - # Convert source to human-parsable string - source_types = { - "1": "Email", - "2": "Portal", - "3": "Phone", - "7": "Chat", - "9": "Feedback Widget", - "10": "Outbound Email", - } - if ticket.get("source"): - metadata["source"] = source_types.get( - str(ticket.get("source")), "Unknown Source Type" - ) + metadata["emails"] = list(email_data) - # Convert priority to human-parsable string - priority_types = {"1": "low", "2": "medium", "3": "high", "4": "urgent"} - if ticket.get("priority"): - metadata["priority"] = priority_types.get( - str(ticket.get("priority")), "Unknown Priority" + # Convert source numbers to human-parsable string + if source_number := ticket.get("source"): + metadata["source"] = _SOURCE_TYPES.get( + str(source_number), "Unknown Source Type" ) + # Convert priority numbers to human-parsable string + if priority_number := ticket.get("priority"): + metadata["priority"] = _PRIORITY_TYPES.get(priority_number, "Unknown Priority") + # Convert status to human-parsable string - status_types = {"2": "open", "3": "pending", "4": "resolved", "5": "closed"} - if ticket.get("status"): - metadata["status"] = status_types.get( - str(ticket.get("status")), "Unknown Status" - ) + if status_number := ticket.get("status"): + metadata["status"] = _STATUS_TYPES.get(str(status_number), "Unknown Status") due_by = 
datetime.fromisoformat(ticket["due_by"].replace("Z", "+00:00")) metadata["overdue"] = str(datetime.now(timezone.utc) > due_by) @@ -99,17 +107,21 @@ def _create_metadata_from_ticket(ticket: dict) -> dict: def _create_doc_from_ticket(ticket: dict, domain: str) -> Document: + # Use the ticket description as the text + text = f"Ticket description: {parse_html_page_basic(ticket.get('description_text', ''))}" + metadata = _create_metadata_from_ticket(ticket) + return Document( id=str(ticket["id"]), sections=[ Section( link=f"https://{domain}.freshdesk.com/helpdesk/tickets/{int(ticket['id'])}", - text=f"description: {parse_html_page_basic(ticket.get('description_text', ''))}", + text=text, ) ], source=DocumentSource.FRESHDESK, semantic_identifier=ticket["subject"], - metadata=_create_metadata_from_ticket(ticket), + metadata=metadata, doc_updated_at=datetime.fromisoformat( ticket["updated_at"].replace("Z", "+00:00") ), @@ -146,7 +158,7 @@ def _fetch_tickets( 'include' field available for this endpoint: https://developers.freshdesk.com/api/#filter_tickets """ - if any(attr is None for attr in [self.api_key, self.domain, self.password]): + if self.api_key is None or self.domain is None or self.password is None: raise ConnectorMissingCredentialError("freshdesk") base_url = f"https://{self.domain}.freshdesk.com/api/v2/tickets" From d652cb3141fff4c213d3bb9872fffd839612438a Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Sat, 2 Nov 2024 09:03:42 -0700 Subject: [PATCH 2/5] renamed variables --- .../danswer/connectors/freshdesk/connector.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/backend/danswer/connectors/freshdesk/connector.py b/backend/danswer/connectors/freshdesk/connector.py index 448c3d95cd9..28fa20ec8c2 100644 --- a/backend/danswer/connectors/freshdesk/connector.py +++ b/backend/danswer/connectors/freshdesk/connector.py @@ -40,7 +40,7 @@ "to_emails", } -_SOURCE_TYPES = { +_SOURCE_NUMBER_TYPE_MAP = { "1": "Email", "2": "Portal", 
"3": "Phone", @@ -49,9 +49,9 @@ "10": "Outbound Email", } -_PRIORITY_TYPES = {"1": "low", "2": "medium", "3": "high", "4": "urgent"} +_PRIORITY_NUMBER_TYPE_MAP = {"1": "low", "2": "medium", "3": "high", "4": "urgent"} -_STATUS_TYPES = {"2": "open", "3": "pending", "4": "resolved", "5": "closed"} +_STATUS_NUMBER_TYPE_MAP = {"2": "open", "3": "pending", "4": "resolved", "5": "closed"} def _create_metadata_from_ticket(ticket: dict) -> dict: @@ -88,17 +88,21 @@ def _create_metadata_from_ticket(ticket: dict) -> dict: # Convert source numbers to human-parsable string if source_number := ticket.get("source"): - metadata["source"] = _SOURCE_TYPES.get( + metadata["source"] = _SOURCE_NUMBER_TYPE_MAP.get( str(source_number), "Unknown Source Type" ) # Convert priority numbers to human-parsable string if priority_number := ticket.get("priority"): - metadata["priority"] = _PRIORITY_TYPES.get(priority_number, "Unknown Priority") + metadata["priority"] = _PRIORITY_NUMBER_TYPE_MAP.get( + priority_number, "Unknown Priority" + ) # Convert status to human-parsable string if status_number := ticket.get("status"): - metadata["status"] = _STATUS_TYPES.get(str(status_number), "Unknown Status") + metadata["status"] = _STATUS_NUMBER_TYPE_MAP.get( + str(status_number), "Unknown Status" + ) due_by = datetime.fromisoformat(ticket["due_by"].replace("Z", "+00:00")) metadata["overdue"] = str(datetime.now(timezone.utc) > due_by) From 6e8c88ed71ac0e3ea4df7b2f4f842fbe10721667 Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Sat, 2 Nov 2024 09:05:24 -0700 Subject: [PATCH 3/5] made id more unique --- backend/danswer/connectors/freshdesk/connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/danswer/connectors/freshdesk/connector.py b/backend/danswer/connectors/freshdesk/connector.py index 28fa20ec8c2..30bfee662c7 100644 --- a/backend/danswer/connectors/freshdesk/connector.py +++ b/backend/danswer/connectors/freshdesk/connector.py @@ -20,6 +20,8 @@ logger = 
setup_logger() +_FRESHDESK_ID_PREFIX = "FRESHDESK_" + _TICKET_FIELDS_TO_INCLUDE = { "fr_escalated", @@ -116,7 +118,7 @@ def _create_doc_from_ticket(ticket: dict, domain: str) -> Document: metadata = _create_metadata_from_ticket(ticket) return Document( - id=str(ticket["id"]), + id=_FRESHDESK_ID_PREFIX + str(ticket["id"]), sections=[ Section( link=f"https://{domain}.freshdesk.com/helpdesk/tickets/{int(ticket['id'])}", From 6b4143cc30256177203650854d64f5d8f71a8213 Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Sat, 2 Nov 2024 09:08:26 -0700 Subject: [PATCH 4/5] ID fix --- backend/danswer/connectors/freshdesk/connector.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/backend/danswer/connectors/freshdesk/connector.py b/backend/danswer/connectors/freshdesk/connector.py index 30bfee662c7..7154e62bf9f 100644 --- a/backend/danswer/connectors/freshdesk/connector.py +++ b/backend/danswer/connectors/freshdesk/connector.py @@ -117,11 +117,14 @@ def _create_doc_from_ticket(ticket: dict, domain: str) -> Document: text = f"Ticket description: {parse_html_page_basic(ticket.get('description_text', ''))}" metadata = _create_metadata_from_ticket(ticket) + # This is also used in the ID because it is more unique than just the ticket ID + link = f"https://{domain}.freshdesk.com/helpdesk/tickets/{ticket['id']}" + return Document( - id=_FRESHDESK_ID_PREFIX + str(ticket["id"]), + id=_FRESHDESK_ID_PREFIX + link, sections=[ Section( - link=f"https://{domain}.freshdesk.com/helpdesk/tickets/{int(ticket['id'])}", + link=link, text=text, ) ], From 83c299ebc8e183c9da63bf33b1ef94723f92ff6b Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Sat, 2 Nov 2024 09:09:46 -0700 Subject: [PATCH 5/5] troll logger statement --- backend/danswer/connectors/freshdesk/connector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/danswer/connectors/freshdesk/connector.py b/backend/danswer/connectors/freshdesk/connector.py index 7154e62bf9f..04033b05f3f 100644 ---
a/backend/danswer/connectors/freshdesk/connector.py +++ b/backend/danswer/connectors/freshdesk/connector.py @@ -208,7 +208,6 @@ def _process_tickets( for ticket_batch in self._fetch_tickets(start, end): for ticket in ticket_batch: - logger.info(_create_doc_from_ticket(ticket, self.domain)) doc_batch.append(_create_doc_from_ticket(ticket, self.domain)) if len(doc_batch) >= self.batch_size: