dyvenia · djagoda881 · Feb 15, 2023 · Feb 15, 2023 · Feb 15, 2023 · Feb 21, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added `ExchangeRates` source to the library.
 - Added `from_df()` method to `Azure Data Lake` source
 - Added `SAPRFC` source to the library.
+- Added `Salesforce` source to the library.
 
 ### Changed
 - Added `SQLServerToDF` task

diff --git a/tests/unit/test_salesforce.py b/tests/unit/test_salesforce.py
@@ -0,0 +1,81 @@
+import pandas as pd
+import pytest
+
+from viadot.sources import Salesforce
+
+TABLE_TO_DOWNLOAD = "Account"
+TABLE_TO_UPSERT = "Contact"
+TEST_LAST_NAME = "prefect-viadot-test"
+ID_TO_UPSERT = "0035E00001YGWK3QAP"
+
+
+def _restore_contact(salesforce):
+    """Reset the last name of the test contact to its placeholder value."""
+    df_restored = pd.DataFrame({"Id": [ID_TO_UPSERT], "LastName": ["LastName"]})
+    salesforce.upsert(df=df_restored, table=TABLE_TO_UPSERT)
+
+
+@pytest.fixture(scope="session")
+def salesforce():
+    """Return a Salesforce source built from the `sales_force_dev` config key."""
+    return Salesforce(config_key="sales_force_dev")
+
+
+@pytest.fixture(scope="session")
+def test_df_data(salesforce):
+    """Yield a record keyed by 'Id'; restore the contact when the session ends."""
+    yield pd.DataFrame({"Id": [ID_TO_UPSERT], "LastName": [TEST_LAST_NAME]})
+    _restore_contact(salesforce)
+
+
+@pytest.fixture(scope="session")
+def test_df_external(salesforce):
+    """Yield a record keyed by an external ID; restore the contact when the session ends."""
+    yield pd.DataFrame({"LastName": [TEST_LAST_NAME], "SAPContactId__c": ["111"]})
+    _restore_contact(salesforce)
+
+
+def test_upsert_empty(salesforce):
+    # An empty DataFrame must be a no-op; any exception fails the test.
+    salesforce.upsert(df=pd.DataFrame(), table=TABLE_TO_UPSERT)
+
+
+def test_upsert_external_id_correct(salesforce, test_df_external):
+    salesforce.upsert(
+        df=test_df_external, table=TABLE_TO_UPSERT, external_id="SAPContactId__c"
+    )
+    df = salesforce.to_df(
+        query=f"SELECT ID, LastName FROM {TABLE_TO_UPSERT} WHERE LastName='{TEST_LAST_NAME}'"
+    )
+
+    result = df.values
+    assert result[0][0] == ID_TO_UPSERT
+    assert result[0][1] == TEST_LAST_NAME
+
+
+def test_upsert_external_id_wrong(salesforce, test_df_external):
+    with pytest.raises(ValueError):
+        salesforce.upsert(
+            df=test_df_external, table=TABLE_TO_UPSERT, external_id="SAPId"
+        )
+
+
+def test_download_no_query(salesforce):
+    records = salesforce.download(table=TABLE_TO_DOWNLOAD)
+    assert len(records) > 0
+
+
+def test_download_with_query(salesforce):
+    query = f"SELECT Id, Name FROM {TABLE_TO_DOWNLOAD}"
+    records = salesforce.download(query=query)
+    assert len(records) > 0
+
+
+def test_to_df(salesforce):
+    df = salesforce.to_df(table=TABLE_TO_DOWNLOAD)
+    assert not df.empty
+
+
+def test_upsert(salesforce, test_df_data):
+    # Without `raise_on_error`, a rejected record would only be logged.
+    salesforce.upsert(df=test_df_data, table=TABLE_TO_UPSERT, raise_on_error=True)
diff --git a/viadot/sources/__init__.py b/viadot/sources/__init__.py
@@ -11,3 +11,4 @@
 from .s3 import S3
 from .sharepoint import Sharepoint
 from .redshift_spectrum import RedshiftSpectrum
+from .salesforce import Salesforce
diff --git a/viadot/sources/salesforce.py b/viadot/sources/salesforce.py
@@ -0,0 +1,234 @@
+from typing import Any, Dict, List, Literal, OrderedDict
+
+import pandas as pd
+from simple_salesforce import Salesforce as SF
+from simple_salesforce.exceptions import SalesforceMalformedRequest
+from viadot.config import get_source_credentials
+from viadot.exceptions import CredentialError
+from viadot.sources.base import Source
+
+
+class Salesforce(Source):
+    """
+    A class for pulling data from and upserting data into Salesforce.
+
+    Args:
+        domain (str): Domain of the connection; defaults to 'test' (sandbox). Can be added only if built-in username/password/security token is provided.
+        client_id (str): Client ID used to keep track of API calls.
+        credentials (dict): Credentials to connect with. If not provided, they are read from the viadot config using `config_key`.
+        env (Literal): Environment name, one of 'DEV', 'QA' or 'PROD' (case-insensitive); defaults to 'DEV'.
+        config_key (str, optional): The key in the viadot config holding relevant credentials. Defaults to None.
+
+    Raises:
+        CredentialError: If no credentials were passed or found in the config.
+        ValueError: If `env` is not one of the supported environments.
+    """
+
+    def __init__(
+        self,
+        *args,
+        domain: str = "test",
+        client_id: str = "viadot",
+        credentials: Dict[str, Any] = None,
+        env: Literal["DEV", "QA", "PROD"] = "DEV",
+        config_key: str = None,
+        **kwargs,
+    ):
+        credentials = credentials or get_source_credentials(config_key)
+
+        # An empty dict is as unusable as None, so reject both.
+        if not credentials:
+            raise CredentialError("Please specify the credentials.")
+
+        super().__init__(*args, credentials=credentials, **kwargs)
+
+        # Every environment is currently configured the same way, so `env`
+        # only needs to be validated.
+        if env.upper() not in ("DEV", "QA", "PROD"):
+            raise ValueError("The only available environments are DEV, QA, and PROD.")
+
+        self.salesforce = SF(
+            username=self.credentials.get("username"),
+            password=self.credentials.get("password"),
+            security_token=self.credentials.get("token"),
+            domain=domain,
+            client_id=client_id,
+        )
+
+    def upsert(
+        self,
+        df: pd.DataFrame,
+        table: str,
+        external_id: str = None,
+        raise_on_error: bool = False,
+    ) -> None:
+        """
+        Upsert the rows of a DataFrame into a Salesforce table, one request per row.
+
+        Args:
+            df (pd.DataFrame): The records to upsert. Must contain the `external_id` column if one is given, and an 'Id' column otherwise.
+            table (str): The name of the Salesforce table to upsert into.
+            external_id (str, optional): The column to match records on instead of 'Id'. Rows with a missing value in this column are skipped. Defaults to None.
+            raise_on_error (bool, optional): Whether to raise on the first failed record instead of logging a warning. Defaults to False.
+
+        Raises:
+            ValueError: If the key column is missing from `df`, or if a record fails to upsert and `raise_on_error` is True.
+        """
+        if df.empty:
+            self.logger.info("No data to upsert.")
+            return
+
+        key_column = external_id or "Id"
+        if key_column not in df.columns:
+            raise ValueError(
+                f"Passed DataFrame does not contain column '{key_column}'."
+            )
+
+        table_to_upsert = getattr(self.salesforce, table)
+        codes = {200: "updated", 201: "created", 204: "updated"}
+        records = df.to_dict("records")
+        n_upserted = 0
+
+        for record in records:
+            # The key is sent as the record ID, not as part of the payload.
+            key_value = record.pop(key_column)
+            if external_id:
+                # pandas represents missing values as NaN/NaT as well as None.
+                if pd.isna(key_value):
+                    continue
+                merge_key = f"{external_id}/{key_value}"
+            else:
+                merge_key = key_value
+
+            try:
+                response = table_to_upsert.upsert(data=record, record_id=merge_key)
+            except SalesforceMalformedRequest as e:
+                msg = f"Upsert of record {merge_key} failed."
+                if raise_on_error:
+                    raise ValueError(msg) from e
+                self.logger.warning(msg)
+                # Already reported; skip the status check below.
+                continue
+
+            if response not in codes:
+                msg = f"Upsert failed for record: \n{record} with response {response}"
+                if raise_on_error:
+                    raise ValueError(msg)
+                self.logger.warning(msg)
+                continue
+
+            n_upserted += 1
+            self.logger.info(f"Successfully {codes[response]} record {merge_key}.")
+
+        self.logger.info(
+            f"Successfully upserted {n_upserted} records into table '{table}'."
+        )
+
+    def bulk_upsert(
+        self,
+        df: pd.DataFrame,
+        table: str,
+        external_id: str = None,
+        batch_size: int = 10000,
+        raise_on_error: bool = False,
+    ) -> None:
+        """
+        Upsert the rows of a DataFrame into a Salesforce table using the bulk API.
+
+        Args:
+            df (pd.DataFrame): The records to upsert.
+            table (str): The name of the Salesforce table to upsert into.
+            external_id (str, optional): The column to match records on. If not provided, 'Id' is used. Defaults to None.
+            batch_size (int, optional): The number of records sent per batch. Defaults to 10000.
+            raise_on_error (bool, optional): Whether to raise if any record fails instead of logging a warning. Defaults to False.
+
+        Raises:
+            ValueError: If `external_id` is missing from `df`, if the whole request is rejected, or if some records fail and `raise_on_error` is True.
+        """
+        if df.empty:
+            self.logger.info("No data to upsert.")
+            return
+
+        if external_id and external_id not in df.columns:
+            raise ValueError(
+                f"Passed DataFrame does not contain column '{external_id}'."
+            )
+
+        records = df.to_dict("records")
+        try:
+            response = getattr(self.salesforce.bulk, table).upsert(
+                data=records,
+                # A bulk upsert always needs a key field; fall back to the record ID.
+                external_id_field=external_id or "Id",
+                batch_size=batch_size,
+            )
+        except SalesforceMalformedRequest as e:
+            # Bulk upsert didn't work at all.
+            raise ValueError(f"Upsert of records failed: {e}") from e
+
+        # The request itself went through, but individual records may have failed.
+        failed_records = [
+            result for result in response if result.get("success") is not True
+        ]
+        if failed_records:
+            msg = f"Upsert failed for records {failed_records} with response {response}"
+            if raise_on_error:
+                raise ValueError(msg)
+            self.logger.warning(msg)
+
+        n_upserted = len(response) - len(failed_records)
+        self.logger.info(
+            f"Successfully upserted {n_upserted} records into table '{table}'."
+        )
+
+    def download(
+        self, query: str = None, table: str = None, columns: List[str] = None
+    ) -> List[OrderedDict]:
+        """
+        Download records from Salesforce using a SOQL query or a table name.
+
+        Args:
+            query (str, optional): The query to run. Takes precedence over `table` and `columns`. Defaults to None.
+            table (str, optional): The table to download when no query is given. Defaults to None.
+            columns (List[str], optional): The columns to select from `table`. If not provided, 'FIELDS(STANDARD)' is selected. Defaults to None.
+
+        Raises:
+            ValueError: If neither `query` nor `table` is provided.
+
+        Returns:
+            List[OrderedDict]: The downloaded records, without their 'attributes' entry.
+        """
+        if not query:
+            if not table:
+                raise ValueError("Please specify either 'query' or 'table'.")
+            columns_str = ", ".join(columns) if columns else "FIELDS(STANDARD)"
+            query = f"SELECT {columns_str} FROM {table}"
+
+        # NOTE(review): `query()` may return only the first batch of a large
+        # result set -- confirm whether `query_all()` is needed here.
+        records = self.salesforce.query(query).get("records") or []
+        # Strip the 'attributes' entry from every record.
+        for record in records:
+            record.pop("attributes", None)
+        return records
+
+    def to_df(
+        self,
+        query: str = None,
+        table: str = None,
+        columns: List[str] = None,
+    ) -> pd.DataFrame:
+        """
+        Download records from Salesforce into a DataFrame.
+
+        Args:
+            query (str, optional): The query to run. Defaults to None.
+            table (str, optional): The table to download when no query is given. Defaults to None.
+            columns (List[str], optional): The columns to select from `table`. Defaults to None.
+
+        Returns:
+            pd.DataFrame: The downloaded records, one row per record.
+        """
+        records = self.download(query=query, table=table, columns=columns)
+
+        return pd.DataFrame(records)