class GoogleFile:
    """Minimal description of a Google Drive file.

    The constructor arguments are stored unchanged as ``Id``, ``Name`` and
    ``MimeType``.
    """

    def __init__(self, id: str, name: str, mimeType: str):
        # NOTE: the first parameter shadows the builtin ``id``; it is kept
        # because callers construct this class with keyword arguments.
        self.Id = id
        self.Name = name
        self.MimeType = mimeType

    def __repr__(self) -> str:
        # Instances are interpolated into log messages; without an explicit
        # repr those messages only show "<... object at 0x...>".
        return (
            f"{type(self).__name__}(id={self.Id!r}, name={self.Name!r}, "
            f"mimeType={self.MimeType!r})"
        )
class GoogleCellAddress:
    """A spreadsheet cell address in A1 notation, split into numeric parts.

    ``column`` is 1-based (A -> 1, Z -> 26, AA -> 27) and ``row`` is the
    number written after the column letters.
    """

    def __init__(self, addr: str):
        """Parse ``addr`` case-insensitively, e.g. ``"B7"`` or ``"aa10"``.

        Raises:
            ValueError: if ``addr`` has no column letters, no row digits, or a
                row smaller than 1.
        """
        text = addr.strip().upper()

        # Length of the leading run of A-Z letters (the column part).
        split = 0
        while split < len(text) and "A" <= text[split] <= "Z":
            split += 1
        columnPart, rowPart = text[:split], text[split:]

        # Reject malformed input up front instead of silently producing
        # column 0 / row 0, which would later render as an invalid reference.
        if not columnPart or not (rowPart.isascii() and rowPart.isdigit()):
            raise ValueError(f"Invalid cell address: {addr!r}")

        self.column = 0
        for letter in columnPart:
            # Bijective base-26: each letter is a digit in the range 1..26.
            self.column = self.column * 26 + (ord(letter) - ord("A") + 1)

        self.row = int(rowPart)
        if self.row < 1:
            raise ValueError(f"Invalid cell address: {addr!r}")

    def ToGoogleCell(self):
        """Return the address rendered back into A1 notation (e.g. ``"AA10"``)."""
        letters = ""
        remaining = self.column
        while remaining > 0:
            # Subtracting 1 first maps the 1..26 digit range onto 0..25 so
            # that 26 becomes "Z" rather than carrying into the next digit.
            remaining, digit = divmod(remaining - 1, 26)
            letters = chr(ord("A") + digit) + letters
        return f"{letters}{self.row}"

    def GetColumn(self):
        """Return the 1-based column number."""
        return self.column

    def GetRow(self):
        """Return the row number."""
        return self.row

    @staticmethod
    def Difference(cell1, cell2):
        """Return ``(column delta, row delta)`` of ``cell1`` minus ``cell2``."""
        return cell1.column - cell2.column, cell1.row - cell2.row

    def __str__(self):
        return f"{{Column: {self.column}, Row: {self.row}.}}"
def GetTableRange(self, tableName: str, cellStart: str, cellEnd: str):
    """Fetch the cells ``tableName!cellStart:cellEnd`` as a GoogleSheetRange.

    Args:
        tableName: title of the sheet (tab) inside the spreadsheet.
        cellStart: top-left cell of the range in A1 notation.
        cellEnd: bottom-right cell of the range in A1 notation.

    Returns:
        A GoogleSheetRange wrapping the returned values; a range whose cells
        are all empty yields a GoogleSheetRange with no value rows.

    Raises:
        ValueError: if ``cellEnd`` lies left of or above ``cellStart``.
        RuntimeError: if the API call returns no response object.
    """
    cellS = GoogleCellAddress(cellStart)
    cellE = GoogleCellAddress(cellEnd)
    colSpan, rowSpan = GoogleCellAddress.Difference(cellE, cellS)
    if colSpan < 0 or rowSpan < 0:
        raise ValueError(f"invalid range {cellS.ToGoogleCell()}:{cellE.ToGoogleCell()}")

    dataRange = f"{tableName}!{cellS.ToGoogleCell()}:{cellE.ToGoogleCell()}"

    data = self.service.spreadsheets().values().get(
        spreadsheetId=self.spreadsheetGoogleID,
        range=dataRange).execute()

    if data is None:
        raise RuntimeError(f"can't get table {dataRange} from spreadsheet {self.spreadsheetGoogleID}")

    # The response carries no "values" key when every requested cell is
    # empty; treat that as an empty matrix instead of failing with KeyError.
    return GoogleSheetRange(values=data.get('values', []), cellStart=cellS, cellEnd=cellE)
class Organization:
    """Plain mutable record holding the data collected for one organisation."""

    def __init__(self):
        # Attribute creation order is kept stable because the instance
        # __dict__ is dumped into the debug log by the parser.
        self.ClubId = None
        for textField in ("ClubType", "Name", "ShortName", "ShortDescription",
                          "Description", "Telegram", "Vk", "PhotoFolderGoogleID"):
            setattr(self, textField, "")
        for unsetField in ("LogoGoogleID", "OsLogoPath", "OsPhotosPath"):
            setattr(self, unsetField, None)
        self.Members, self.Achievements = [], []

    def AddMember(self, member: Member):
        """Append ``member`` to ``Members``."""
        self.Members.append(member)

    def AddAchievement(self, achievement: Achievement):
        """Append ``achievement`` to ``Achievements``."""
        self.Achievements.append(achievement)

    def DeleteMember(self, member: Member):
        """Remove the first occurrence of ``member``; ValueError if absent."""
        self.Members.remove(member)

    def DeleteAchievement(self, achievement: Achievement):
        """Remove the first occurrence of ``achievement``; ValueError if absent."""
        self.Achievements.remove(achievement)
def ParseSpreadsheet(self):
    """Fill ``self.organisation`` from the spreadsheet, then parse members
    and achievements.

    Reads the organisation row and the links sheet at the locations given
    by ``Settings``. A photos-folder or logo URL that cannot be parsed is
    logged as a warning and leaves ``PhotoFolderGoogleID`` unchanged /
    ``LogoGoogleID`` set to ``None`` instead of aborting the whole parse.
    """
    # NOTE: nested f-string quotes always differ from the enclosing quotes;
    # reusing the same quote type is only valid syntax from Python 3.12 on.
    org = self.organisation
    orgTable = Settings["organisations_table"]
    orgRow = Settings["organisation_row"]

    self.logger.info(f"Start loading spreadsheet id={self.spreadsheetGoogleID}.")

    self.logger.info(f"Parsing sheet {orgTable}.")
    orgRange = self.spreadsheet.GetTableRange(
        orgTable,
        f"{Settings['organisation_start_column']}{orgRow}",
        f"{Settings['organisation_end_column']}{orgRow}")
    self.logger.info(f"Parse {orgTable} successful.")

    def orgCell(columnKey):
        # Value of the organisation row in the column configured under columnKey.
        return orgRange[f"{Settings[columnKey]}{orgRow}"]

    org.ClubType = orgCell("club_type_column")
    org.Name = orgCell("name_column")
    org.ShortName = orgCell("short_name_column")
    org.ShortDescription = orgCell("short_description_column")
    org.Description = orgCell("description_column")

    try:
        org.PhotoFolderGoogleID = utils.ParseSharedFolderID(orgCell("photos_url_column"))
    except (ValueError, IndexError) as e:
        self.logger.warning(f"Can't parse organisation photos folder url: {e}")
    try:
        org.LogoGoogleID = utils.ParseSharedFileID(orgCell("logo_url_column"))
    except (ValueError, IndexError) as e:
        # DownloadLogo treats a None id as "no logo".
        org.LogoGoogleID = None
        self.logger.warning(f"Can't parse organisation logo url: {e}")

    urlsTable = Settings["urls_table"]
    urlsColumn = Settings["urls_column"]
    vkRow = Settings["vk_url_row"]
    telegramRow = Settings["telegram_url_row"]

    self.logger.info(f"Parsing sheet {urlsTable}.")
    # min/max keeps the requested range valid whichever row comes first.
    urlRange = self.spreadsheet.GetTableRange(
        urlsTable,
        f"{urlsColumn}{min(vkRow, telegramRow)}",
        f"{urlsColumn}{max(vkRow, telegramRow)}")
    self.logger.info(f"Parse {urlsTable} successful.")

    org.Telegram = urlRange[f"{urlsColumn}{telegramRow}"]
    org.Vk = urlRange[f"{urlsColumn}{vkRow}"]

    self.logger.debug(f"Parsed organisation {self.organisation.__dict__}.")

    self.ParseMembers()
    self.ParseAchievements()
def DownloadLogo(self):
    """Download the organisation logo into the ``logo_dir`` subdirectory.

    On success ``self.organisation.OsLogoPath`` is set to the local path of
    the downloaded file. When there is no logo id, or the Drive lookup or
    download fails, the problem is logged and ``OsLogoPath`` is not touched.
    """
    logoId = self.organisation.LogoGoogleID
    self.logger.info(f"Downloading organisation {self.organisation.Name} logo id={logoId}...")
    self.CreateOrganisationSubdir(Settings["logo_dir"])
    if logoId is None:
        self.logger.info("No logo found.")
        return

    try:
        fileInfo = self.loader.GetFileInfo(logoId)
    except Exception as e:
        self.logger.error(f"Error getting logo file: {e}")
        return
    self.logger.info(f"Logo file: {fileInfo}.")

    logoFileName = os.path.join(self.GetOrganisationSubdir(Settings["logo_dir"]), fileInfo.Name)
    self.logger.info(f"Downloading logo file id={logoId} to {logoFileName}")
    try:
        self.loader.DownloadBlobFile(logoId, logoFileName)
    except Exception as e:
        # Stop here: reporting success or recording the path after a failed
        # download would point OsLogoPath at a missing or partial file.
        self.logger.error(f"Error downloading logo: {e}")
        return
    self.logger.info(f"Logo downloaded to {logoFileName}.")
    self.organisation.OsLogoPath = logoFileName
def DownloadMemberPhotos(self):
    """Download every member's photo into the ``member_photos_dir`` subdirectory.

    Members without a photo id are skipped. For each successful download the
    local path is stored on the member via ``SetOsPhotoPath``; a failed
    lookup or download is logged and that member is left without a path.
    """
    self.logger.info(f"Downloading organisation {self.organisation.Name} members photos...")
    self.CreateOrganisationSubdir(Settings["member_photos_dir"])
    # Same directory for every member, so resolve it once.
    photosDir = self.GetOrganisationSubdir(Settings['member_photos_dir'])

    for member in self.organisation.Members:
        fId = member.GetPhotoGoogleId()
        if fId is None:
            self.logger.info(f"No photo found for member {member.GetName()}.")
            continue

        self.logger.info(f"Downloading member {member.GetName()} photo id={fId}")
        try:
            fileInfo = self.loader.GetFileInfo(fId)
        except Exception as e:
            self.logger.error(f"Error getting member photo file: {e}")
            continue
        self.logger.info(f"Member {member.GetName()} photo file: {fileInfo}.")

        photoFileName = os.path.join(photosDir, fileInfo.Name)
        self.logger.info(f"Downloading member photo id={fId} to {photoFileName}")
        try:
            self.loader.DownloadBlobFile(fId, photoFileName)
        except Exception as e:
            # Skip the success log and path assignment for a failed download.
            self.logger.error(f"Error downloading member photo: {e}")
            continue
        self.logger.info(f"Member {member.GetName()} photo downloaded to {photoFileName}.")
        member.SetOsPhotoPath(photoFileName)
if __name__ == "__main__":
    import sys

    from google.auth.exceptions import RefreshError

    # Configure logging first so authorisation problems are recorded too.
    logging.basicConfig(level=logging.DEBUG, filename="info.log", filemode="w")
    parserLogger = logging.getLogger("OrganizationParser")
    parserLogger.setLevel(logging.INFO)

    # An optional first command-line argument overrides the sample spreadsheet.
    spreadsheetId = sys.argv[1] if len(sys.argv) > 1 else SAMPLE_SPREADSHEET_ID

    creds = None
    # The file token.json stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the
    # first time.
    if os.path.exists("token.json"):
        creds = Credentials.from_authorized_user_file("token.json", SCOPES)

    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        refreshed = False
        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
                refreshed = True
            except RefreshError as e:
                # Fall back to the interactive flow below instead of crashing.
                parserLogger.warning(f"Token refresh failed, re-authorizing: {e}")
        if not refreshed:
            flow = InstalledAppFlow.from_client_secrets_file("creds.json", SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run.
        with open("token.json", "w") as token:
            token.write(creds.to_json())

    org = OrganizationParser(spreadsheetId, creds, parserLogger)
    org.ParseSpreadsheet()
    org.DownloadLogo()
    org.DownloadOrganizationPhotos()
    org.DownloadMemberPhotos()
def _ParseDriveId(url: str, marker: str, offset: int, error: str) -> str:
    """Return the path segment ``offset`` places after ``marker`` in ``url``.

    Raises ValueError(error) when no ``marker`` segment is followed by a
    non-empty segment at that distance.
    """
    # Drop fragment and query string so ".../folders/<id>?usp=sharing"
    # yields "<id>" rather than "<id>?usp=sharing".
    path = (url or "").split('#', 1)[0].split('?', 1)[0]
    segments = path.split('/')
    for index, segment in enumerate(segments):
        if segment != marker:
            continue
        target = index + offset
        if target < len(segments) and segments[target]:
            return segments[target]
    raise ValueError(error)


def ParseSharedFolderID(url: str) -> str:
    """Parses the Shared Folder ID from the Google Drive URL.

    The id is the path segment directly after ``folders``.

    Raises:
        ValueError: if the URL has no ``folders`` segment followed by an id.
    """
    return _ParseDriveId(
        url, 'folders', 1,
        'Could not parse Shared Folder ID: no folders found')


def ParseSharedFileID(url: str) -> str:
    """Parses the Shared File ID from the Google Drive URL.

    The id is the second path segment after ``file`` (``.../file/d/<id>``).

    Raises:
        ValueError: if the URL has no ``file`` segment followed by an id.
    """
    return _ParseDriveId(
        url, 'file', 2,
        'Could not parse Shared File ID: no files found')