From 4c84f0bc6dfdc25857ddf98cd3ff4aef79c21743 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 14 Aug 2023 07:21:14 -0400 Subject: [PATCH 001/101] First cut at tools for parsing workbooks. --- CHANGELOG.rst | 10 ++ dcicutils/sheet_utils.py | 188 ++++++++++++++++++++++++++++++ pyproject.toml | 12 +- test/data_files/sample_items.xlsx | Bin 0 -> 9901 bytes test/test_sheet_utils.py | 179 ++++++++++++++++++++++++++++ 5 files changed, 384 insertions(+), 5 deletions(-) create mode 100644 dcicutils/sheet_utils.py create mode 100644 test/data_files/sample_items.xlsx create mode 100644 test/test_sheet_utils.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index fe2b0b147..61f334d68 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,16 @@ Change Log ---------- +7.8.0 +===== + +* New module ``sheet_utils`` for loading workbooks. + + * class ``WorkbookManager`` for loading raw data + + * class ``ItemManager`` for loading item data + + 7.7.2 ===== diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py new file mode 100644 index 000000000..c23129ffe --- /dev/null +++ b/dcicutils/sheet_utils.py @@ -0,0 +1,188 @@ +import copy + +from openpyxl import load_workbook +from openpyxl.worksheet.worksheet import Worksheet +from openpyxl.workbook.workbook import Workbook +from typing import Any, Dict, List, Optional, Union + + +class WorkbookManager: + + @classmethod + def load_workbook(cls, filename: str): + wb = cls(filename) + return wb.load_content() + + def __init__(self, filename: str): + self.filename: str = filename + self.workbook: Optional[Workbook] = None + self.headers_by_sheetname: Dict[List[str]] = {} + self.content_by_sheetname: Dict[List[Any]] = {} + + def sheet_headers(self, sheet: Worksheet) -> List[str]: + return self.headers_by_sheetname[sheet.title] + + def sheet_content(self, sheet: Worksheet) -> List[Any]: + return self.content_by_sheetname[sheet.title] + + @classmethod + def all_rows(cls, sheet: Worksheet): + row_max = sheet.max_row + for row in range(2, row_max + 1): + yield row + + @classmethod + def all_cols(cls, sheet: Worksheet): + col_max = sheet.max_column + for col in range(1, col_max + 1): + yield col + + def load_headers(self, sheet: Worksheet): + headers: List[str] = [str(sheet.cell(row=1, column=col).value) + for col in self.all_cols(sheet)] + self.headers_by_sheetname[sheet.title] = headers + + def load_content(self): + workbook: Workbook = load_workbook(self.filename) + self.workbook = workbook + for sheetname in workbook.sheetnames: + sheet: Worksheet = workbook[sheetname] + self.load_headers(sheet) + content = [] + for row in self.all_rows(sheet): + row_dict = self.load_row(sheet=sheet, row=row) + content.append(row_dict) + self.content_by_sheetname[sheetname] = content + return self.content_by_sheetname + + def load_row(self, *, sheet: Worksheet, row: int): + headers = self.sheet_headers(sheet) + row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, column=col).value + for col in self.all_cols(sheet)} + return row_dict + + +class ItemManager(WorkbookManager): + + def __init__(self, filename: str): + super().__init__(filename=filename) + self.patch_prototypes_by_sheetname: Dict[Dict] = {} + self.parsed_headers_by_sheetname: Dict[List[List[Union[int, str]]]] = {} + + def sheet_patch_prototype(self, sheet: Worksheet) -> Dict: + return self.patch_prototypes_by_sheetname[sheet.title] + + def sheet_parsed_headers(self, sheet: Worksheet) -> List[List[Union[int, str]]]: + return self.parsed_headers_by_sheetname[sheet.title] + + def load_headers(self, 
sheet: Worksheet): + super().load_headers(sheet) + self.compile_sheet_headers(sheet) + + def compile_sheet_headers(self, sheet: Worksheet): + headers = self.headers_by_sheetname[sheet.title] + parsed_headers = self.parse_sheet_headers(headers) + self.parsed_headers_by_sheetname[sheet.title] = parsed_headers + prototype = self.compute_patch_prototype(parsed_headers) + self.patch_prototypes_by_sheetname[sheet.title] = prototype + + @classmethod + def compute_patch_prototype(cls, parsed_headers): + prototype = {} + for parsed_header in parsed_headers: + parsed_header0 = parsed_header[0] + if isinstance(parsed_header0, int): + raise ValueError(f"A header cannot begin with a numeric ref: {parsed_header0}") + cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header) + return prototype + + @classmethod + def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: List[Union[int, str]]): + [key0, *more_keys] = keys + key1 = more_keys[0] if more_keys else None + if isinstance(key1, int): + placeholder = [] + elif isinstance(key1, str): + placeholder = {} + else: + placeholder = None + if isinstance(key0, int): + n = len(parent) + if key0 == n: + parent.append(placeholder) + elif key0 > n: + raise Exception("Numeric items must occur sequentially.") + elif isinstance(key0, str): + if key0 not in parent: + parent[key0] = placeholder + if key1 is not None: + cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys) + return parent + + @classmethod + def parse_sheet_headers(cls, headers): + return [cls.parse_sheet_header(header) + for header in headers] + + @classmethod + def parse_sheet_header(cls, header) -> List[Union[int, str]]: + result = [] + token = "" + for i in range(len(header)): + ch = header[i] + if ch == '.' or ch == '#': + if token: + result.append(int(token) if token.isdigit() else token) + token = "" + else: + token += ch + if token: + result.append(int(token) if token.isdigit() else token) + return result + + def load_row(self, *, sheet: Worksheet, row: int): + parsed_headers = self.sheet_parsed_headers(sheet) + patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet)) + for col in self.all_cols(sheet): + value = sheet.cell(row=row, column=col).value + parsed_value = self.parse_value(value) + self.set_path_value(patch_item, parsed_headers[col - 1], parsed_value) + return patch_item + + @classmethod + def set_path_value(cls, datum, path, value, force=False): + if (value is None or value == '') and not force: + return + [key, *more_path] = path + if not more_path: + datum[key] = value + else: + cls.set_path_value(datum[key], more_path, value) + + @classmethod + def parse_value(cls, value): + if isinstance(value, str): + lvalue = value.lower() + # TODO: We could consult a schema to make this less heuristic, but this may do for now + if lvalue == 'true': + return True + elif lvalue == 'false': + return False + elif lvalue == 'null' or lvalue == '': + return None + elif '|' in value: + return [cls.parse_value(subvalue) for subvalue in value.split('|')] + else: + ch0 = value[0] + if ch0 == '+' or ch0 == '-' or ch0.isdigit(): + try: + return int(value) + except Exception: + pass + try: + return float(value) + except Exception: + pass + return value + else: # probably a number + return value diff --git a/pyproject.toml b/pyproject.toml index 7c56d6b7e..647c13fe0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ classifiers = [ [tool.poetry.dependencies] python = ">=3.7,<3.10" + boto3 = "^1.17.39" botocore = "^1.20.39" # The DCIC 
portals (cgap-portal and fourfront) are very particular about which ElasticSearch version. @@ -45,20 +46,21 @@ elasticsearch = "7.13.4" aws-requests-auth = ">=0.4.2,<1" docker = "^4.4.4" gitpython = "^3.1.2" +openpyxl = "^3.1.2" +opensearch-py = "^2.0.1" +pyOpenSSL = "^23.1.1" +PyJWT = "^2.6.0" pytz = ">=2020.4" PyYAML = ">=5.1,<5.5" +redis = "^4.5.1" requests = "^2.21.0" rfc3986 = "^1.4.0" structlog = "^19.2.0" toml = ">=0.10.1,<1" +tqdm = "^4.65.0" typing-extensions = ">=3.8" # Fourfront uses 3.8 urllib3 = "^1.26.6" webtest = "^2.0.34" -opensearch-py = "^2.0.1" -redis = "^4.5.1" -pyOpenSSL = "^23.1.1" -PyJWT = "^2.6.0" -tqdm = "^4.65.0" [tool.poetry.dev-dependencies] diff --git a/test/data_files/sample_items.xlsx b/test/data_files/sample_items.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..19ca2acc8ddc54c0e84ee2f3fc84f9dd2e92f8e5 GIT binary patch literal 9901 zcmeHNRajifwr)ajch|-xxLfex9^7f%-GT;(0FAo@cXtVH2^vCh4;mx{x7*2_xigcQ z`*@$u)Wfc?zujw9?Y;l?FDqp^=%?5KSO7c#0H6S1;;$^5K>+~xFaQ7+0RD-txP!eb z$lldZ-OCZ=qW{d(&Xzp?=@YtKz!S*-{~iCuZ=gSM!oHUkdGI*l0#&0*jcAQk0V|~M zftaVD7H*hO95_uCi4xqLLu8a*g!=mVLZJ%L?X91aDcaD%pgCi>We1~V_ihBT-gcwY znQDhp&Y3_w1$}YSIjU?e6!zxS`K)6bLI8n<0F$Hw_EBMfS*VopIyCP}+Vy9;jXB;F zQ4J(6IKJI;d(bvX;M+_c6gEQPUU_C_kdnCr2|3QB#f$f3tC?!tyIXP4v9#pFChU$j%G`vu4>p*6<5r6OAHrX@yKCk%#` z_V4t&8(mx$jM*QixY*#Vh{D7c05*A4hNj#&xxq2eI44UvRjl`8xX+%?UZhFOdeOOe z#xj(*l;p{ctOBKGP9)seP>st!d`T-A|av z@SjUA+Cvcu=aWC2PQw{?F|nAh@*TFNIKL*;P_yK-sxirS=BM;BG_!qoCXvyNb??ip zm^rLU$%%QzJ|R6soqO)D)4+K;mgUvYgV0w#d@>RgMOLx`@l$^{64}xs;~U5uk%k5U z2m$a=p0+H%*@=gPvyF*^gUt^g_PfnMK^z%G%YXOi0DrLVXGMN}9MTm$)#(I&P#)#? zp%3Y7T7+Y*#SBxhH7ynKx=l|!;;t!3nS4vddB*nM{Fd+g1_{t@g`TdC_-Wfh#&{#5 z!KISsC3$!wN82STPV|eK%4aOx6Q6AqmCV|oQdwe16_d_1JcCcIJ)Q^en@*|o&B|sX9KuXAc!(10+ zb!UL9{RGAAFwTSC~0Sd*$)uJx(f9mjhU_GX*jxLJ%3NJHwv{DmC4EYvRb z49Kys_HSw#IX{S8IKO<{NBSq^s5pNah=rgA8UX-6gFpg;oIhixLc`ACEid|m-}F6P zzlXKFClZ6UC_ul^U88XR;z(^fK&=AsCE7|=<95|UUTnc$u~_dizkiS;VQ9iLn=QrA z-K|haZHjH>HGdbYTMRcJi`E(T{%UgC`hgCuj9OeI59dci%VPhu`-jEtw7O zzdrTVzcR*xwOK;Yqt%^d!t2-G`3{@e8fybZVa5FEB-_4>4M9~hh9lJQUa!NR&t>VniF|4-kyMaR0Uq}PiiU-C z_-VL6UvzBcI@*ao9;M|@YUQ#F<4A$w%x0G{wXHi##7JHpkb5;iJz-AZvQoYc@*XK0 z7d`HmQ50Ag8BW+@uy`tS9Pu^BUl_1P8>AA{tZI4Uo1`aBa5T(;4I^tv?0q4db+c7F zA{jtts5UXiPTL!bqrlPk;qYTP6AN2Mzd{H65Y{3`w>Wz*inF4(HtBw4uK053Ms|z+ zu>E%=G{r^o+}eoCOOJ2)i_`1>UtRcp#o?&)IH$Zb++x{RV^P>q&H$vk5Yna|n1Jz3V&pI4=10g@w&_V5NDSFtfSmxhZ{{&v^dm$t&QiTNoP=2#r7gsM^kjoFYFVL{FS!6|j5dQ3s zeseVHzR85pG&cQFd3P|EKa+?maAMux@ZBtrwCBy?$g~# zhoNX%Vwd@vWS<(u(#Q?w|D29e8)Mv1_Yj9oh1y3SpU5ak&SZ8iLJm&fX+n@?CfdJB zM>@I?!j-DiTXIO`c4ZE*##6A2XtT67`Iu%3Ub&@=-71`VId_EU7*S6VE|%dJc2LKyEz-lcG)Cu!I~FU7O| z2#rI80HBhn@K7{^Q84PoOjqD{L>jw*z0C)VPBvGvfstLn9@dvVoeV^p+ojmIj7l)& zBxE zTZ&P@xx6FdCiRczHtHt8bvvgPCVyjT7$;mPZE%e2e=(==Zi|?FvTeK9=^U;%K-+Bx z_q74kPPM^mrW)?9#DMhz;-%|N>U#ZsghofwDg8dus@0sh*_Ew`XJLNnL=XR^k#UES zWaT=+gIDt%tKl+^6Dv|my^-?=uu(RXrYL)>e$~qX!RWq4!?$p2D0AKZ)!(q6edi66*v1PEUixRf=RjGhS zBaAr_aS~G}Yo6amdYwj|YD!nWL5%4n$FzM^eagomI_?j9?eLFy?!iOvQiKBl2FZWq z;Xi$mt0l+|#PajX_5;<2+8?5bc(A+iZbXn>JnpzQW2slxCTtUysm!txi5r^^RbFzi zgFA?@o>23htI`z|rieKRB~CwwM(O+>i$K+|M>a+K;WMyun<~RrR>vJBO8TL*xcD;L z&-={vbhj4z=xYhvyks&NzC{My0r=4PmUrg7fvYspLoa%PIk-c_B*81GxQvCWmH0sLl|hh+ zvd=4a3g971lB~Z^X@}A}Tf7)bG~KFZSuJ&5?Pgz{Sa`^;>JbUW!Z+h=HUZKNpXp0F zOiVw^YwG(Qyk-{Msh2bo+pyZ=_PLMjt@1{`T2n6H_Ij+R$>D=_dgVjJrA&B=W=P;UNiqbwog+!FdMq|w@=bpgk$clQm}3UaV$CzyUh zB@U(hP*qJ{i;`zsC;Et87!dG8l}ct~WB^%d627h_ijHrsf023y9@8Ndlg6k;ogPJ2 zZ-y9@9h_2cq7dLWyMcvx@bDpJVL0mmG}^KQYiH-|uy+ 
z`P#cktThPJA=qs*-$r)YO7$kKTB;Ui$u!y>--CaxEPd?c#4%fbIpM4|qCH|V-sb3; zYz5$Qha*>WLv=o;2J{6b5;LNRw`W?dyH-4!q`X}`e0xraeM7z`;`N5L?5g`pY-WXg&49*`eX#n4mu?S^Sw2%aROORk<09#bGTUNz z_V1CVsbZm$@Q$QEyD?>YE%${#?3_@v8xeUE^_En>pBpUaygS_*h0hT$mSP5G(0}~N-w09n(q#jT%urRT@r?-jx8FZi> ziNJbX{EUL88wpv4tQ(mQWr7|rg0+~0RXhdk)7h)%o zb;Lo-?=mCh&(%FSXmU*p!rT~fawaQp70*0lV`daBS}WSpOOar}*PirxG2$t*MY}pe z$19Z|5Bzr3J+xC`>E>tfjpfaWRNU}Y3{l0f4J=+8%%^EhxoS-*@2BMHNI7jIvfAO` zcy3>;Np^0Bi7tqSniD7PnBo*Sq;*7BA&#d>c_(m`ra>v2uyByax~k2$IeA&TS(SB4 z7BB;&UF&05r&)vW)|mu8B;RTewxxxAWLIi6X(e*z*pETOu#u#sZ3(2IQJd@+u)l7! zhHqv7a@t&D?c(>%dBU~=KkDM;tF^cj$Yk4Jc^D@X;MIjmB=YMeD2~Fi#i#lt_A?tY z`zN7rP7&wZF=_^&3+rsAKW8M%a6)$#81`iE0a}GFP|Xwf zc9O-!c0%*-@L&{ZX|XWEy@?2j3U=QAz!sQMgA>1f?Xh7rYB|>tIn9^&2FM?!?4F-$ zmfg!0QT2XGiIEInX+FIt@8FqWY8muJr_wSH+3ctbtkoBbq5+2ekFE)OVFFty<=@0V z)-e>?-zzdZSJ^ykMa7);lI>Xk_$mszhm|?!J-dQ-hhkr3eZ9(J0|yg4veHNHD?2wS zUZK+}T9M(b_d0IRnGR&Lu5Lz`I%dt`JOTy19sPNWRd@Fq+Ck-0@$I-Ph>tyD^}x8O5vUq0SuKK(UDBjC8Q^bNNoj?@Sg;{&$10KXi=Vo1aYif|+TkR6KBw2xyW}^GtdvZoo zqiy-MI{e=H(AQ6U_73kfHeA$2(X-`wwyRM5o|1afs@`8oac!I=pPv5VyB zp~p!#Wau7YSHM!c0d2c1tz}(}#$}G4O?;SWq%Ap(N1vfLV14=&OKNoJaEmV?DFZVy zjLv|Af;~h+#iGRLwDQdDcB20G6+wxZJ6@VDAbz@C0^(OXeh&@2gzYZO0kki6^+a`# zZ!_6wbtab-685-##V!i2KRS1JPv}A2 zwZSKtd)+~er7C>w5dsMA!k!5X+Px7*65Ib$BDw)2y(dV?8E^5}P6Qd1B> zV*|T6tC(+&^yOg%?4##Vh*(n1BcH+{_8TGhI`KY7ki}`p%zOJ7Esu;bY~O!*nq&|I z`(Qs3DnJ+Goz&=XTD&G*nz!z=?Vq~I6Sw$jbDuZ_RdM9t3^DV0lnH$T{LCFBf_i-X z8DJ%X^$~kn(O*|%UI2=&PYVm6>{u{O-W4+_gnRomtDCf`*Eo#$LJQPaM9iU2uYv2$ z+k`%U3wq35HS?&tEos2QC2gypRgaW&DzCaBwYAC_fPG*?lb z#t>~9geiU$&IUL7G7gYotu#BUe0b~O(a7hjLcP~sB9gvOB0i;yt9bh<8n$FUo+;Dc zVGWY6{AZJO^e^O<&J~zsC*Cu@9F|dFtkqzj58#3j{BFt)9th8(peXtCkFwU^^Q4S``X*e+t#t^vv3~=$ zi>0wM$V|=E*~;GHXS(DI?vd?hg@8KfSb1ZZpmb#`*KXh$DbL)Z?_1YHXC7&D zgA4AYl#|#2lR;x*(GheL?>P_oAMbBVWc+h{ZjODLLe;TZ5WbF8NFq z8**DSQc3ymrt!;Lhcw%btD4^(oYGcm5r=XVwI1%K#Vz{a7g-eqOowNMj^|L|Vqfn5 z;0L^P+)D^QEb9N{2X$2$f5N;(%lK-ZtlAD$txGo9R&wbmi)c;fc!FDp#0Kn7yfaq_ z;Q9G8yK90md_2u*yL%7g{6y`~W5!U1DyO$hk~55qej(JyuRm1P7`NGWCAf(`77~y; zbs&#!i{lE)tG(Ca%Yw1u)@Rn7XTR1HE>7R#I}ewuC(bk8hgrt!;?Cc0)Pu0_wHw$Y;0TPd*-sT4 zh?_`h-jJR0rHXKDI(|#-H^v-LlkFe(ddp({b|U@+QhE5DOcHcagILieo5k7yd6K8` z3lj8b%P^vt-5<4r+I*0xCk6cPpe&v(WztVCWfaY7=+a;N`2}6ZJK?5kp0i%;zXRg> zUbmH@HRssNFAFXJqh1ElYhTvW3fSi0`>d8qgl2(_h3e`jnub+#zQ$SQe2oVQZ({Tk zlS^v>u;Jwb*m{xParH93`|wnp(S@BM8NabFK}9j?VncxXMISc1M5B8eN^PcS=({Bl z=C`aT>}FispJPfouLwlB-Q5U;yAeAE5FAM23QSrg=l)_bTm;&J3BbKu$VeH zgMOG4q}BCr*&gySqQNS1eXQ6at1yq^BR(nSB>duwfuq^Pi1u_{AzSW9+%hux%MYDa z?X#oUzVhR(hU2YNx`!G#uY%mBcZuU=f+#1K;7gnN7AjIsi6zH5n6*HhTe|>M=E$+A z=($89B&AOujXIy=!GPTp=lB-94Ar7ZRs&7!F^T!Z3u_~<9UFA@I3C8E0*YmvLvTE4 zQY~6w2G5^MNEfU@*FDr|z@Zb1!yq{cG)d<24DgNf1&V#8^l$FJ8PtNQ+g_OFfBrd0XUB*-0Z2*acQ8H2`-j{ias zg1!HY%mh(~MONgHW9SPq;((JYk$*c)E{{-^q8{cEz89erY%as7=CH!QWY^G}Uwy%@=fo~aWJK)uF z@v-Je9vv+A6egruZq;7A(^-B!sRfC3GEGFU|U?wwS+NfY3HEb5VU`DbFh^ zS&3iMd8I5X5SC{TUSCwaWNYWS5c&6m#h{o1jiCcnl-tz!`z+iR%y|~_!yd#b(S|i% zsEejrf_)lv>HC=>%`%L%MEOuB-b9zJjm!MBE@dZ-j~xO}O=ot*8_Ow7(wNn!fdS_;h01O{gLjGya@cEJ3Dl?R=>G!*QbGV&G}QB{XYN00Uu?#zbg3av4h_Qf6Q5sCI55?;a9=GwqpM**b6Ds|KE+-U-kUjnEON1 z9pu!~FKxPCg@5e~{2|w?MzecNnXrLqd$9?>aU4PZ`SI_*14gesC6ae^}pZ-<+ jucZ2Caaf8!iT{UUmE~X{82NF|1qm<+2?eHrKYslW&fXYe literal 0 HcmV?d00001 diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py new file mode 100644 index 000000000..714ade0eb --- /dev/null +++ b/test/test_sheet_utils.py @@ -0,0 +1,179 @@ +import os +import pytest + +from dcicutils.sheet_utils import WorkbookManager, ItemManager +from .conftest_settings import TEST_DIR + + +def 
test_item_manager_parse_sheet_header(): + assert ItemManager.parse_sheet_header('.a') == ['a'] + assert ItemManager.parse_sheet_header('a') == ['a'] + assert ItemManager.parse_sheet_header('#0') == [0] + assert ItemManager.parse_sheet_header('0') == [0] + assert ItemManager.parse_sheet_header('foo.bar') == ['foo', 'bar'] + assert ItemManager.parse_sheet_header('a.b#0') == ['a', 'b', 0] + assert ItemManager.parse_sheet_header('x.xx#17#8.z') == ['x', 'xx', 17, 8, 'z'] + + # We don't error-check this, but it shouldn't matter + assert ItemManager.parse_sheet_header('#abc') == ['abc'] + assert ItemManager.parse_sheet_header('.123') == [123] + assert ItemManager.parse_sheet_header('#abc.123#456.def') == ['abc', 123, 456, 'def'] + + +def test_item_manager_parse_sheet_headers(): + input = ['a.b', 'a.c', 'a.d#1', 'a.d#2'] + expected = [['a', 'b'], ['a', 'c'], ['a', 'd', 1], ['a', 'd', 2]] + assert ItemManager.parse_sheet_headers(input) == expected + + +@pytest.mark.parametrize('parsed_headers,expected_prototype', [ + (['a'], + {'a': None}), + (['a', 'b'], + {'a': None, 'b': None}), + (['a.b', 'a.c', 'a.d#0', 'a.d#1'], + {'a': {'b': None, 'c': None, 'd': [None, None]}}), + (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar'], + {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}]}}), + (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar', 'a.d#1.foo', 'a.d#1.bar'], + {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}, {'foo': None, 'bar': None}]}}), +]) +def test_item_manager_compute_patch_prototype(parsed_headers, expected_prototype): + parsed_headers = ItemManager.parse_sheet_headers(parsed_headers) + assert ItemManager.compute_patch_prototype(parsed_headers) == expected_prototype + + +@pytest.mark.parametrize('headers', [['0'], ['x', '0.y']]) +def test_item_manager_compute_patch_prototype_errors(headers): + + parsed_headers = ItemManager.parse_sheet_headers(headers) + with pytest.raises(ValueError) as exc: + ItemManager.compute_patch_prototype(parsed_headers) + assert str(exc.value) == "A header cannot begin with a numeric ref: 0" + + +def test_item_manager_set_path_value(): + + x = {'foo': 1, 'bar': 2} + ItemManager.set_path_value(x, ['foo'], 3) + assert x == {'foo': 3, 'bar': 2} + + x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} + ItemManager.set_path_value(x, ['foo', 1], 17) + assert x == {'foo': [11, 17, 33], 'bar': {'x': 'xx', 'y': 'yy'}} + + x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} + ItemManager.set_path_value(x, ['bar', 'x'], 'something') + assert x == {'foo': [11, 22, 33], 'bar': {'x': 'something', 'y': 'yy'}} + + +SAMPLE_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx') + +SAMPLE_FILE_RAW_CONTENT = { + "Sheet1": [ + {"x": 1, "y.a": 1, "y.z": 1}, + {"x": 1, "y.a": 2, "y.z": 3}, + {"x": "alpha", "y.a": "beta", "y.z": "gamma|delta"}, + ], + "Sheet2": [ + { + "name": "bill", "age": 23, + "mother.name": "mary", "mother.age": 58, + "father.name": "fred", "father.age": 63, + "friends#0.name": "sam", "friends#0.age": 22, + "friends#1.name": "arthur", "friends#1.age": 19, + }, + { + "name": "joe", "age": 9, + "mother.name": "estrella", "mother.age": 35, + "father.name": "anthony", "father.age": 34, + "friends#0.name": "anders", "friends#0.age": 9, + "friends#1.name": None, "friends#1.age": None, + }, + ] +} + +SAMPLE_FILE_ITEM_CONTENT = { + "Sheet1": [ + {"x": 1, "y": {"a": 1, "z": 1}}, + {"x": 1, "y": {"a": 2, "z": 3}}, + {"x": "alpha", "y": {"a": "beta", "z": ["gamma", "delta"]}}, + ], + "Sheet2": [ + { + "name": "bill", "age": 23, + "mother": {"name": 
"mary", "age": 58}, + "father": {"name": "fred", "age": 63}, + "friends": [ + {"name": "sam", "age": 22}, + {"name": "arthur", "age": 19}, + ] + }, + { + "name": "joe", "age": 9, + "mother": {"name": "estrella", "age": 35}, + "father": {"name": "anthony", "age": 34}, + "friends": [ + {"name": "anders", "age": 9}, + {"name": None, "age": None} + ] + }, + ], +} + + +def test_workbook_manager_load_content(): + + wt = WorkbookManager(SAMPLE_FILE) + assert wt.load_content() == SAMPLE_FILE_RAW_CONTENT + + +def test_workbook_manager_load_workbook(): + + assert WorkbookManager.load_workbook(SAMPLE_FILE) == SAMPLE_FILE_RAW_CONTENT + + +def test_item_manager_parse_value(): + + for x in [37, 19.3, True, False, None, 'simple text']: + assert ItemManager.parse_value(x) == x + + assert ItemManager.parse_value('3') == 3 + assert ItemManager.parse_value('+3') == 3 + assert ItemManager.parse_value('-3') == -3 + + assert ItemManager.parse_value('3.5') == 3.5 + assert ItemManager.parse_value('+3.5') == 3.5 + assert ItemManager.parse_value('-3.5') == -3.5 + + assert ItemManager.parse_value('3.5e1') == 35.0 + assert ItemManager.parse_value('+3.5e1') == 35.0 + assert ItemManager.parse_value('-3.5e1') == -35.0 + + assert ItemManager.parse_value('') is None + + assert ItemManager.parse_value('null') is None + assert ItemManager.parse_value('Null') is None + assert ItemManager.parse_value('NULL') is None + + assert ItemManager.parse_value('true') is True + assert ItemManager.parse_value('True') is True + assert ItemManager.parse_value('TRUE') is True + + assert ItemManager.parse_value('false') is False + assert ItemManager.parse_value('False') is False + assert ItemManager.parse_value('FALSE') is False + + assert ItemManager.parse_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] + assert ItemManager.parse_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] + + +def test_item_manager_load_content(): + + it = ItemManager(SAMPLE_FILE) + assert it.load_content() == SAMPLE_FILE_ITEM_CONTENT + + +def test_item_manager_load_workbook(): + + assert ItemManager.load_workbook(SAMPLE_FILE) == SAMPLE_FILE_ITEM_CONTENT From 7b73a67313ebf52eaebc3f119d42d232d35bc530 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 14 Aug 2023 08:30:01 -0400 Subject: [PATCH 002/101] Refactor to separate some functionality into a separate sevice class. 
--- dcicutils/sheet_utils.py | 67 +++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index c23129ffe..9633999d8 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -62,29 +62,7 @@ def load_row(self, *, sheet: Worksheet, row: int): return row_dict -class ItemManager(WorkbookManager): - - def __init__(self, filename: str): - super().__init__(filename=filename) - self.patch_prototypes_by_sheetname: Dict[Dict] = {} - self.parsed_headers_by_sheetname: Dict[List[List[Union[int, str]]]] = {} - - def sheet_patch_prototype(self, sheet: Worksheet) -> Dict: - return self.patch_prototypes_by_sheetname[sheet.title] - - def sheet_parsed_headers(self, sheet: Worksheet) -> List[List[Union[int, str]]]: - return self.parsed_headers_by_sheetname[sheet.title] - - def load_headers(self, sheet: Worksheet): - super().load_headers(sheet) - self.compile_sheet_headers(sheet) - - def compile_sheet_headers(self, sheet: Worksheet): - headers = self.headers_by_sheetname[sheet.title] - parsed_headers = self.parse_sheet_headers(headers) - self.parsed_headers_by_sheetname[sheet.title] = parsed_headers - prototype = self.compute_patch_prototype(parsed_headers) - self.patch_prototypes_by_sheetname[sheet.title] = prototype +class ItemTools: @classmethod def compute_patch_prototype(cls, parsed_headers): @@ -140,15 +118,6 @@ def parse_sheet_header(cls, header) -> List[Union[int, str]]: result.append(int(token) if token.isdigit() else token) return result - def load_row(self, *, sheet: Worksheet, row: int): - parsed_headers = self.sheet_parsed_headers(sheet) - patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet)) - for col in self.all_cols(sheet): - value = sheet.cell(row=row, column=col).value - parsed_value = self.parse_value(value) - self.set_path_value(patch_item, parsed_headers[col - 1], parsed_value) - return patch_item - @classmethod def set_path_value(cls, datum, path, value, force=False): if (value is None or value == '') and not force: @@ -186,3 +155,37 @@ def parse_value(cls, value): return value else: # probably a number return value + + +class ItemManager(ItemTools, WorkbookManager): + + def __init__(self, filename: str): + super().__init__(filename=filename) + self.patch_prototypes_by_sheetname: Dict[Dict] = {} + self.parsed_headers_by_sheetname: Dict[List[List[Union[int, str]]]] = {} + + def sheet_patch_prototype(self, sheet: Worksheet) -> Dict: + return self.patch_prototypes_by_sheetname[sheet.title] + + def sheet_parsed_headers(self, sheet: Worksheet) -> List[List[Union[int, str]]]: + return self.parsed_headers_by_sheetname[sheet.title] + + def load_headers(self, sheet: Worksheet): + super().load_headers(sheet) + self.compile_sheet_headers(sheet) + + def compile_sheet_headers(self, sheet: Worksheet): + headers = self.headers_by_sheetname[sheet.title] + parsed_headers = self.parse_sheet_headers(headers) + self.parsed_headers_by_sheetname[sheet.title] = parsed_headers + prototype = self.compute_patch_prototype(parsed_headers) + self.patch_prototypes_by_sheetname[sheet.title] = prototype + + def load_row(self, *, sheet: Worksheet, row: int): + parsed_headers = self.sheet_parsed_headers(sheet) + patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet)) + for col in self.all_cols(sheet): + value = sheet.cell(row=row, column=col).value + parsed_value = self.parse_value(value) + self.set_path_value(patch_item, parsed_headers[col - 1], parsed_value) + return patch_item From 
3d4573fc089694d9ab34145b0481f9414f57e363 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 14 Aug 2023 08:32:34 -0400 Subject: [PATCH 003/101] Add a csv file for testing. --- test/data_files/sample_items_sheet2.csv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 test/data_files/sample_items_sheet2.csv diff --git a/test/data_files/sample_items_sheet2.csv b/test/data_files/sample_items_sheet2.csv new file mode 100644 index 000000000..b1d3ec2da --- /dev/null +++ b/test/data_files/sample_items_sheet2.csv @@ -0,0 +1,3 @@ +name,age,mother.name,mother.age,father.name,father.age,friends#0.name,friends#0.age,friends#1.name,friends#1.age +bill,23,mary,58,fred,63,sam,22,arthur,19 +joe,9,estrella,35,anthony,34,anders,9,, \ No newline at end of file From f4e5cfa4f605168a453e7c601574c38be9854e97 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 14 Aug 2023 08:38:17 -0400 Subject: [PATCH 004/101] Add some negative testing. --- test/test_sheet_utils.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 714ade0eb..32dffb25b 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -67,9 +67,9 @@ def test_item_manager_set_path_value(): assert x == {'foo': [11, 22, 33], 'bar': {'x': 'something', 'y': 'yy'}} -SAMPLE_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx') +SAMPLE_XLSX_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx') -SAMPLE_FILE_RAW_CONTENT = { +SAMPLE_XLSX_FILE_RAW_CONTENT = { "Sheet1": [ {"x": 1, "y.a": 1, "y.z": 1}, {"x": 1, "y.a": 2, "y.z": 3}, @@ -93,7 +93,7 @@ def test_item_manager_set_path_value(): ] } -SAMPLE_FILE_ITEM_CONTENT = { +SAMPLE_XLSX_FILE_ITEM_CONTENT = { "Sheet1": [ {"x": 1, "y": {"a": 1, "z": 1}}, {"x": 1, "y": {"a": 2, "z": 3}}, @@ -121,16 +121,28 @@ def test_item_manager_set_path_value(): ], } +SAMPLE_CSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.csv') + +SAMPLE_CSV_FILE_RAW_CONTENT = SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2'] + +SAMPLE_CSV_FILE_ITEM_CONTENT = SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2'] + def test_workbook_manager_load_content(): - wt = WorkbookManager(SAMPLE_FILE) - assert wt.load_content() == SAMPLE_FILE_RAW_CONTENT + wt = WorkbookManager(SAMPLE_XLSX_FILE) + assert wt.load_content() == SAMPLE_XLSX_FILE_RAW_CONTENT def test_workbook_manager_load_workbook(): - assert WorkbookManager.load_workbook(SAMPLE_FILE) == SAMPLE_FILE_RAW_CONTENT + assert WorkbookManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT + + +def test_workbook_manager_load_csv(): + + with pytest.raises(Exception): + WorkbookManager.load_workbook(SAMPLE_CSV_FILE) def test_item_manager_parse_value(): @@ -170,10 +182,16 @@ def test_item_manager_parse_value(): def test_item_manager_load_content(): - it = ItemManager(SAMPLE_FILE) - assert it.load_content() == SAMPLE_FILE_ITEM_CONTENT + it = ItemManager(SAMPLE_XLSX_FILE) + assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT def test_item_manager_load_workbook(): - assert ItemManager.load_workbook(SAMPLE_FILE) == SAMPLE_FILE_ITEM_CONTENT + assert ItemManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + +def test_item_manager_load_csv(): + + with pytest.raises(Exception): + ItemManager.load_workbook(SAMPLE_CSV_FILE) From e9d2465f7e8b5b73c140dc3704df32556440d4c4 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 14 Aug 2023 08:43:24 -0400 Subject: [PATCH 005/101] Update lock file. 
--- poetry.lock | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index d7e77523c..480148ea1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -752,6 +752,18 @@ develop = ["black", "coverage", "jinja2", "mock", "pytest", "pytest-cov", "pyyam docs = ["sphinx (<1.7)", "sphinx-rtd-theme"] requests = ["requests (>=2.4.0,<3.0.0)"] +[[package]] +name = "et-xmlfile" +version = "1.1.0" +description = "An implementation of lxml.xmlfile for the standard library" +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, + {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, +] + [[package]] name = "exceptiongroup" version = "1.1.2" @@ -911,6 +923,21 @@ files = [ [package.dependencies] psutil = {version = ">=4.0.0", markers = "sys_platform != \"cygwin\""} +[[package]] +name = "openpyxl" +version = "3.1.2" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"}, + {file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"}, +] + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "opensearch-py" version = "2.3.0" @@ -1594,4 +1621,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.7,<3.10" -content-hash = "b8d6612bb28cfb9da79306a82b2ac35a20678e1f62ef86c93b8af3c3d1ed798e" +content-hash = "9d01884634874c0304ebd91ae564ad7920cece54aea7de4c67955c2343e7d44b" From 6e9060f670a309bef7de997d7f5ae7e33a8b9272 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 14 Aug 2023 10:28:30 -0400 Subject: [PATCH 006/101] Document new sheets_utils module. --- docs/source/dcicutils.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst index 7fdaba7ea..cf8654a96 100644 --- a/docs/source/dcicutils.rst +++ b/docs/source/dcicutils.rst @@ -281,6 +281,13 @@ secrets_utils :members: +sheets_utils +^^^^^^^^^^^^ + +.. automodule:: dcicutils.sheets_utils + :members: + + snapshot_utils ^^^^^^^^^^^^^^ From df12c91bba8a9dc12816ae8b5555460fd2662fab Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 15 Aug 2023 16:30:01 -0400 Subject: [PATCH 007/101] Issue a beta for this functionality. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 647c13fe0..8fd8826a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.7.2" +version = "7.7.2.1b0" # to become "7.8.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 6a39c8a6dcafa584a22e311224e347522fac8b84 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 15 Aug 2023 16:31:41 -0400 Subject: [PATCH 008/101] Fix documentation for sheet_utils. 
--- docs/source/dcicutils.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst index cf8654a96..f0f07c49d 100644 --- a/docs/source/dcicutils.rst +++ b/docs/source/dcicutils.rst @@ -281,10 +281,10 @@ secrets_utils :members: -sheets_utils -^^^^^^^^^^^^ +sheet_utils +^^^^^^^^^^^ -.. automodule:: dcicutils.sheets_utils +.. automodule:: dcicutils.sheet_utils :members: From eedb5c68307fe1644d3396ef2d9de77eee6b5439 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 16 Aug 2023 10:25:12 -0400 Subject: [PATCH 009/101] Add some declarations. Small refactors to improve modularity. --- dcicutils/sheet_utils.py | 107 ++++++++++++++++++++++++--------------- 1 file changed, 67 insertions(+), 40 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 9633999d8..db310aeb2 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -1,11 +1,19 @@ import copy +from dcicutils.common import AnyJsonData from openpyxl import load_workbook from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook from typing import Any, Dict, List, Optional, Union +Header = str +Headers = List[str] +ParsedHeader = List[Union[str, int]] +ParsedHeaders = List[ParsedHeader] +SheetCellValue = Union[int, float, str] + + class WorkbookManager: @classmethod @@ -16,30 +24,30 @@ def load_workbook(cls, filename: str): def __init__(self, filename: str): self.filename: str = filename self.workbook: Optional[Workbook] = None - self.headers_by_sheetname: Dict[List[str]] = {} - self.content_by_sheetname: Dict[List[Any]] = {} + self.headers_by_sheetname: Dict[str, List[str]] = {} + self.content_by_sheetname: Dict[str, List[Any]] = {} - def sheet_headers(self, sheet: Worksheet) -> List[str]: - return self.headers_by_sheetname[sheet.title] + def sheet_headers(self, sheetname: str) -> List[str]: + return self.headers_by_sheetname[sheetname] - def sheet_content(self, sheet: Worksheet) -> List[Any]: - return self.content_by_sheetname[sheet.title] + def sheet_content(self, sheetname: str) -> List[Any]: + return self.content_by_sheetname[sheetname] @classmethod - def all_rows(cls, sheet: Worksheet): + def _all_rows(cls, sheet: Worksheet): row_max = sheet.max_row for row in range(2, row_max + 1): yield row @classmethod - def all_cols(cls, sheet: Worksheet): + def _all_cols(cls, sheet: Worksheet): col_max = sheet.max_column for col in range(1, col_max + 1): yield col - def load_headers(self, sheet: Worksheet): + def _load_headers(self, sheet: Worksheet): headers: List[str] = [str(sheet.cell(row=1, column=col).value) - for col in self.all_cols(sheet)] + for col in self._all_cols(sheet)] self.headers_by_sheetname[sheet.title] = headers def load_content(self): @@ -47,25 +55,44 @@ def load_content(self): self.workbook = workbook for sheetname in workbook.sheetnames: sheet: Worksheet = workbook[sheetname] - self.load_headers(sheet) + self._load_headers(sheet) content = [] - for row in self.all_rows(sheet): - row_dict = self.load_row(sheet=sheet, row=row) + for row in self._all_rows(sheet): + row_dict = self._load_row(sheet=sheet, row=row) content.append(row_dict) self.content_by_sheetname[sheetname] = content return self.content_by_sheetname - def load_row(self, *, sheet: Worksheet, row: int): - headers = self.sheet_headers(sheet) + def _load_row(self, *, sheet: Worksheet, row: int): + headers = self.sheet_headers(sheet.title) row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, 
column=col).value - for col in self.all_cols(sheet)} + for col in self._all_cols(sheet)} return row_dict class ItemTools: + """ + Implements operations on table-related data without pre-supposing the specific representation of the table. + It is assumed this can be used for data that was obtained from .json, .csv, .tsv, and .xlsx files because + it does not presuppose the source of the data nor where it will be written to. + + For the purpose of this class: + + * a 'header' is a string representing the top of a column. + + * a 'parsed header' is a list of strings and/or ints, after splitting at uses of '#' or '.', so that + "a.b.c" is represented as ["a", "b", "c"], and "x.y#0" is represented as ["x", "y", 0], and representing + each numeric token as an int instead of a string. + + * a 'headers' object is just a list of strings, each of which is a 'header'. + + * a 'parsed headers' object is a non-empty list of lists, each of which is a 'parsed header'. + e..g., the headers ["a.b.c", "x.y#0"] is represented as parsed hearders [["a", "b", "c"], ["x", "y", 0]]. + + """ @classmethod - def compute_patch_prototype(cls, parsed_headers): + def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): prototype = {} for parsed_header in parsed_headers: parsed_header0 = parsed_header[0] @@ -75,7 +102,7 @@ def compute_patch_prototype(cls, parsed_headers): return prototype @classmethod - def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: List[Union[int, str]]): + def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader): [key0, *more_keys] = keys key1 = more_keys[0] if more_keys else None if isinstance(key1, int): @@ -98,12 +125,12 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: List[U return parent @classmethod - def parse_sheet_headers(cls, headers): + def parse_sheet_headers(cls, headers: Headers): return [cls.parse_sheet_header(header) for header in headers] @classmethod - def parse_sheet_header(cls, header) -> List[Union[int, str]]: + def parse_sheet_header(cls, header: Header) -> ParsedHeader: result = [] token = "" for i in range(len(header)): @@ -119,7 +146,7 @@ def parse_sheet_header(cls, header) -> List[Union[int, str]]: return result @classmethod - def set_path_value(cls, datum, path, value, force=False): + def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): if (value is None or value == '') and not force: return [key, *more_path] = path @@ -129,7 +156,7 @@ def set_path_value(cls, datum, path, value, force=False): cls.set_path_value(datum[key], more_path, value) @classmethod - def parse_value(cls, value): + def parse_value(cls, value: SheetCellValue) -> AnyJsonData: if isinstance(value, str): lvalue = value.lower() # TODO: We could consult a schema to make this less heuristic, but this may do for now @@ -153,7 +180,7 @@ def parse_value(cls, value): except Exception: pass return value - else: # probably a number + else: # presumably a number (int or float) return value @@ -161,30 +188,30 @@ class ItemManager(ItemTools, WorkbookManager): def __init__(self, filename: str): super().__init__(filename=filename) - self.patch_prototypes_by_sheetname: Dict[Dict] = {} - self.parsed_headers_by_sheetname: Dict[List[List[Union[int, str]]]] = {} + self.patch_prototypes_by_sheetname: Dict[str, Dict] = {} + self.parsed_headers_by_sheetname: Dict[str, List[List[Union[int, str]]]] = {} - def sheet_patch_prototype(self, sheet: Worksheet) -> Dict: - return 
self.patch_prototypes_by_sheetname[sheet.title] + def sheet_patch_prototype(self, sheetname: str) -> Dict: + return self.patch_prototypes_by_sheetname[sheetname] - def sheet_parsed_headers(self, sheet: Worksheet) -> List[List[Union[int, str]]]: - return self.parsed_headers_by_sheetname[sheet.title] + def sheet_parsed_headers(self, sheetname: str) -> List[List[Union[int, str]]]: + return self.parsed_headers_by_sheetname[sheetname] - def load_headers(self, sheet: Worksheet): - super().load_headers(sheet) - self.compile_sheet_headers(sheet) + def _load_headers(self, sheet: Worksheet): + super()._load_headers(sheet) + self._compile_sheet_headers(sheet.title) - def compile_sheet_headers(self, sheet: Worksheet): - headers = self.headers_by_sheetname[sheet.title] + def _compile_sheet_headers(self, sheetname: str): + headers = self.headers_by_sheetname[sheetname] parsed_headers = self.parse_sheet_headers(headers) - self.parsed_headers_by_sheetname[sheet.title] = parsed_headers + self.parsed_headers_by_sheetname[sheetname] = parsed_headers prototype = self.compute_patch_prototype(parsed_headers) - self.patch_prototypes_by_sheetname[sheet.title] = prototype + self.patch_prototypes_by_sheetname[sheetname] = prototype - def load_row(self, *, sheet: Worksheet, row: int): - parsed_headers = self.sheet_parsed_headers(sheet) - patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet)) - for col in self.all_cols(sheet): + def _load_row(self, *, sheet: Worksheet, row: int): + parsed_headers = self.sheet_parsed_headers(sheet.title) + patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet.title)) + for col in self._all_cols(sheet): value = sheet.cell(row=row, column=col).value parsed_value = self.parse_value(value) self.set_path_value(patch_item, parsed_headers[col - 1], parsed_value) From a6b68feeb2219eb9f79635a7441656b09f38dc32 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 16 Aug 2023 13:33:06 -0400 Subject: [PATCH 010/101] Rearrange some methods for presentational reasons. 
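
The methods being reordered here are the ones that define the sheet conventions: dotted and '#'-numbered header segments describe nested objects and arrays, and string cell values are coerced heuristically. A quick summary of those conventions, with examples lifted from the tests in this change:

    from dcicutils.sheet_utils import ItemTools

    ItemTools.parse_sheet_header('x.xx#17#8.z')    # ['x', 'xx', 17, 8, 'z']
    ItemTools.parse_value('true')                  # True
    ItemTools.parse_value('NULL')                  # None
    ItemTools.parse_value('-3.5e1')                # -35.0
    ItemTools.parse_value('alpha|beta|gamma')      # ['alpha', 'beta', 'gamma']
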
--- dcicutils/sheet_utils.py | 174 +++++++++++++++++++-------------------- test/test_sheet_utils.py | 118 +++++++++++++------------- 2 files changed, 146 insertions(+), 146 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index db310aeb2..8125f27d3 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -14,62 +14,6 @@ SheetCellValue = Union[int, float, str] -class WorkbookManager: - - @classmethod - def load_workbook(cls, filename: str): - wb = cls(filename) - return wb.load_content() - - def __init__(self, filename: str): - self.filename: str = filename - self.workbook: Optional[Workbook] = None - self.headers_by_sheetname: Dict[str, List[str]] = {} - self.content_by_sheetname: Dict[str, List[Any]] = {} - - def sheet_headers(self, sheetname: str) -> List[str]: - return self.headers_by_sheetname[sheetname] - - def sheet_content(self, sheetname: str) -> List[Any]: - return self.content_by_sheetname[sheetname] - - @classmethod - def _all_rows(cls, sheet: Worksheet): - row_max = sheet.max_row - for row in range(2, row_max + 1): - yield row - - @classmethod - def _all_cols(cls, sheet: Worksheet): - col_max = sheet.max_column - for col in range(1, col_max + 1): - yield col - - def _load_headers(self, sheet: Worksheet): - headers: List[str] = [str(sheet.cell(row=1, column=col).value) - for col in self._all_cols(sheet)] - self.headers_by_sheetname[sheet.title] = headers - - def load_content(self): - workbook: Workbook = load_workbook(self.filename) - self.workbook = workbook - for sheetname in workbook.sheetnames: - sheet: Worksheet = workbook[sheetname] - self._load_headers(sheet) - content = [] - for row in self._all_rows(sheet): - row_dict = self._load_row(sheet=sheet, row=row) - content.append(row_dict) - self.content_by_sheetname[sheetname] = content - return self.content_by_sheetname - - def _load_row(self, *, sheet: Worksheet, row: int): - headers = self.sheet_headers(sheet.title) - row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, column=col).value - for col in self._all_cols(sheet)} - return row_dict - - class ItemTools: """ Implements operations on table-related data without pre-supposing the specific representation of the table. @@ -91,6 +35,27 @@ class ItemTools: """ + @classmethod + def parse_sheet_header(cls, header: Header) -> ParsedHeader: + result = [] + token = "" + for i in range(len(header)): + ch = header[i] + if ch == '.' or ch == '#': + if token: + result.append(int(token) if token.isdigit() else token) + token = "" + else: + token += ch + if token: + result.append(int(token) if token.isdigit() else token) + return result + + @classmethod + def parse_sheet_headers(cls, headers: Headers): + return [cls.parse_sheet_header(header) + for header in headers] + @classmethod def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): prototype = {} @@ -124,37 +89,6 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: Parsed cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys) return parent - @classmethod - def parse_sheet_headers(cls, headers: Headers): - return [cls.parse_sheet_header(header) - for header in headers] - - @classmethod - def parse_sheet_header(cls, header: Header) -> ParsedHeader: - result = [] - token = "" - for i in range(len(header)): - ch = header[i] - if ch == '.' 
or ch == '#': - if token: - result.append(int(token) if token.isdigit() else token) - token = "" - else: - token += ch - if token: - result.append(int(token) if token.isdigit() else token) - return result - - @classmethod - def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): - if (value is None or value == '') and not force: - return - [key, *more_path] = path - if not more_path: - datum[key] = value - else: - cls.set_path_value(datum[key], more_path, value) - @classmethod def parse_value(cls, value: SheetCellValue) -> AnyJsonData: if isinstance(value, str): @@ -183,6 +117,72 @@ def parse_value(cls, value: SheetCellValue) -> AnyJsonData: else: # presumably a number (int or float) return value + @classmethod + def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): + if (value is None or value == '') and not force: + return + [key, *more_path] = path + if not more_path: + datum[key] = value + else: + cls.set_path_value(datum[key], more_path, value) + + +class WorkbookManager: + + @classmethod + def load_workbook(cls, filename: str): + wb = cls(filename) + return wb.load_content() + + def __init__(self, filename: str): + self.filename: str = filename + self.workbook: Optional[Workbook] = None + self.headers_by_sheetname: Dict[str, List[str]] = {} + self.content_by_sheetname: Dict[str, List[Any]] = {} + + def sheet_headers(self, sheetname: str) -> List[str]: + return self.headers_by_sheetname[sheetname] + + def sheet_content(self, sheetname: str) -> List[Any]: + return self.content_by_sheetname[sheetname] + + @classmethod + def _all_rows(cls, sheet: Worksheet): + row_max = sheet.max_row + for row in range(2, row_max + 1): + yield row + + @classmethod + def _all_cols(cls, sheet: Worksheet): + col_max = sheet.max_column + for col in range(1, col_max + 1): + yield col + + def _load_headers(self, sheet: Worksheet): + headers: List[str] = [str(sheet.cell(row=1, column=col).value) + for col in self._all_cols(sheet)] + self.headers_by_sheetname[sheet.title] = headers + + def _load_row(self, *, sheet: Worksheet, row: int): + headers = self.sheet_headers(sheet.title) + row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, column=col).value + for col in self._all_cols(sheet)} + return row_dict + + def load_content(self): + workbook: Workbook = load_workbook(self.filename) + self.workbook = workbook + for sheetname in workbook.sheetnames: + sheet: Worksheet = workbook[sheetname] + self._load_headers(sheet) + content = [] + for row in self._all_rows(sheet): + row_dict = self._load_row(sheet=sheet, row=row) + content.append(row_dict) + self.content_by_sheetname[sheetname] = content + return self.content_by_sheetname + class ItemManager(ItemTools, WorkbookManager): diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 32dffb25b..40286d2e3 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -1,29 +1,29 @@ import os import pytest -from dcicutils.sheet_utils import WorkbookManager, ItemManager +from dcicutils.sheet_utils import ItemTools, WorkbookManager, ItemManager from .conftest_settings import TEST_DIR -def test_item_manager_parse_sheet_header(): - assert ItemManager.parse_sheet_header('.a') == ['a'] - assert ItemManager.parse_sheet_header('a') == ['a'] - assert ItemManager.parse_sheet_header('#0') == [0] - assert ItemManager.parse_sheet_header('0') == [0] - assert ItemManager.parse_sheet_header('foo.bar') == ['foo', 'bar'] - assert 
ItemManager.parse_sheet_header('a.b#0') == ['a', 'b', 0] - assert ItemManager.parse_sheet_header('x.xx#17#8.z') == ['x', 'xx', 17, 8, 'z'] +def test_item_tools_parse_sheet_header(): + assert ItemTools.parse_sheet_header('.a') == ['a'] + assert ItemTools.parse_sheet_header('a') == ['a'] + assert ItemTools.parse_sheet_header('#0') == [0] + assert ItemTools.parse_sheet_header('0') == [0] + assert ItemTools.parse_sheet_header('foo.bar') == ['foo', 'bar'] + assert ItemTools.parse_sheet_header('a.b#0') == ['a', 'b', 0] + assert ItemTools.parse_sheet_header('x.xx#17#8.z') == ['x', 'xx', 17, 8, 'z'] # We don't error-check this, but it shouldn't matter - assert ItemManager.parse_sheet_header('#abc') == ['abc'] - assert ItemManager.parse_sheet_header('.123') == [123] - assert ItemManager.parse_sheet_header('#abc.123#456.def') == ['abc', 123, 456, 'def'] + assert ItemTools.parse_sheet_header('#abc') == ['abc'] + assert ItemTools.parse_sheet_header('.123') == [123] + assert ItemTools.parse_sheet_header('#abc.123#456.def') == ['abc', 123, 456, 'def'] -def test_item_manager_parse_sheet_headers(): +def test_item_tools_parse_sheet_headers(): input = ['a.b', 'a.c', 'a.d#1', 'a.d#2'] expected = [['a', 'b'], ['a', 'c'], ['a', 'd', 1], ['a', 'd', 2]] - assert ItemManager.parse_sheet_headers(input) == expected + assert ItemTools.parse_sheet_headers(input) == expected @pytest.mark.parametrize('parsed_headers,expected_prototype', [ @@ -38,32 +38,67 @@ def test_item_manager_parse_sheet_headers(): (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar', 'a.d#1.foo', 'a.d#1.bar'], {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}, {'foo': None, 'bar': None}]}}), ]) -def test_item_manager_compute_patch_prototype(parsed_headers, expected_prototype): - parsed_headers = ItemManager.parse_sheet_headers(parsed_headers) - assert ItemManager.compute_patch_prototype(parsed_headers) == expected_prototype +def test_item_tools_compute_patch_prototype(parsed_headers, expected_prototype): + parsed_headers = ItemTools.parse_sheet_headers(parsed_headers) + assert ItemTools.compute_patch_prototype(parsed_headers) == expected_prototype @pytest.mark.parametrize('headers', [['0'], ['x', '0.y']]) -def test_item_manager_compute_patch_prototype_errors(headers): +def test_item_tools_compute_patch_prototype_errors(headers): - parsed_headers = ItemManager.parse_sheet_headers(headers) + parsed_headers = ItemTools.parse_sheet_headers(headers) with pytest.raises(ValueError) as exc: - ItemManager.compute_patch_prototype(parsed_headers) + ItemTools.compute_patch_prototype(parsed_headers) assert str(exc.value) == "A header cannot begin with a numeric ref: 0" -def test_item_manager_set_path_value(): +def test_item_tools_parse_value(): + + for x in [37, 19.3, True, False, None, 'simple text']: + assert ItemTools.parse_value(x) == x + + assert ItemTools.parse_value('3') == 3 + assert ItemTools.parse_value('+3') == 3 + assert ItemTools.parse_value('-3') == -3 + + assert ItemTools.parse_value('3.5') == 3.5 + assert ItemTools.parse_value('+3.5') == 3.5 + assert ItemTools.parse_value('-3.5') == -3.5 + + assert ItemTools.parse_value('3.5e1') == 35.0 + assert ItemTools.parse_value('+3.5e1') == 35.0 + assert ItemTools.parse_value('-3.5e1') == -35.0 + + assert ItemTools.parse_value('') is None + + assert ItemTools.parse_value('null') is None + assert ItemTools.parse_value('Null') is None + assert ItemTools.parse_value('NULL') is None + + assert ItemTools.parse_value('true') is True + assert ItemTools.parse_value('True') is True + assert 
ItemTools.parse_value('TRUE') is True + + assert ItemTools.parse_value('false') is False + assert ItemTools.parse_value('False') is False + assert ItemTools.parse_value('FALSE') is False + + assert ItemTools.parse_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] + assert ItemTools.parse_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] + + +def test_item_tools_set_path_value(): x = {'foo': 1, 'bar': 2} - ItemManager.set_path_value(x, ['foo'], 3) + ItemTools.set_path_value(x, ['foo'], 3) assert x == {'foo': 3, 'bar': 2} x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} - ItemManager.set_path_value(x, ['foo', 1], 17) + ItemTools.set_path_value(x, ['foo', 1], 17) assert x == {'foo': [11, 17, 33], 'bar': {'x': 'xx', 'y': 'yy'}} x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} - ItemManager.set_path_value(x, ['bar', 'x'], 'something') + ItemTools.set_path_value(x, ['bar', 'x'], 'something') assert x == {'foo': [11, 22, 33], 'bar': {'x': 'something', 'y': 'yy'}} @@ -145,41 +180,6 @@ def test_workbook_manager_load_csv(): WorkbookManager.load_workbook(SAMPLE_CSV_FILE) -def test_item_manager_parse_value(): - - for x in [37, 19.3, True, False, None, 'simple text']: - assert ItemManager.parse_value(x) == x - - assert ItemManager.parse_value('3') == 3 - assert ItemManager.parse_value('+3') == 3 - assert ItemManager.parse_value('-3') == -3 - - assert ItemManager.parse_value('3.5') == 3.5 - assert ItemManager.parse_value('+3.5') == 3.5 - assert ItemManager.parse_value('-3.5') == -3.5 - - assert ItemManager.parse_value('3.5e1') == 35.0 - assert ItemManager.parse_value('+3.5e1') == 35.0 - assert ItemManager.parse_value('-3.5e1') == -35.0 - - assert ItemManager.parse_value('') is None - - assert ItemManager.parse_value('null') is None - assert ItemManager.parse_value('Null') is None - assert ItemManager.parse_value('NULL') is None - - assert ItemManager.parse_value('true') is True - assert ItemManager.parse_value('True') is True - assert ItemManager.parse_value('TRUE') is True - - assert ItemManager.parse_value('false') is False - assert ItemManager.parse_value('False') is False - assert ItemManager.parse_value('FALSE') is False - - assert ItemManager.parse_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] - assert ItemManager.parse_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] - - def test_item_manager_load_content(): it = ItemManager(SAMPLE_XLSX_FILE) From 3ff63a94a4e7593f3b49e8188e6deaa578a0ec53 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 11:13:39 -0400 Subject: [PATCH 011/101] First cut at useful functionality. 
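
This change reworks the reader around a small hierarchy: AbstractTableSetManager declares the load_table_set entry point, BasicTableSetManager stores per-sheet headers and content, TableSetManager drives row iteration, and XlsxManager supplies the openpyxl-specific pieces. A sketch of the intended call pattern, assuming a workbook shaped like test/data_files/sample_items.xlsx (the filename below is hypothetical):

    from dcicutils.sheet_utils import XlsxManager

    tables = XlsxManager.load_table_set('my_items.xlsx')
    # {'Sheet1': [{'x': 1, 'y.a': 1, 'y.z': 1}, ...], 'Sheet2': [...]}
    # Raw cell values pass through prefer_number(), so '3' becomes 3 while
    # 'alpha' stays a string; ItemManagerMixin (later in this diff) layers the
    # item-style header parsing on top of this raw per-sheet content.
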
--- dcicutils/sheet_utils.py | 301 ++++++++++++++++++++++++++++++++------- poetry.lock | 14 +- pyproject.toml | 1 + test/test_sheet_utils.py | 120 +++++++++++----- 4 files changed, 343 insertions(+), 93 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 8125f27d3..e2f0e1c4d 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -1,10 +1,15 @@ +import chardet import copy +import csv +import io +import openpyxl from dcicutils.common import AnyJsonData -from openpyxl import load_workbook +from dcicutils.misc_utils import ignored from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook -from typing import Any, Dict, List, Optional, Union +from tempfile import TemporaryFile +from typing import Any, Dict, Iterable, List, Union Header = str @@ -12,6 +17,40 @@ ParsedHeader = List[Union[str, int]] ParsedHeaders = List[ParsedHeader] SheetCellValue = Union[int, float, str] +SheetRow = List[SheetCellValue] +CsvReader = type(csv.reader(TemporaryFile())) + + +def prefer_number(value: SheetCellValue): + if isinstance(value, str): # the given value might be an int or float, in which case just fall through + if not value: + return None + value = value + ch0 = value[0] + if ch0 == '+' or ch0 == '-' or ch0.isdigit(): + try: + return int(value) + except Exception: + pass + try: + return float(value) + except Exception: + pass + # If we couldn't parse it as an int or float, fall through to returning the original value + pass + return value + + +def open_text_input_file_respecting_byte_order_mark(filename): + """ + Opens a file for text input, respecting a byte-order mark (BOM). + """ + with io.open(filename, 'rb') as fp: + leading_bytes = fp.read(4 * 8) # 4 bytes is all we need + bom_info = chardet.detect(leading_bytes) + detected_encoding = bom_info and bom_info.get('encoding') # tread lightly + + return io.open(filename, 'r', encoding=detected_encoding) class ItemTools: @@ -90,7 +129,7 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: Parsed return parent @classmethod - def parse_value(cls, value: SheetCellValue) -> AnyJsonData: + def parse_item_value(cls, value: SheetCellValue) -> AnyJsonData: if isinstance(value, str): lvalue = value.lower() # TODO: We could consult a schema to make this less heuristic, but this may do for now @@ -101,19 +140,9 @@ def parse_value(cls, value: SheetCellValue) -> AnyJsonData: elif lvalue == 'null' or lvalue == '': return None elif '|' in value: - return [cls.parse_value(subvalue) for subvalue in value.split('|')] + return [cls.parse_item_value(subvalue) for subvalue in value.split('|')] else: - ch0 = value[0] - if ch0 == '+' or ch0 == '-' or ch0.isdigit(): - try: - return int(value) - except Exception: - pass - try: - return float(value) - except Exception: - pass - return value + return prefer_number(value) else: # presumably a number (int or float) return value @@ -128,25 +157,122 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any cls.set_path_value(datum[key], more_path, value) -class WorkbookManager: +class AbstractTableSetManager: + """ + The TableSetManager is the spanning class of anything that wants to be able to load a table set, + regardless of what it wants to load it from. 
To do this, it must support a load_table_set method + that takes a filename and returns the file content in the form: + { + "Sheet1": [ + {...representation of row1 as some kind of dict...}, + {...representation of row2 as some kind of dict...} + ], + "Sheet2": [...], + ..., + } + Note that at this level of abstraction, we take no position on what form of representation is used + for the rows, as long as it is JSON data of some kind. It might be + {"col1": "val1", "col2", "val2", ...} + or it might be something more structured like + {"something": "val1", {"something_else": ["val2"]}} + Additionally, the values stored might be altered as well. In particular, the most likely alteration + is to turn "123" to 123 or "" to None, though the specifics of whether and how such transformations + happen is not constrained by this class. + """ @classmethod - def load_workbook(cls, filename: str): - wb = cls(filename) - return wb.load_content() + def load_table_set(cls, filename: str) -> Dict[str, List[AnyJsonData]]: + """ + Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data. + For more information, see documentation of AbstractTableSetManager. + """ + raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.") + + +class BasicTableSetManager(AbstractTableSetManager): + """ + A BasicTableManager provides some structure that most kinds of parsers will need. + In particular, everything will likely need some way of storing headers and some way of storing content + of each sheet. Even a csv file, which doesn't have multiple tabs can be seen as the degenerate case + of this where there's only one set of headers and only one block of content. + """ + + def _create_sheet_processor_state(self, sheetname: str) -> Any: + """ + This method provides for the possibility that some parsers will want auxiliary state, + (such as parsed headers or a line count or a table of temporary names for objects to cross-link + or some other such feature) that it carries with it as it moves from line to line parsing things. + Subclasses might therefore want to make this do something more interesting. 
+ """ + ignored(sheetname) # subclasses might need this, but we don't + return None def __init__(self, filename: str): self.filename: str = filename - self.workbook: Optional[Workbook] = None self.headers_by_sheetname: Dict[str, List[str]] = {} - self.content_by_sheetname: Dict[str, List[Any]] = {} + self.content_by_sheetname: Dict[str, List[AnyJsonData]] = {} + self.workbook: Any = self._initialize_workbook() def sheet_headers(self, sheetname: str) -> List[str]: return self.headers_by_sheetname[sheetname] - def sheet_content(self, sheetname: str) -> List[Any]: + def sheet_content(self, sheetname: str) -> List[AnyJsonData]: return self.content_by_sheetname[sheetname] + def _initialize_workbook(self) -> Any: + """This function is responsible for opening the workbook and returning a workbook object.""" + raise NotImplementedError(f"._initialize_workbook() is not implemented for {self.__class__.__name__}.") + + def load_content(self) -> Any: + raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") + + +class TableSetManager(BasicTableSetManager): + + @classmethod + def load_table_set(cls, filename: str) -> AnyJsonData: + table_set_manager: TableSetManager = cls(filename) + return table_set_manager.load_content() + + def __init__(self, filename: str): + super().__init__(filename=filename) + + @property + def sheetnames(self) -> List[str]: + raise NotImplementedError(f".sheetnames is not implemented for {self.__class__.__name__}..") + + def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: + """ + Given a sheetname and a state (returned by _sheet_loader_state), return a generator for a set of row values. + What constitutes a row is just something that _sheet_col_enumerator will be happy receiving. + """ + raise NotImplementedError(f"._rows_for_sheetname(...) is not implemented for {self.__class__.__name__}.") + + def _process_row(self, sheetname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData: + """ + This needs to take a state and whatever represents a row and + must return a list of objects representing column values. + What constitutes a row is just something that _sheet_col_enumerator will be happy receiving. + """ + raise NotImplementedError(f"._process_row(...) 
is not implemented for {self.__class__.__name__}.") + + def load_content(self) -> AnyJsonData: + for sheetname in self.sheetnames: + sheet_content = [] + state = self._create_sheet_processor_state(sheetname) + for row_data in self._raw_row_generator_for_sheetname(sheetname): + processed_row_data: AnyJsonData = self._process_row(sheetname, state, row_data) + sheet_content.append(processed_row_data) + self.content_by_sheetname[sheetname] = sheet_content + return self.content_by_sheetname + + @classmethod + def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: + return prefer_number(value) + + +class XlsxManager(TableSetManager): + @classmethod def _all_rows(cls, sheet: Worksheet): row_max = sheet.max_row @@ -159,32 +285,36 @@ def _all_cols(cls, sheet: Worksheet): for col in range(1, col_max + 1): yield col - def _load_headers(self, sheet: Worksheet): + @property + def sheetnames(self) -> List[str]: + return self.workbook.sheetnames + + def _initialize_workbook(self) -> Workbook: + return openpyxl.load_workbook(self.filename) + + def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: + sheet = self.workbook[sheetname] + return (self._get_raw_row_content_tuple(sheet, row) + for row in self._all_rows(sheet)) + + def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: + return [sheet.cell(row=row, column=col).value + for col in self._all_cols(sheet)] + + def _create_sheet_processor_state(self, sheetname: str) -> Headers: + sheet = self.workbook[sheetname] headers: List[str] = [str(sheet.cell(row=1, column=col).value) for col in self._all_cols(sheet)] self.headers_by_sheetname[sheet.title] = headers + return headers - def _load_row(self, *, sheet: Worksheet, row: int): - headers = self.sheet_headers(sheet.title) - row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, column=col).value - for col in self._all_cols(sheet)} - return row_dict - - def load_content(self): - workbook: Workbook = load_workbook(self.filename) - self.workbook = workbook - for sheetname in workbook.sheetnames: - sheet: Worksheet = workbook[sheetname] - self._load_headers(sheet) - content = [] - for row in self._all_rows(sheet): - row_dict = self._load_row(sheet=sheet, row=row) - content.append(row_dict) - self.content_by_sheetname[sheetname] = content - return self.content_by_sheetname + def _process_row(self, sheetname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(sheetname) + return {headers[i]: self.parse_cell_value(row_datum) + for i, row_datum in enumerate(row_data)} -class ItemManager(ItemTools, WorkbookManager): +class ItemManagerMixin(BasicTableSetManager): def __init__(self, filename: str): super().__init__(filename=filename) @@ -197,22 +327,85 @@ def sheet_patch_prototype(self, sheetname: str) -> Dict: def sheet_parsed_headers(self, sheetname: str) -> List[List[Union[int, str]]]: return self.parsed_headers_by_sheetname[sheetname] - def _load_headers(self, sheet: Worksheet): - super()._load_headers(sheet) - self._compile_sheet_headers(sheet.title) + def _create_sheet_processor_state(self, sheetname: str) -> ParsedHeaders: + super()._create_sheet_processor_state(sheetname) + self._compile_sheet_headers(sheetname) + return self.sheet_parsed_headers(sheetname) def _compile_sheet_headers(self, sheetname: str): headers = self.headers_by_sheetname[sheetname] - parsed_headers = self.parse_sheet_headers(headers) + parsed_headers = ItemTools.parse_sheet_headers(headers) self.parsed_headers_by_sheetname[sheetname] = parsed_headers - 
prototype = self.compute_patch_prototype(parsed_headers) + prototype = ItemTools.compute_patch_prototype(parsed_headers) self.patch_prototypes_by_sheetname[sheetname] = prototype - def _load_row(self, *, sheet: Worksheet, row: int): - parsed_headers = self.sheet_parsed_headers(sheet.title) - patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet.title)) - for col in self._all_cols(sheet): - value = sheet.cell(row=row, column=col).value - parsed_value = self.parse_value(value) - self.set_path_value(patch_item, parsed_headers[col - 1], parsed_value) + def _process_row(self, sheetname: str, parsed_headers: ParsedHeaders, row_data: SheetRow) -> AnyJsonData: + patch_item = copy.deepcopy(self.sheet_patch_prototype(sheetname)) + for i, value in enumerate(row_data): + parsed_value = self.parse_cell_value(value) + ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value) return patch_item + + @classmethod + def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: + return ItemTools.parse_item_value(value) + + +class ItemXlsxManager(ItemManagerMixin, XlsxManager): + pass + + +class CsvManager(TableSetManager): + + DEFAULT_SHEET_NAME = 'Sheet1' + + def __init__(self, filename: str, sheet_name: str = None): + super().__init__(filename=filename) + self.sheet_name = sheet_name or self.DEFAULT_SHEET_NAME + + @property + def sheetnames(self) -> List[str]: + return [self.sheet_name] + + def _initialize_workbook(self) -> CsvReader: + return self._get_csv_reader(self.filename) + + @classmethod + def _get_csv_reader(cls, filename) -> CsvReader: + return csv.reader(open_text_input_file_respecting_byte_order_mark(filename)) + + def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: + return self.workbook + + def _create_sheet_processor_state(self, sheetname: str) -> Headers: + headers: Headers = self.headers_by_sheetname.get(sheetname) + if headers is None: + self.headers_by_sheetname[sheetname] = headers = self.workbook.__next__() + return headers + + def _process_row(self, sheetname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(sheetname) + return {headers[i]: self.parse_cell_value(row_datum) + for i, row_datum in enumerate(row_data)} + + +class ItemCsvManager(ItemManagerMixin, CsvManager): + pass + + +class ItemManager(AbstractTableSetManager): + + @classmethod + def create_workbook(cls, filename: str) -> BasicTableSetManager: + if filename.endswith(".xlsx"): + workbook = ItemXlsxManager(filename) + elif filename.endswith(".csv"): + workbook = ItemCsvManager(filename) + else: + raise ValueError("Unknown workbook type: ") + return workbook + + @classmethod + def load_table_set(cls, filename: str) -> AnyJsonData: + workbook = cls.create_workbook(filename) + return workbook.load_content() diff --git a/poetry.lock b/poetry.lock index 480148ea1..95670b506 100644 --- a/poetry.lock +++ b/poetry.lock @@ -489,6 +489,18 @@ files = [ [package.dependencies] pycparser = "*" +[[package]] +name = "chardet" +version = "5.2.0" +description = "Universal encoding detector for Python 3" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, + {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, +] + [[package]] name = "charset-normalizer" version = "3.2.0" @@ -1621,4 +1633,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", 
"jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.7,<3.10" -content-hash = "9d01884634874c0304ebd91ae564ad7920cece54aea7de4c67955c2343e7d44b" +content-hash = "eb629a04469e24b917d9525dd06dac72f2014cc9ede879946909929f5c09b9fd" diff --git a/pyproject.toml b/pyproject.toml index 8fd8826a4..0ca37b8cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ botocore = "^1.20.39" # This value is intentionally pinned and must not be changed casually. elasticsearch = "7.13.4" aws-requests-auth = ">=0.4.2,<1" +chardet = "^5.2.0" docker = "^4.4.4" gitpython = "^3.1.2" openpyxl = "^3.1.2" diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 40286d2e3..df1ed522c 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -1,7 +1,7 @@ import os import pytest -from dcicutils.sheet_utils import ItemTools, WorkbookManager, ItemManager +from dcicutils.sheet_utils import ItemTools, XlsxManager, ItemXlsxManager, CsvManager, ItemCsvManager, ItemManager from .conftest_settings import TEST_DIR @@ -52,39 +52,39 @@ def test_item_tools_compute_patch_prototype_errors(headers): assert str(exc.value) == "A header cannot begin with a numeric ref: 0" -def test_item_tools_parse_value(): +def test_item_tools_parse_item_value(): for x in [37, 19.3, True, False, None, 'simple text']: - assert ItemTools.parse_value(x) == x + assert ItemTools.parse_item_value(x) == x - assert ItemTools.parse_value('3') == 3 - assert ItemTools.parse_value('+3') == 3 - assert ItemTools.parse_value('-3') == -3 + assert ItemTools.parse_item_value('3') == 3 + assert ItemTools.parse_item_value('+3') == 3 + assert ItemTools.parse_item_value('-3') == -3 - assert ItemTools.parse_value('3.5') == 3.5 - assert ItemTools.parse_value('+3.5') == 3.5 - assert ItemTools.parse_value('-3.5') == -3.5 + assert ItemTools.parse_item_value('3.5') == 3.5 + assert ItemTools.parse_item_value('+3.5') == 3.5 + assert ItemTools.parse_item_value('-3.5') == -3.5 - assert ItemTools.parse_value('3.5e1') == 35.0 - assert ItemTools.parse_value('+3.5e1') == 35.0 - assert ItemTools.parse_value('-3.5e1') == -35.0 + assert ItemTools.parse_item_value('3.5e1') == 35.0 + assert ItemTools.parse_item_value('+3.5e1') == 35.0 + assert ItemTools.parse_item_value('-3.5e1') == -35.0 - assert ItemTools.parse_value('') is None + assert ItemTools.parse_item_value('') is None - assert ItemTools.parse_value('null') is None - assert ItemTools.parse_value('Null') is None - assert ItemTools.parse_value('NULL') is None + assert ItemTools.parse_item_value('null') is None + assert ItemTools.parse_item_value('Null') is None + assert ItemTools.parse_item_value('NULL') is None - assert ItemTools.parse_value('true') is True - assert ItemTools.parse_value('True') is True - assert ItemTools.parse_value('TRUE') is True + assert ItemTools.parse_item_value('true') is True + assert ItemTools.parse_item_value('True') is True + assert ItemTools.parse_item_value('TRUE') is True - assert ItemTools.parse_value('false') is False - assert ItemTools.parse_value('False') is False - assert ItemTools.parse_value('FALSE') is False + assert ItemTools.parse_item_value('false') is False + assert ItemTools.parse_item_value('False') is False + assert ItemTools.parse_item_value('FALSE') is False - assert ItemTools.parse_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] - assert ItemTools.parse_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] + assert ItemTools.parse_item_value('alpha|beta|gamma') == ['alpha', 'beta', 
'gamma'] + assert ItemTools.parse_item_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] def test_item_tools_set_path_value(): @@ -158,40 +158,84 @@ def test_item_tools_set_path_value(): SAMPLE_CSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.csv') -SAMPLE_CSV_FILE_RAW_CONTENT = SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2'] +SAMPLE_CSV_FILE_RAW_CONTENT = {CsvManager.DEFAULT_SHEET_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} -SAMPLE_CSV_FILE_ITEM_CONTENT = SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2'] +SAMPLE_CSV_FILE_ITEM_CONTENT = {ItemCsvManager.DEFAULT_SHEET_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} -def test_workbook_manager_load_content(): +def test_xlsx_manager_load_content(): - wt = WorkbookManager(SAMPLE_XLSX_FILE) + wt = XlsxManager(SAMPLE_XLSX_FILE) assert wt.load_content() == SAMPLE_XLSX_FILE_RAW_CONTENT -def test_workbook_manager_load_workbook(): +def test_xlsx_manager_load_workbook(): - assert WorkbookManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT + assert XlsxManager.load_table_set(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT -def test_workbook_manager_load_csv(): +def test_xlsx_manager_load_csv(): with pytest.raises(Exception): - WorkbookManager.load_workbook(SAMPLE_CSV_FILE) + XlsxManager.load_table_set(SAMPLE_CSV_FILE) -def test_item_manager_load_content(): +def test_item_xlsx_manager_load_content(): - it = ItemManager(SAMPLE_XLSX_FILE) + it = ItemXlsxManager(SAMPLE_XLSX_FILE) assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT -def test_item_manager_load_workbook(): +def test_item_xlsx_manager_load_workbook(): + + assert ItemXlsxManager.load_table_set(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + +def test_item_xlsx_manager_load_csv(): + + with pytest.raises(Exception): + ItemXlsxManager.load_table_set(SAMPLE_CSV_FILE) + - assert ItemManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT +def test_csv_manager_load_content(): + wt = CsvManager(SAMPLE_CSV_FILE) + assert wt.load_content() == SAMPLE_CSV_FILE_RAW_CONTENT -def test_item_manager_load_csv(): + +def test_csv_manager_load_workbook(): + + assert CsvManager.load_table_set(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT + + +def test_csv_manager_load_csv(): with pytest.raises(Exception): - ItemManager.load_workbook(SAMPLE_CSV_FILE) + CsvManager.load_table_set(SAMPLE_XLSX_FILE) + + +def test_item_csv_manager_load_content(): + + it = ItemCsvManager(SAMPLE_CSV_FILE) + assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT + + +def test_item_csv_manager_load_workbook(): + + assert ItemCsvManager.load_table_set(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + + +def test_item_csv_manager_load_csv(): + + with pytest.raises(Exception): + ItemCsvManager.load_table_set(SAMPLE_XLSX_FILE) + + +def test_item_manager_load_workbook(): + + assert ItemManager.load_table_set(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + assert ItemManager.load_table_set(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + + with pytest.raises(ValueError): + ItemManager.load_table_set("something.else") From 39bd2e095898b36c819e5330f19ab94591792a6c Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 11:20:34 -0400 Subject: [PATCH 012/101] Some name changes to make things more abstract. 
workbook becomes reader_agent, for example --- dcicutils/sheet_utils.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index e2f0e1c4d..82647ddb3 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -211,7 +211,7 @@ def __init__(self, filename: str): self.filename: str = filename self.headers_by_sheetname: Dict[str, List[str]] = {} self.content_by_sheetname: Dict[str, List[AnyJsonData]] = {} - self.workbook: Any = self._initialize_workbook() + self.reader_agent: Any = self._get_reader_agent() def sheet_headers(self, sheetname: str) -> List[str]: return self.headers_by_sheetname[sheetname] @@ -219,9 +219,9 @@ def sheet_headers(self, sheetname: str) -> List[str]: def sheet_content(self, sheetname: str) -> List[AnyJsonData]: return self.content_by_sheetname[sheetname] - def _initialize_workbook(self) -> Any: + def _get_reader_agent(self) -> Any: """This function is responsible for opening the workbook and returning a workbook object.""" - raise NotImplementedError(f"._initialize_workbook() is not implemented for {self.__class__.__name__}.") + raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") def load_content(self) -> Any: raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") @@ -287,13 +287,13 @@ def _all_cols(cls, sheet: Worksheet): @property def sheetnames(self) -> List[str]: - return self.workbook.sheetnames + return self.reader_agent.sheetnames - def _initialize_workbook(self) -> Workbook: + def _get_reader_agent(self) -> Workbook: return openpyxl.load_workbook(self.filename) def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: - sheet = self.workbook[sheetname] + sheet = self.reader_agent[sheetname] return (self._get_raw_row_content_tuple(sheet, row) for row in self._all_rows(sheet)) @@ -302,7 +302,7 @@ def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: for col in self._all_cols(sheet)] def _create_sheet_processor_state(self, sheetname: str) -> Headers: - sheet = self.workbook[sheetname] + sheet = self.reader_agent[sheetname] headers: List[str] = [str(sheet.cell(row=1, column=col).value) for col in self._all_cols(sheet)] self.headers_by_sheetname[sheet.title] = headers @@ -367,7 +367,7 @@ def __init__(self, filename: str, sheet_name: str = None): def sheetnames(self) -> List[str]: return [self.sheet_name] - def _initialize_workbook(self) -> CsvReader: + def _get_reader_agent(self) -> CsvReader: return self._get_csv_reader(self.filename) @classmethod @@ -375,12 +375,12 @@ def _get_csv_reader(cls, filename) -> CsvReader: return csv.reader(open_text_input_file_respecting_byte_order_mark(filename)) def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: - return self.workbook + return self.reader_agent def _create_sheet_processor_state(self, sheetname: str) -> Headers: headers: Headers = self.headers_by_sheetname.get(sheetname) if headers is None: - self.headers_by_sheetname[sheetname] = headers = self.workbook.__next__() + self.headers_by_sheetname[sheetname] = headers = self.reader_agent.__next__() return headers def _process_row(self, sheetname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: @@ -396,16 +396,16 @@ class ItemCsvManager(ItemManagerMixin, CsvManager): class ItemManager(AbstractTableSetManager): @classmethod - def create_workbook(cls, filename: str) -> 
BasicTableSetManager: + def create_implementation_manager(cls, filename: str) -> BasicTableSetManager: if filename.endswith(".xlsx"): - workbook = ItemXlsxManager(filename) + reader_agent = ItemXlsxManager(filename) elif filename.endswith(".csv"): - workbook = ItemCsvManager(filename) + reader_agent = ItemCsvManager(filename) else: - raise ValueError("Unknown workbook type: ") - return workbook + raise ValueError(f"Unknown file type: {filename}") + return reader_agent @classmethod def load_table_set(cls, filename: str) -> AnyJsonData: - workbook = cls.create_workbook(filename) - return workbook.load_content() + manager = cls.create_implementation_manager(filename) + return manager.load_content() From 77b72f6452b72aed0b1ead9ba9a5a81c53838122 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 11:30:52 -0400 Subject: [PATCH 013/101] Rename sheetname to tabname throughout, to be more clear that this is not the workbook level artifact. Better handling of init args. --- dcicutils/sheet_utils.py | 134 ++++++++++++++++++++------------------- test/test_sheet_utils.py | 4 +- 2 files changed, 72 insertions(+), 66 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 82647ddb3..5608e61ea 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -180,6 +180,11 @@ class AbstractTableSetManager: happen is not constrained by this class. """ + @classmethod + def __init__(self, **kwargs): + if kwargs: + raise ValueError(f"Unexpectd keyword arguments initializing {self.__class__.__name__}: {kwargs}") + @classmethod def load_table_set(cls, filename: str) -> Dict[str, List[AnyJsonData]]: """ @@ -197,27 +202,28 @@ class BasicTableSetManager(AbstractTableSetManager): of this where there's only one set of headers and only one block of content. """ - def _create_sheet_processor_state(self, sheetname: str) -> Any: + def _create_sheet_processor_state(self, tabname: str) -> Any: """ This method provides for the possibility that some parsers will want auxiliary state, (such as parsed headers or a line count or a table of temporary names for objects to cross-link or some other such feature) that it carries with it as it moves from line to line parsing things. Subclasses might therefore want to make this do something more interesting. 
""" - ignored(sheetname) # subclasses might need this, but we don't + ignored(tabname) # subclasses might need this, but we don't return None - def __init__(self, filename: str): + def __init__(self, filename: str, **kwargs): + super().__init__(**kwargs) self.filename: str = filename - self.headers_by_sheetname: Dict[str, List[str]] = {} - self.content_by_sheetname: Dict[str, List[AnyJsonData]] = {} + self.headers_by_tabname: Dict[str, List[str]] = {} + self.content_by_tabname: Dict[str, List[AnyJsonData]] = {} self.reader_agent: Any = self._get_reader_agent() - def sheet_headers(self, sheetname: str) -> List[str]: - return self.headers_by_sheetname[sheetname] + def sheet_headers(self, tabname: str) -> List[str]: + return self.headers_by_tabname[tabname] - def sheet_content(self, sheetname: str) -> List[AnyJsonData]: - return self.content_by_sheetname[sheetname] + def sheet_content(self, tabname: str) -> List[AnyJsonData]: + return self.content_by_tabname[tabname] def _get_reader_agent(self) -> Any: """This function is responsible for opening the workbook and returning a workbook object.""" @@ -234,21 +240,21 @@ def load_table_set(cls, filename: str) -> AnyJsonData: table_set_manager: TableSetManager = cls(filename) return table_set_manager.load_content() - def __init__(self, filename: str): - super().__init__(filename=filename) + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) @property - def sheetnames(self) -> List[str]: - raise NotImplementedError(f".sheetnames is not implemented for {self.__class__.__name__}..") + def tabnames(self) -> List[str]: + raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}..") - def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: + def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: """ - Given a sheetname and a state (returned by _sheet_loader_state), return a generator for a set of row values. + Given a tabname and a state (returned by _sheet_loader_state), return a generator for a set of row values. What constitutes a row is just something that _sheet_col_enumerator will be happy receiving. """ - raise NotImplementedError(f"._rows_for_sheetname(...) is not implemented for {self.__class__.__name__}.") + raise NotImplementedError(f"._rows_for_tabname(...) is not implemented for {self.__class__.__name__}.") - def _process_row(self, sheetname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData: + def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData: """ This needs to take a state and whatever represents a row and must return a list of objects representing column values. @@ -257,14 +263,14 @@ def _process_row(self, sheetname: str, state: Any, row: List[SheetCellValue]) -> raise NotImplementedError(f"._process_row(...) 
is not implemented for {self.__class__.__name__}.") def load_content(self) -> AnyJsonData: - for sheetname in self.sheetnames: + for tabname in self.tabnames: sheet_content = [] - state = self._create_sheet_processor_state(sheetname) - for row_data in self._raw_row_generator_for_sheetname(sheetname): - processed_row_data: AnyJsonData = self._process_row(sheetname, state, row_data) + state = self._create_sheet_processor_state(tabname) + for row_data in self._raw_row_generator_for_tabname(tabname): + processed_row_data: AnyJsonData = self._process_row(tabname, state, row_data) sheet_content.append(processed_row_data) - self.content_by_sheetname[sheetname] = sheet_content - return self.content_by_sheetname + self.content_by_tabname[tabname] = sheet_content + return self.content_by_tabname @classmethod def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: @@ -286,14 +292,14 @@ def _all_cols(cls, sheet: Worksheet): yield col @property - def sheetnames(self) -> List[str]: + def tabnames(self) -> List[str]: return self.reader_agent.sheetnames def _get_reader_agent(self) -> Workbook: return openpyxl.load_workbook(self.filename) - def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: - sheet = self.reader_agent[sheetname] + def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: + sheet = self.reader_agent[tabname] return (self._get_raw_row_content_tuple(sheet, row) for row in self._all_rows(sheet)) @@ -301,46 +307,46 @@ def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: return [sheet.cell(row=row, column=col).value for col in self._all_cols(sheet)] - def _create_sheet_processor_state(self, sheetname: str) -> Headers: - sheet = self.reader_agent[sheetname] + def _create_sheet_processor_state(self, tabname: str) -> Headers: + sheet = self.reader_agent[tabname] headers: List[str] = [str(sheet.cell(row=1, column=col).value) for col in self._all_cols(sheet)] - self.headers_by_sheetname[sheet.title] = headers + self.headers_by_tabname[sheet.title] = headers return headers - def _process_row(self, sheetname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: - ignored(sheetname) + def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(tabname) return {headers[i]: self.parse_cell_value(row_datum) for i, row_datum in enumerate(row_data)} class ItemManagerMixin(BasicTableSetManager): - def __init__(self, filename: str): - super().__init__(filename=filename) - self.patch_prototypes_by_sheetname: Dict[str, Dict] = {} - self.parsed_headers_by_sheetname: Dict[str, List[List[Union[int, str]]]] = {} + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) + self.patch_prototypes_by_tabname: Dict[str, Dict] = {} + self.parsed_headers_by_tabname: Dict[str, List[List[Union[int, str]]]] = {} - def sheet_patch_prototype(self, sheetname: str) -> Dict: - return self.patch_prototypes_by_sheetname[sheetname] + def sheet_patch_prototype(self, tabname: str) -> Dict: + return self.patch_prototypes_by_tabname[tabname] - def sheet_parsed_headers(self, sheetname: str) -> List[List[Union[int, str]]]: - return self.parsed_headers_by_sheetname[sheetname] + def sheet_parsed_headers(self, tabname: str) -> List[List[Union[int, str]]]: + return self.parsed_headers_by_tabname[tabname] - def _create_sheet_processor_state(self, sheetname: str) -> ParsedHeaders: - super()._create_sheet_processor_state(sheetname) - self._compile_sheet_headers(sheetname) - return 
self.sheet_parsed_headers(sheetname) + def _create_sheet_processor_state(self, tabname: str) -> ParsedHeaders: + super()._create_sheet_processor_state(tabname) + self._compile_sheet_headers(tabname) + return self.sheet_parsed_headers(tabname) - def _compile_sheet_headers(self, sheetname: str): - headers = self.headers_by_sheetname[sheetname] + def _compile_sheet_headers(self, tabname: str): + headers = self.headers_by_tabname[tabname] parsed_headers = ItemTools.parse_sheet_headers(headers) - self.parsed_headers_by_sheetname[sheetname] = parsed_headers + self.parsed_headers_by_tabname[tabname] = parsed_headers prototype = ItemTools.compute_patch_prototype(parsed_headers) - self.patch_prototypes_by_sheetname[sheetname] = prototype + self.patch_prototypes_by_tabname[tabname] = prototype - def _process_row(self, sheetname: str, parsed_headers: ParsedHeaders, row_data: SheetRow) -> AnyJsonData: - patch_item = copy.deepcopy(self.sheet_patch_prototype(sheetname)) + def _process_row(self, tabname: str, parsed_headers: ParsedHeaders, row_data: SheetRow) -> AnyJsonData: + patch_item = copy.deepcopy(self.sheet_patch_prototype(tabname)) for i, value in enumerate(row_data): parsed_value = self.parse_cell_value(value) ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value) @@ -357,15 +363,15 @@ class ItemXlsxManager(ItemManagerMixin, XlsxManager): class CsvManager(TableSetManager): - DEFAULT_SHEET_NAME = 'Sheet1' + DEFAULT_TAB_NAME = 'Sheet1' - def __init__(self, filename: str, sheet_name: str = None): - super().__init__(filename=filename) - self.sheet_name = sheet_name or self.DEFAULT_SHEET_NAME + def __init__(self, filename: str, sheet_name: str = None, **kwargs): + super().__init__(filename=filename, **kwargs) + self.tab_name = sheet_name or self.DEFAULT_TAB_NAME @property - def sheetnames(self) -> List[str]: - return [self.sheet_name] + def tabnames(self) -> List[str]: + return [self.tab_name] def _get_reader_agent(self) -> CsvReader: return self._get_csv_reader(self.filename) @@ -374,17 +380,17 @@ def _get_reader_agent(self) -> CsvReader: def _get_csv_reader(cls, filename) -> CsvReader: return csv.reader(open_text_input_file_respecting_byte_order_mark(filename)) - def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: + def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: return self.reader_agent - def _create_sheet_processor_state(self, sheetname: str) -> Headers: - headers: Headers = self.headers_by_sheetname.get(sheetname) + def _create_sheet_processor_state(self, tabname: str) -> Headers: + headers: Headers = self.headers_by_tabname.get(tabname) if headers is None: - self.headers_by_sheetname[sheetname] = headers = self.reader_agent.__next__() + self.headers_by_tabname[tabname] = headers = self.reader_agent.__next__() return headers - def _process_row(self, sheetname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: - ignored(sheetname) + def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(tabname) return {headers[i]: self.parse_cell_value(row_datum) for i, row_datum in enumerate(row_data)} @@ -396,11 +402,11 @@ class ItemCsvManager(ItemManagerMixin, CsvManager): class ItemManager(AbstractTableSetManager): @classmethod - def create_implementation_manager(cls, filename: str) -> BasicTableSetManager: + def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager: if filename.endswith(".xlsx"): - reader_agent = ItemXlsxManager(filename) + 
reader_agent = ItemXlsxManager(filename, **kwargs) elif filename.endswith(".csv"): - reader_agent = ItemCsvManager(filename) + reader_agent = ItemCsvManager(filename, **kwargs) else: raise ValueError(f"Unknown file type: {filename}") return reader_agent diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index df1ed522c..4a32e928f 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -158,9 +158,9 @@ def test_item_tools_set_path_value(): SAMPLE_CSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.csv') -SAMPLE_CSV_FILE_RAW_CONTENT = {CsvManager.DEFAULT_SHEET_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} +SAMPLE_CSV_FILE_RAW_CONTENT = {CsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} -SAMPLE_CSV_FILE_ITEM_CONTENT = {ItemCsvManager.DEFAULT_SHEET_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} +SAMPLE_CSV_FILE_ITEM_CONTENT = {ItemCsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} def test_xlsx_manager_load_content(): From ba8c55c922bdf967d18b9201b7fbebb3bfeb5f7b Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 12:10:23 -0400 Subject: [PATCH 014/101] Add some doc strings. Rename load_table_set to just load. Arrange for ItemManager.load to take a tab_name argument so that CSV files can perhaps infer a type name. --- dcicutils/sheet_utils.py | 91 +++++++++++++++++++++++++--------------- test/test_sheet_utils.py | 22 +++++----- 2 files changed, 68 insertions(+), 45 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 5608e61ea..fc2e4752a 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -160,7 +160,7 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any class AbstractTableSetManager: """ The TableSetManager is the spanning class of anything that wants to be able to load a table set, - regardless of what it wants to load it from. To do this, it must support a load_table_set method + regardless of what it wants to load it from. To do this, it must support a load method that takes a filename and returns the file content in the form: { "Sheet1": [ @@ -180,13 +180,12 @@ class AbstractTableSetManager: happen is not constrained by this class. """ - @classmethod def __init__(self, **kwargs): if kwargs: - raise ValueError(f"Unexpectd keyword arguments initializing {self.__class__.__name__}: {kwargs}") + raise ValueError(f"Got unexpected keywords: {kwargs}") @classmethod - def load_table_set(cls, filename: str) -> Dict[str, List[AnyJsonData]]: + def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]: """ Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data. For more information, see documentation of AbstractTableSetManager. @@ -202,16 +201,6 @@ class BasicTableSetManager(AbstractTableSetManager): of this where there's only one set of headers and only one block of content. """ - def _create_sheet_processor_state(self, tabname: str) -> Any: - """ - This method provides for the possibility that some parsers will want auxiliary state, - (such as parsed headers or a line count or a table of temporary names for objects to cross-link - or some other such feature) that it carries with it as it moves from line to line parsing things. - Subclasses might therefore want to make this do something more interesting. 
- """ - ignored(tabname) # subclasses might need this, but we don't - return None - def __init__(self, filename: str, **kwargs): super().__init__(**kwargs) self.filename: str = filename @@ -219,12 +208,22 @@ def __init__(self, filename: str, **kwargs): self.content_by_tabname: Dict[str, List[AnyJsonData]] = {} self.reader_agent: Any = self._get_reader_agent() - def sheet_headers(self, tabname: str) -> List[str]: + def tab_headers(self, tabname: str) -> List[str]: return self.headers_by_tabname[tabname] - def sheet_content(self, tabname: str) -> List[AnyJsonData]: + def tab_content(self, tabname: str) -> List[AnyJsonData]: return self.content_by_tabname[tabname] + def _create_tab_processor_state(self, tabname: str) -> Any: + """ + This method provides for the possibility that some parsers will want auxiliary state, + (such as parsed headers or a line count or a table of temporary names for objects to cross-link + or some other such feature) that it carries with it as it moves from line to line parsing things. + Subclasses might therefore want to make this do something more interesting. + """ + ignored(tabname) # subclasses might need this, but we don't + return None + def _get_reader_agent(self) -> Any: """This function is responsible for opening the workbook and returning a workbook object.""" raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") @@ -236,12 +235,12 @@ def load_content(self) -> Any: class TableSetManager(BasicTableSetManager): @classmethod - def load_table_set(cls, filename: str) -> AnyJsonData: + def load(cls, filename: str) -> AnyJsonData: table_set_manager: TableSetManager = cls(filename) return table_set_manager.load_content() - def __init__(self, filename: str, **kwargs): - super().__init__(filename=filename, **kwargs) + def __init__(self, filename: str): + super().__init__(filename=filename) @property def tabnames(self) -> List[str]: @@ -250,7 +249,6 @@ def tabnames(self) -> List[str]: def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: """ Given a tabname and a state (returned by _sheet_loader_state), return a generator for a set of row values. - What constitutes a row is just something that _sheet_col_enumerator will be happy receiving. """ raise NotImplementedError(f"._rows_for_tabname(...) is not implemented for {self.__class__.__name__}.") @@ -258,14 +256,14 @@ def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> A """ This needs to take a state and whatever represents a row and must return a list of objects representing column values. - What constitutes a row is just something that _sheet_col_enumerator will be happy receiving. + What constitutes a processed up to the class, but other than that the result must be a JSON dictionary. """ raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.") def load_content(self) -> AnyJsonData: for tabname in self.tabnames: sheet_content = [] - state = self._create_sheet_processor_state(tabname) + state = self._create_tab_processor_state(tabname) for row_data in self._raw_row_generator_for_tabname(tabname): processed_row_data: AnyJsonData = self._process_row(tabname, state, row_data) sheet_content.append(processed_row_data) @@ -278,6 +276,9 @@ def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: class XlsxManager(TableSetManager): + """ + This implements the mechanism to get a series of rows out of the sheets in an XLSX file. 
+ """ @classmethod def _all_rows(cls, sheet: Worksheet): @@ -307,7 +308,7 @@ def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: return [sheet.cell(row=row, column=col).value for col in self._all_cols(sheet)] - def _create_sheet_processor_state(self, tabname: str) -> Headers: + def _create_tab_processor_state(self, tabname: str) -> Headers: sheet = self.reader_agent[tabname] headers: List[str] = [str(sheet.cell(row=1, column=col).value) for col in self._all_cols(sheet)] @@ -321,6 +322,10 @@ def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> An class ItemManagerMixin(BasicTableSetManager): + """ + This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows + get handled like Items instead of just flat table rows. + """ def __init__(self, filename: str, **kwargs): super().__init__(filename=filename, **kwargs) @@ -333,8 +338,10 @@ def sheet_patch_prototype(self, tabname: str) -> Dict: def sheet_parsed_headers(self, tabname: str) -> List[List[Union[int, str]]]: return self.parsed_headers_by_tabname[tabname] - def _create_sheet_processor_state(self, tabname: str) -> ParsedHeaders: - super()._create_sheet_processor_state(tabname) + def _create_tab_processor_state(self, tabname: str) -> ParsedHeaders: + super()._create_tab_processor_state(tabname) + # This will create state that allows us to efficiently assign values in the right place on each row + # by setting up a prototype we can copy and then drop values into. self._compile_sheet_headers(tabname) return self.sheet_parsed_headers(tabname) @@ -358,16 +365,23 @@ def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: class ItemXlsxManager(ItemManagerMixin, XlsxManager): + """ + This layers item-style row processing functionality on an XLSX file. + """ pass class CsvManager(TableSetManager): + """ + This implements the mechanism to get a series of rows out of the sheet in a csv file, + returning a result that still looks like there could have been multiple tabs. + """ DEFAULT_TAB_NAME = 'Sheet1' - def __init__(self, filename: str, sheet_name: str = None, **kwargs): - super().__init__(filename=filename, **kwargs) - self.tab_name = sheet_name or self.DEFAULT_TAB_NAME + def __init__(self, filename: str, tab_name=None): + super().__init__(filename=filename) + self.tab_name = tab_name or self.DEFAULT_TAB_NAME @property def tabnames(self) -> List[str]: @@ -383,7 +397,7 @@ def _get_csv_reader(cls, filename) -> CsvReader: def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: return self.reader_agent - def _create_sheet_processor_state(self, tabname: str) -> Headers: + def _create_tab_processor_state(self, tabname: str) -> Headers: headers: Headers = self.headers_by_tabname.get(tabname) if headers is None: self.headers_by_tabname[tabname] = headers = self.reader_agent.__next__() @@ -396,22 +410,31 @@ def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> An class ItemCsvManager(ItemManagerMixin, CsvManager): + """ + This layers item-style row processing functionality on a CSV file. + """ pass class ItemManager(AbstractTableSetManager): + """ + This class will open a .xlsx or .csv file and load its content in our standard format. + (See more detailed description in AbstractTableManager.) 
+ """ @classmethod - def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager: + def create_implementation_manager(cls, filename: str, tab_name=None) -> BasicTableSetManager: if filename.endswith(".xlsx"): - reader_agent = ItemXlsxManager(filename, **kwargs) + if tab_name is not None: + raise ValueError(f".xlsx files don't need tab_name={tab_name!r}") + reader_agent = ItemXlsxManager(filename) elif filename.endswith(".csv"): - reader_agent = ItemCsvManager(filename, **kwargs) + reader_agent = ItemCsvManager(filename, tab_name=tab_name) else: raise ValueError(f"Unknown file type: {filename}") return reader_agent @classmethod - def load_table_set(cls, filename: str) -> AnyJsonData: - manager = cls.create_implementation_manager(filename) + def load(cls, filename: str, tab_name=None) -> AnyJsonData: + manager = cls.create_implementation_manager(filename, tab_name=tab_name) return manager.load_content() diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 4a32e928f..c2809a9f4 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -171,13 +171,13 @@ def test_xlsx_manager_load_content(): def test_xlsx_manager_load_workbook(): - assert XlsxManager.load_table_set(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT + assert XlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT def test_xlsx_manager_load_csv(): with pytest.raises(Exception): - XlsxManager.load_table_set(SAMPLE_CSV_FILE) + XlsxManager.load(SAMPLE_CSV_FILE) def test_item_xlsx_manager_load_content(): @@ -188,13 +188,13 @@ def test_item_xlsx_manager_load_content(): def test_item_xlsx_manager_load_workbook(): - assert ItemXlsxManager.load_table_set(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + assert ItemXlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT def test_item_xlsx_manager_load_csv(): with pytest.raises(Exception): - ItemXlsxManager.load_table_set(SAMPLE_CSV_FILE) + ItemXlsxManager.load(SAMPLE_CSV_FILE) def test_csv_manager_load_content(): @@ -205,13 +205,13 @@ def test_csv_manager_load_content(): def test_csv_manager_load_workbook(): - assert CsvManager.load_table_set(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT + assert CsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT def test_csv_manager_load_csv(): with pytest.raises(Exception): - CsvManager.load_table_set(SAMPLE_XLSX_FILE) + CsvManager.load(SAMPLE_XLSX_FILE) def test_item_csv_manager_load_content(): @@ -222,20 +222,20 @@ def test_item_csv_manager_load_content(): def test_item_csv_manager_load_workbook(): - assert ItemCsvManager.load_table_set(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + assert ItemCsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT def test_item_csv_manager_load_csv(): with pytest.raises(Exception): - ItemCsvManager.load_table_set(SAMPLE_XLSX_FILE) + ItemCsvManager.load(SAMPLE_XLSX_FILE) def test_item_manager_load_workbook(): - assert ItemManager.load_table_set(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + assert ItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT - assert ItemManager.load_table_set(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + assert ItemManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT with pytest.raises(ValueError): - ItemManager.load_table_set("something.else") + ItemManager.load("something.else") From 50488cb7411d29b72fa8109f6de93d1841861541 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 12:24:46 -0400 Subject: [PATCH 015/101] Add load_items 
function. Fix some test names. Update changelog. --- CHANGELOG.rst | 15 +++++++++++++-- dcicutils/sheet_utils.py | 3 +++ test/test_sheet_utils.py | 24 ++++++++++++++++++------ 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 61f334d68..f07b9a4c3 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,9 +12,20 @@ Change Log * New module ``sheet_utils`` for loading workbooks. - * class ``WorkbookManager`` for loading raw data + * Important things of interest: - * class ``ItemManager`` for loading item data + * Class ``ItemManager`` for loading Item-style data + from either ``.xlsx`` or ``.csv`` files. + + * Function ``load_items`` that does the same as ``ItemManager.load``. + + * Various low-level implementation classes such as: + + * Classes ``XlsxManager`` and ``CsvManager`` for loading raw data + from ``.xlsx`` and ``.csv`` files, respectively. + + * Classes ``ItemXlsxManager`` and ``ItemCsvManager`` for loading Item-style data + from ``.xlsx`` and ``.csv`` files, respectively. 7.7.2 diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index fc2e4752a..7a6959a47 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -438,3 +438,6 @@ def create_implementation_manager(cls, filename: str, tab_name=None) -> BasicTab def load(cls, filename: str, tab_name=None) -> AnyJsonData: manager = cls.create_implementation_manager(filename, tab_name=tab_name) return manager.load_content() + + +load_items = ItemManager.load diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index c2809a9f4..b98c56fa5 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -1,7 +1,9 @@ import os import pytest -from dcicutils.sheet_utils import ItemTools, XlsxManager, ItemXlsxManager, CsvManager, ItemCsvManager, ItemManager +from dcicutils.sheet_utils import ( + ItemTools, XlsxManager, ItemXlsxManager, CsvManager, ItemCsvManager, ItemManager, load_items, +) from .conftest_settings import TEST_DIR @@ -169,7 +171,7 @@ def test_xlsx_manager_load_content(): assert wt.load_content() == SAMPLE_XLSX_FILE_RAW_CONTENT -def test_xlsx_manager_load_workbook(): +def test_xlsx_manager_load(): assert XlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT @@ -186,7 +188,7 @@ def test_item_xlsx_manager_load_content(): assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT -def test_item_xlsx_manager_load_workbook(): +def test_item_xlsx_manager_load(): assert ItemXlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT @@ -203,7 +205,7 @@ def test_csv_manager_load_content(): assert wt.load_content() == SAMPLE_CSV_FILE_RAW_CONTENT -def test_csv_manager_load_workbook(): +def test_csv_manager_load(): assert CsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT @@ -220,7 +222,7 @@ def test_item_csv_manager_load_content(): assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT -def test_item_csv_manager_load_workbook(): +def test_item_csv_manager_load(): assert ItemCsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT @@ -231,7 +233,7 @@ def test_item_csv_manager_load_csv(): ItemCsvManager.load(SAMPLE_XLSX_FILE) -def test_item_manager_load_workbook(): +def test_item_manager_load(): assert ItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT @@ -239,3 +241,13 @@ def test_item_manager_load_workbook(): with pytest.raises(ValueError): ItemManager.load("something.else") + + +def test_load_items(): + + assert load_items(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + assert 
load_items(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + + with pytest.raises(ValueError): + load_items("something.else") From 807e525965b11f6506b944b7775b84bd2e640082 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 12:48:41 -0400 Subject: [PATCH 016/101] Experimental bug fix from Will to hopefully make get_schema_names work. --- dcicutils/ff_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index e50ececf8..37a0439db 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -961,7 +961,7 @@ def get_schema_names(key=None, ff_env=None): if value.get('isAbstract') is True: continue # some test schemas in local don't have the id field - schema_filename = value.get('id') + schema_filename = value.get('$id') if schema_filename: schema_name[key] = schema_filename.split('/')[-1][:-5] return schema_name From 2a8e81a420ee629ad0e20d25e0d98282d0d27cab Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 13:02:39 -0400 Subject: [PATCH 017/101] update changelog --- CHANGELOG.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f07b9a4c3..0eda8ff0c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -27,6 +27,8 @@ Change Log * Classes ``ItemXlsxManager`` and ``ItemCsvManager`` for loading Item-style data from ``.xlsx`` and ``.csv`` files, respectively. +* Contains a fix for a bug in ``ff_utils.get_schema_names`` (`C4-1086 `_). + 7.7.2 ===== From 718054adeef8c77b737a0ae3895c61db0fccce0f Mon Sep 17 00:00:00 2001 From: Kent M Pitman Date: Thu, 17 Aug 2023 17:23:51 -0400 Subject: [PATCH 018/101] Update dcicutils/sheet_utils.py Co-authored-by: drio18 <58236592+drio18@users.noreply.github.com> --- dcicutils/sheet_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 7a6959a47..4b3dae21c 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -172,7 +172,7 @@ class AbstractTableSetManager: } Note that at this level of abstraction, we take no position on what form of representation is used for the rows, as long as it is JSON data of some kind. It might be - {"col1": "val1", "col2", "val2", ...} + {"col1": "val1", "col2": "val2", ...} or it might be something more structured like {"something": "val1", {"something_else": ["val2"]}} Additionally, the values stored might be altered as well. In particular, the most likely alteration From 56d1459c735c22579786c45312fa2e0ccd33466c Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 17:56:56 -0400 Subject: [PATCH 019/101] Add some comments in response to Doug's code review. --- dcicutils/sheet_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 4b3dae21c..4060e9f0d 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -157,6 +157,11 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any cls.set_path_value(datum[key], more_path, value) +# TODO: Consider whether this might want to be an abstract base class. Some change might be needed. +# Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class. +# I am less certain but open to discussion. Among other things, as implemented now, +# the __init__ method here needs to run and the documentation says that ABC's won't appear +# in the method resolution order. 
-kmp 17-Aug-2023 class AbstractTableSetManager: """ The TableSetManager is the spanning class of anything that wants to be able to load a table set, @@ -184,6 +189,7 @@ def __init__(self, **kwargs): if kwargs: raise ValueError(f"Got unexpected keywords: {kwargs}") + # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.) @classmethod def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]: """ From 2facf9ef9ff31fdf1a712815641e52d9c26fc7a4 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 18:16:54 -0400 Subject: [PATCH 020/101] Support TSV files. --- dcicutils/sheet_utils.py | 22 +++++++++++++++++++ test/test_sheet_utils.py | 46 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 4060e9f0d..072e36e21 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -158,10 +158,12 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any # TODO: Consider whether this might want to be an abstract base class. Some change might be needed. +# # Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class. # I am less certain but open to discussion. Among other things, as implemented now, # the __init__ method here needs to run and the documentation says that ABC's won't appear # in the method resolution order. -kmp 17-Aug-2023 +# See also discussion at https://github.com/4dn-dcic/utils/pull/276#discussion_r1297775535 class AbstractTableSetManager: """ The TableSetManager is the spanning class of anything that wants to be able to load a table set, @@ -407,6 +409,7 @@ def _create_tab_processor_state(self, tabname: str) -> Headers: headers: Headers = self.headers_by_tabname.get(tabname) if headers is None: self.headers_by_tabname[tabname] = headers = self.reader_agent.__next__() + print(f"Headers={headers}") return headers def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: @@ -422,6 +425,23 @@ class ItemCsvManager(ItemManagerMixin, CsvManager): pass +class TsvManager(CsvManager): + """ + TSV files are just CSV files with tabs instead of commas as separators. + (We do not presently handle any escaping of strange characters. May need to add handling for backslash escaping.) + """ + @classmethod + def _get_csv_reader(cls, filename) -> CsvReader: + return csv.reader(open_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') + + +class ItemTsvManager(ItemManagerMixin, TsvManager): + """ + This layers item-style row processing functionality on a TSV file. + """ + pass + + class ItemManager(AbstractTableSetManager): """ This class will open a .xlsx or .csv file and load its content in our standard format. 
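As an illustrative aside before the next hunk: the TSV support added above boils down to handing the stock csv module a tab delimiter. A minimal standalone sketch of that behavior follows (the inline sample data is made up for illustration, not taken from the repository's test files):

import csv
import io

# Hypothetical illustration only: parse tab-separated text the same way TsvManager's
# underlying csv.reader(..., delimiter='\t') would, pairing each data row with the header row.
sample = io.StringIO("name\tage\nbill\t23\njoe\t9\n")
reader = csv.reader(sample, delimiter='\t')
headers = next(reader)                              # ['name', 'age']
rows = [dict(zip(headers, row)) for row in reader]
assert rows == [{'name': 'bill', 'age': '23'}, {'name': 'joe', 'age': '9'}]

Note that the values remain strings in this sketch; the manager classes additionally coerce cell values via parse_cell_value (prefer_number for raw tables, ItemTools.parse_item_value for Item-style loading).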
@@ -436,6 +456,8 @@ def create_implementation_manager(cls, filename: str, tab_name=None) -> BasicTab reader_agent = ItemXlsxManager(filename) elif filename.endswith(".csv"): reader_agent = ItemCsvManager(filename, tab_name=tab_name) + elif filename.endswith(".tsv"): + reader_agent = ItemTsvManager(filename, tab_name=tab_name) else: raise ValueError(f"Unknown file type: {filename}") return reader_agent diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index b98c56fa5..1915b3a71 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -2,7 +2,11 @@ import pytest from dcicutils.sheet_utils import ( - ItemTools, XlsxManager, ItemXlsxManager, CsvManager, ItemCsvManager, ItemManager, load_items, + # High-level interfaces + ItemManager, load_items, + # Low-level implementation + ItemTools, XlsxManager, ItemXlsxManager, + CsvManager, ItemCsvManager, TsvManager, ItemTsvManager, ) from .conftest_settings import TEST_DIR @@ -164,6 +168,12 @@ def test_item_tools_set_path_value(): SAMPLE_CSV_FILE_ITEM_CONTENT = {ItemCsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} +SAMPLE_TSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.tsv') + +SAMPLE_TSV_FILE_RAW_CONTENT = {TsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} + +SAMPLE_TSV_FILE_ITEM_CONTENT = {ItemTsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} + def test_xlsx_manager_load_content(): @@ -233,6 +243,40 @@ def test_item_csv_manager_load_csv(): ItemCsvManager.load(SAMPLE_XLSX_FILE) +def test_tsv_manager_load_content(): + + wt = TsvManager(SAMPLE_TSV_FILE) + assert wt.load_content() == SAMPLE_TSV_FILE_RAW_CONTENT + + +def test_tsv_manager_load(): + + assert TsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_RAW_CONTENT + + +def test_tsv_manager_load_csv(): + + with pytest.raises(Exception): + TsvManager.load(SAMPLE_XLSX_FILE) + + +def test_item_tsv_manager_load_content(): + + it = ItemTsvManager(SAMPLE_TSV_FILE) + assert it.load_content() == SAMPLE_TSV_FILE_ITEM_CONTENT + + +def test_item_tsv_manager_load(): + + assert ItemTsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT + + +def test_item_tsv_manager_load_csv(): + + with pytest.raises(Exception): + ItemTsvManager.load(SAMPLE_XLSX_FILE) + + def test_item_manager_load(): assert ItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT From bcc4e636c3bea64a7cc792a949f2c4a5897066b9 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 18:19:01 -0400 Subject: [PATCH 021/101] Add changelog info about tsv files. --- CHANGELOG.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b57796c8a..2796e8def 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -15,17 +15,17 @@ Change Log * Important things of interest: * Class ``ItemManager`` for loading Item-style data - from either ``.xlsx`` or ``.csv`` files. + from any ``.xlsx``, ``.csv`` or ``.tsv`` files. * Function ``load_items`` that does the same as ``ItemManager.load``. * Various low-level implementation classes such as: - * Classes ``XlsxManager`` and ``CsvManager`` for loading raw data - from ``.xlsx`` and ``.csv`` files, respectively. + * Classes ``XlsxManager``, ``CsvManager`` and ``TsvManager`` for loading raw data + from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. - * Classes ``ItemXlsxManager`` and ``ItemCsvManager`` for loading Item-style data - from ``.xlsx`` and ``.csv`` files, respectively. 
+ * Classes ``ItemXlsxManager``, ``ItemCsvManager``, and ``ItemTsvManager`` for loading Item-style data + from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. * Contains a fix for a bug in ``ff_utils.get_schema_names`` (`C4-1086 `_). From 9de282e5e5475b66b36b2cabe3260ee521df9077 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 18:22:11 -0400 Subject: [PATCH 022/101] Add a missing data file. --- test/data_files/sample_items_sheet2.tsv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 test/data_files/sample_items_sheet2.tsv diff --git a/test/data_files/sample_items_sheet2.tsv b/test/data_files/sample_items_sheet2.tsv new file mode 100644 index 000000000..d2c9e0e47 --- /dev/null +++ b/test/data_files/sample_items_sheet2.tsv @@ -0,0 +1,3 @@ +name age mother.name mother.age father.name father.age friends#0.name friends#0.age friends#1.name friends#1.age +bill 23 mary 58 fred 63 sam 22 arthur 19 +joe 9 estrella 35 anthony 34 anders 9 \ No newline at end of file From 8d6495f5acc2b5e2d48f6991f601d8b3c6439245 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 23 Aug 2023 16:43:55 -0400 Subject: [PATCH 023/101] First stable cut at schema hinting. Doesn't find schemas automatically yet, though. --- dcicutils/misc_utils.py | 9 + dcicutils/sheet_utils.py | 207 ++++++++++++++++++---- test/data_files/sample_items2.csv | 5 + test/data_files/sample_items3.csv | 5 + test/test_misc_utils.py | 29 +++- test/test_sheet_utils.py | 274 +++++++++++++++++++++++++++++- 6 files changed, 495 insertions(+), 34 deletions(-) create mode 100644 test/data_files/sample_items2.csv create mode 100644 test/data_files/sample_items3.csv diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index fd0747d43..393b33435 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -1339,6 +1339,15 @@ def capitalize1(s): return s[:1].upper() + s[1:] +uuid_re = re.compile(r'(?i)[{]?(?:[0-9a-f]{4}-?){8}[}]?') + + +def is_uuid(instance): + # Python's UUID ignores all dashes, whereas Postgres is more strict + # http://www.postgresql.org/docs/9.2/static/datatype-uuid.html + return bool(uuid_re.match(instance)) + + def string_list(s): """ Turns a comma-separated list into an actual list, trimming whitespace and ignoring nulls. 
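A quick sketch, separate from the patch itself, of how the new ``is_uuid`` predicate behaves given the regex added above: it accepts 32 hex digits with optional hyphens at four-character boundaries and optional surrounding braces, and rejects hyphens placed off those boundaries (these mirror the assertions in the accompanying tests).

from dcicutils.misc_utils import is_uuid

assert is_uuid("123456781234abcd1234567812345678")                # plain 32 hex digits
assert is_uuid("1234-5678-abcd-5678-1234-ABCD-1234-5678")         # hyphens allowed every four characters
assert is_uuid("{12345678abcd56781234ABCD12345678}")              # surrounding braces are tolerated
assert not is_uuid("12-3456781234abcd1234567812345678")           # hyphen off a four-character boundary is rejected
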
diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 072e36e21..b1ed8d604 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -3,13 +3,15 @@ import csv import io import openpyxl +import uuid from dcicutils.common import AnyJsonData +from dcicutils.lang_utils import conjoined_list, maybe_pluralize from dcicutils.misc_utils import ignored from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook from tempfile import TemporaryFile -from typing import Any, Dict, Iterable, List, Union +from typing import Any, Dict, Iterable, List, Optional, Union Header = str @@ -21,6 +23,17 @@ CsvReader = type(csv.reader(TemporaryFile())) +def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): + if kwargs: + unwanted = [f"{argname}={value!r}" if detailed else argname + for argname, value in kwargs.items() + if value is not None] + if unwanted: + does_not = "don't" if context_plural else "doesn't" + raise ValueError(f"{context} {does_not} use" + f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.") + + def prefer_number(value: SheetCellValue): if isinstance(value, str): # the given value might be an int or float, in which case just fall through if not value: @@ -53,6 +66,57 @@ def open_text_input_file_respecting_byte_order_mark(filename): return io.open(filename, 'r', encoding=detected_encoding) +class TypeHint: + def apply_hint(self, value): + return value + + def __str__(self): + return f"<{self.__class__.__name__}>" + + def __repr__(self): + return self.__str__() + + +class BoolHint(TypeHint): + + def apply_hint(self, value): + if isinstance(value, str) and value: + if 'true'.startswith(value.lower()): + return True + elif 'false'.startswith(value.lower()): + return False + return super().apply_hint(value) + + +class EnumHint(TypeHint): + + def __str__(self): + return f"" + + def __init__(self, value_map): + self.value_map = value_map + + def apply_hint(self, value): + if isinstance(value, str): + if value in self.value_map: + result = self.value_map[value] + return result + else: + lvalue = value.lower() + found = [] + for lkey, key in self.value_map.items(): + if lkey.startswith(lvalue): + found.append(lkey) + if len(found) == 1: + [only_found] = found + result = self.value_map[only_found] + return result + return super().apply_hint(value) + + +OptionalTypeHints = List[Optional[TypeHint]] + + class ItemTools: """ Implements operations on table-related data without pre-supposing the specific representation of the table. 
@@ -128,8 +192,10 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: Parsed cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys) return parent + INSTAGUIDS_ENABLED = False # Experimental feature not enabled by default + @classmethod - def parse_item_value(cls, value: SheetCellValue) -> AnyJsonData: + def parse_item_value(cls, value: SheetCellValue, context=None) -> AnyJsonData: if isinstance(value, str): lvalue = value.lower() # TODO: We could consult a schema to make this less heuristic, but this may do for now @@ -140,12 +206,30 @@ def parse_item_value(cls, value: SheetCellValue) -> AnyJsonData: elif lvalue == 'null' or lvalue == '': return None elif '|' in value: - return [cls.parse_item_value(subvalue) for subvalue in value.split('|')] + if value == '|': # Use '|' for [] + return [] + else: + if value.endswith("|"): # Use 'foo|' for ['foo'] + value = value[:-1] + return [cls.parse_item_value(subvalue, context=context) for subvalue in value.split('|')] + elif cls.INSTAGUIDS_ENABLED and context is not None and value.startswith('#'): + # Note that this clause MUST follow '|' clause above so '#foo|#bar' isn't seen as instaguid + return cls.get_instaguid(value, context=context) else: return prefer_number(value) else: # presumably a number (int or float) return value + @classmethod + def get_instaguid(cls, guid_placeholder: str, *, context: Optional[Dict] = None): + if context is None: + return guid_placeholder + else: + referent = context.get(guid_placeholder) + if not referent: + context[guid_placeholder] = referent = str(uuid.uuid4()) + return referent + @classmethod def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): if (value is None or value == '') and not force: @@ -156,6 +240,36 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any else: cls.set_path_value(datum[key], more_path, value) + @classmethod + def find_type_hint(cls, parsed_header: ParsedHeader, schema: Any): + + def finder(subheader, subschema): + if not parsed_header: + return None + else: + [key1, *other_headers] = subheader + if isinstance(key1, str) and isinstance(subschema, dict): + if subschema.get('type') == 'object': + def1 = subschema.get('properties', {}).get(key1) + if not other_headers: + if def1 is not None: + t = def1.get('type') + if t == 'string': + enum = def1.get('enum') + if enum: + mapping = {e.lower(): e for e in enum} + return EnumHint(mapping) + elif t == 'boolean': + return BoolHint() + else: + pass # fall through to asking super() + else: + pass # fall through to asking super() + else: + return finder(subheader=other_headers, subschema=def1) + + return finder(subheader=parsed_header, subschema=schema) + # TODO: Consider whether this might want to be an abstract base class. Some change might be needed. # @@ -188,8 +302,7 @@ class AbstractTableSetManager: """ def __init__(self, **kwargs): - if kwargs: - raise ValueError(f"Got unexpected keywords: {kwargs}") + unwanted_kwargs(context=self.__class__.__name__, kwargs=kwargs) # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.) 
@classmethod @@ -247,8 +360,8 @@ def load(cls, filename: str) -> AnyJsonData: table_set_manager: TableSetManager = cls(filename) return table_set_manager.load_content() - def __init__(self, filename: str): - super().__init__(filename=filename) + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) @property def tabnames(self) -> List[str]: @@ -335,23 +448,36 @@ class ItemManagerMixin(BasicTableSetManager): get handled like Items instead of just flat table rows. """ - def __init__(self, filename: str, **kwargs): + def __init__(self, filename: str, schemas=None, **kwargs): super().__init__(filename=filename, **kwargs) self.patch_prototypes_by_tabname: Dict[str, Dict] = {} - self.parsed_headers_by_tabname: Dict[str, List[List[Union[int, str]]]] = {} + self.parsed_headers_by_tabname: Dict[str, ParsedHeaders] = {} + self.type_hints_by_tabname: Dict[str, OptionalTypeHints] = {} + self.schemas = schemas or {} + self._instaguid_context_table: Dict[str, str] = {} def sheet_patch_prototype(self, tabname: str) -> Dict: - return self.patch_prototypes_by_tabname[tabname] + result = self.patch_prototypes_by_tabname[tabname] + return result - def sheet_parsed_headers(self, tabname: str) -> List[List[Union[int, str]]]: + def sheet_parsed_headers(self, tabname: str) -> ParsedHeaders: return self.parsed_headers_by_tabname[tabname] - def _create_tab_processor_state(self, tabname: str) -> ParsedHeaders: - super()._create_tab_processor_state(tabname) - # This will create state that allows us to efficiently assign values in the right place on each row - # by setting up a prototype we can copy and then drop values into. - self._compile_sheet_headers(tabname) - return self.sheet_parsed_headers(tabname) + def sheet_type_hints(self, tabname: str) -> OptionalTypeHints: + return self.type_hints_by_tabname[tabname] + + class SheetState: + + def __init__(self, parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints): + self.parsed_headers = parsed_headers + self.type_hints = type_hints + + def _compile_type_hints(self, tabname: str): + parsed_headers = self.sheet_parsed_headers(tabname) + schema = self.schemas.get(tabname) + type_hints = [ItemTools.find_type_hint(parsed_header, schema) if schema else None + for parsed_header in parsed_headers] + self.type_hints_by_tabname[tabname] = type_hints def _compile_sheet_headers(self, tabname: str): headers = self.headers_by_tabname[tabname] @@ -360,16 +486,29 @@ def _compile_sheet_headers(self, tabname: str): prototype = ItemTools.compute_patch_prototype(parsed_headers) self.patch_prototypes_by_tabname[tabname] = prototype - def _process_row(self, tabname: str, parsed_headers: ParsedHeaders, row_data: SheetRow) -> AnyJsonData: + def _create_tab_processor_state(self, tabname: str) -> SheetState: + super()._create_tab_processor_state(tabname) + # This will create state that allows us to efficiently assign values in the right place on each row + # by setting up a prototype we can copy and then drop values into. 
+ self._compile_sheet_headers(tabname) + self._compile_type_hints(tabname) + return self.SheetState(parsed_headers=self.sheet_parsed_headers(tabname), + type_hints=self.sheet_type_hints(tabname)) + + def _process_row(self, tabname: str, state: SheetState, row_data: SheetRow) -> AnyJsonData: + parsed_headers = state.parsed_headers + type_hints = state.type_hints patch_item = copy.deepcopy(self.sheet_patch_prototype(tabname)) for i, value in enumerate(row_data): parsed_value = self.parse_cell_value(value) + type_hint = type_hints[i] + if type_hint: + parsed_value = type_hint.apply_hint(parsed_value) ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value) return patch_item - @classmethod - def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: - return ItemTools.parse_item_value(value) + def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: + return ItemTools.parse_item_value(value, context=self._instaguid_context_table) class ItemXlsxManager(ItemManagerMixin, XlsxManager): @@ -387,9 +526,10 @@ class CsvManager(TableSetManager): DEFAULT_TAB_NAME = 'Sheet1' - def __init__(self, filename: str, tab_name=None): - super().__init__(filename=filename) + def __init__(self, filename: str, tab_name: Optional[str] = None, escaping: bool = False, **kwargs): + super().__init__(filename=filename, **kwargs) self.tab_name = tab_name or self.DEFAULT_TAB_NAME + self.escaping = escaping @property def tabnames(self) -> List[str]: @@ -409,7 +549,6 @@ def _create_tab_processor_state(self, tabname: str) -> Headers: headers: Headers = self.headers_by_tabname.get(tabname) if headers is None: self.headers_by_tabname[tabname] = headers = self.reader_agent.__next__() - print(f"Headers={headers}") return headers def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: @@ -449,22 +588,26 @@ class ItemManager(AbstractTableSetManager): """ @classmethod - def create_implementation_manager(cls, filename: str, tab_name=None) -> BasicTableSetManager: + def create_implementation_manager(cls, filename: str, escaping=None, **kwargs) -> BasicTableSetManager: if filename.endswith(".xlsx"): - if tab_name is not None: - raise ValueError(f".xlsx files don't need tab_name={tab_name!r}") - reader_agent = ItemXlsxManager(filename) + # unwanted_kwargs(context="ItemManager for .xlsx files", kwargs=kwargs) + reader_agent = ItemXlsxManager(filename, escaping=escaping, **kwargs) elif filename.endswith(".csv"): - reader_agent = ItemCsvManager(filename, tab_name=tab_name) + tab_name = kwargs.pop('tab_name', None) + # unwanted_kwargs(context="ItemManager for .csv files", kwargs=kwargs) + reader_agent = ItemCsvManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) elif filename.endswith(".tsv"): - reader_agent = ItemTsvManager(filename, tab_name=tab_name) + tab_name = kwargs.pop('tab_name', None) + # unwanted_kwargs(context="ItemManager for .tsv files", kwargs=kwargs) + reader_agent = ItemTsvManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) else: raise ValueError(f"Unknown file type: {filename}") return reader_agent @classmethod - def load(cls, filename: str, tab_name=None) -> AnyJsonData: - manager = cls.create_implementation_manager(filename, tab_name=tab_name) + def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, + schemas: Optional[Dict] = None) -> AnyJsonData: + manager = cls.create_implementation_manager(filename, tab_name=tab_name, escaping=escaping, schemas=schemas) return manager.load_content() diff 
--git a/test/data_files/sample_items2.csv b/test/data_files/sample_items2.csv new file mode 100644 index 000000000..2e32bf426 --- /dev/null +++ b/test/data_files/sample_items2.csv @@ -0,0 +1,5 @@ +name,sex,member +john,M,false +juan,male,true +igor,unknown, +mary,Female,t diff --git a/test/data_files/sample_items3.csv b/test/data_files/sample_items3.csv new file mode 100644 index 000000000..ee2d61b61 --- /dev/null +++ b/test/data_files/sample_items3.csv @@ -0,0 +1,5 @@ +name,sex,uuid,father,mother,parents,children +John,Male,#john,#igor,#mary,, +Juan,Male,#juan,,,#igor|#mary, +Igor,Male,#igor,,,,#john| +Mary,Female,#mary,,,,#john| diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index 895a25757..6a9f266f7 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -17,7 +17,7 @@ from dcicutils.misc_utils import ( PRINT, ignored, ignorable, filtered_warnings, get_setting_from_context, TestApp, VirtualApp, VirtualAppError, _VirtualAppHelper, # noqa - yes, this is a protected member, but we still want to test it - Retry, apply_dict_overrides, utc_today_str, RateManager, environ_bool, str_to_bool, + Retry, apply_dict_overrides, utc_today_str, RateManager, environ_bool, str_to_bool, is_uuid, LockoutManager, check_true, remove_prefix, remove_suffix, full_class_name, full_object_name, constantly, keyword_as_title, file_contents, CachedField, camel_case_to_snake_case, snake_case_to_camel_case, make_counter, CustomizableProperty, UncustomizedInstance, getattr_customized, copy_json, url_path_join, @@ -1990,6 +1990,33 @@ def test_capitalize1(token, expected): assert capitalize1(token) == expected +def test_is_uuid(): + + good_uuid = str(uuid.uuid4()) + bad_uuid = '123-456-789' + + assert not is_uuid("12345678abcd678123456781234") # wrong length. 
expecting 32 digits + assert not is_uuid("12-3456781234abcd1234567812345678") # hyphens only allowed at multiple of four boundaries + assert not is_uuid("12-3456781234abcd1234567-812345678") # ditto + + assert is_uuid("123456781234abcd1234567812345678") + assert is_uuid("12345678abcd56781234ABCD12345678") + assert is_uuid("1234-5678abcd56781234ABCD12345678") + assert is_uuid("12345678abcd-56781234ABCD1234-5678") + assert is_uuid("1234-5678-abcd56781234ABCD-12345678") + assert is_uuid("1234-5678-abcd-56781234ABCD12345678") + assert is_uuid("1234-5678-abcd-5678-1234-ABCD-1234-5678") + assert is_uuid("1234-5678-abcd-5678-1234-ABCD-1234-5678-") # we don't really want this, but we tolerate it + + assert is_uuid("{12345678abcd56781234ABCD12345678}") # braces are optionally allowed + assert is_uuid("{1234-5678-abcd5678-1234-ABCD-1234-5678}") # ditto + assert is_uuid("1234-5678-abcd5678-1234-ABCD-1234-5678}") # ditto + assert is_uuid("{1234-5678-abcd5678-1234-ABCD-1234-5678-}") # balanced braces trailing hyphen tolerated + + assert is_uuid(good_uuid) is True + assert is_uuid(bad_uuid) is False + + def test_string_list(): assert string_list('') == [] diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 1915b3a71..83cf76bb4 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -1,16 +1,71 @@ +import json import os import pytest +from collections import namedtuple +from dcicutils.common import AnyJsonData +from dcicutils.misc_utils import is_uuid, local_attrs from dcicutils.sheet_utils import ( # High-level interfaces ItemManager, load_items, # Low-level implementation ItemTools, XlsxManager, ItemXlsxManager, CsvManager, ItemCsvManager, TsvManager, ItemTsvManager, + # TypeHint, EnumHint, + BoolHint, + # Utilities + prefer_number, unwanted_kwargs, ) +from typing import Dict from .conftest_settings import TEST_DIR +def test_prefer_number(): + + assert prefer_number('') is None + assert prefer_number('123') == 123 + assert prefer_number('3.14') == 3.14 + assert prefer_number('abc') == 'abc' + assert prefer_number('123i') == '123i' + assert prefer_number('123e') == '123e' + assert prefer_number('123e0') == 123.0 + assert prefer_number('123e1') == 1230.0 + assert prefer_number('123e+1') == 1230.0 + assert prefer_number('123e-1') == 12.3 + + +def test_unwanted_kwargs_without_error(): + unwanted_kwargs(context="Function foo", kwargs={}) + unwanted_kwargs(context="Function foo", kwargs={}, context_plural=True, detailed=True) + + +tst_args = "context,context_plural,detailed,kwargs,message" + +TstArgs = namedtuple("TstArgs1", tst_args, defaults=(None,) * len(tst_args.split(','))) + + +@pytest.mark.parametrize(tst_args, [ + TstArgs(context="Function foo", context_plural=False, detailed=False, kwargs={'a': 1}, + message="Function foo doesn't use keyword argument a."), + TstArgs(context="Function foo", context_plural=False, detailed=False, kwargs={'a': 1, 'b': 2}, + message="Function foo doesn't use keyword arguments a and b."), + TstArgs(context="Functions like foo", context_plural=True, detailed=False, kwargs={'a': 1}, + message="Functions like foo don't use keyword argument a."), + TstArgs(context="Functions like foo", context_plural=True, detailed=False, kwargs={'a': 1, 'b': 2}, + message="Functions like foo don't use keyword arguments a and b."), + # Don't need to do all the cases again + TstArgs(context="Function foo", kwargs={'a': 1, 'b': 2}, + message="Function foo doesn't use keyword arguments a and b."), # noQA - PyCharm can't see defaults + TstArgs(context="Function 
foo", detailed=True, kwargs={'a': 1, 'b': 2}, + message="Function foo doesn't use keyword arguments a=1 and b=2."), # noQA PyCharm can't see defaults +]) +def test_unwanted_kwargs_with_error(context, context_plural, detailed, kwargs, message): + + with pytest.raises(ValueError) as exc: + unwanted_kwargs(context=context, kwargs=kwargs, context_plural=context_plural, detailed=detailed) + assert str(exc.value) == message + + def test_item_tools_parse_sheet_header(): assert ItemTools.parse_sheet_header('.a') == ['a'] assert ItemTools.parse_sheet_header('a') == ['a'] @@ -58,7 +113,7 @@ def test_item_tools_compute_patch_prototype_errors(headers): assert str(exc.value) == "A header cannot begin with a numeric ref: 0" -def test_item_tools_parse_item_value(): +def test_item_tools_parse_item_value_basic(): for x in [37, 19.3, True, False, None, 'simple text']: assert ItemTools.parse_item_value(x) == x @@ -89,10 +144,50 @@ def test_item_tools_parse_item_value(): assert ItemTools.parse_item_value('False') is False assert ItemTools.parse_item_value('FALSE') is False + assert ItemTools.parse_item_value('|') == [] # special case: lone '|' means empty + assert ItemTools.parse_item_value('alpha|') == ['alpha'] # special case: trailing '|' means singleton + assert ItemTools.parse_item_value('|alpha|') == [None, 'alpha'] + assert ItemTools.parse_item_value('|alpha') == [None, 'alpha'] assert ItemTools.parse_item_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] assert ItemTools.parse_item_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] +@pytest.mark.parametrize('instaguids_enabled', [True, False]) +def test_item_tools_parse_item_value_guids(instaguids_enabled): + + with local_attrs(ItemTools, INSTAGUIDS_ENABLED=instaguids_enabled): + + sample_simple_field_input= "#foo" + + parsed = ItemTools.parse_item_value(sample_simple_field_input) + assert parsed == sample_simple_field_input + + context = {} + parsed = ItemTools.parse_item_value(sample_simple_field_input, context=context) + if instaguids_enabled: + assert is_uuid(parsed) + assert parsed == context[sample_simple_field_input] + else: + assert parsed == sample_simple_field_input + assert context == {} + + sample_compound_field_input = '#foo|#bar' + sample_compound_field_list = ['#foo', '#bar'] + + parsed = ItemTools.parse_item_value(sample_compound_field_input) + assert parsed == sample_compound_field_list + + context = {} + parsed = ItemTools.parse_item_value(sample_compound_field_input, context=context) + assert isinstance(parsed, list) + if instaguids_enabled: + assert all(is_uuid(x) for x in parsed) + assert '#foo' in context and '#bar' in context + else: + assert parsed == sample_compound_field_list + assert context == {} + + def test_item_tools_set_path_value(): x = {'foo': 1, 'bar': 2} @@ -108,6 +203,49 @@ def test_item_tools_set_path_value(): assert x == {'foo': [11, 22, 33], 'bar': {'x': 'something', 'y': 'yy'}} +def test_item_tools_find_type_hint(): + + assert ItemTools.find_type_hint(['foo', 'bar'], None) is None + assert ItemTools.find_type_hint(['foo', 'bar'], "something") is None + assert ItemTools.find_type_hint(['foo', 'bar'], {}) is None + + actual = ItemTools.find_type_hint(['foo', 'bar'], {"type": "object"}) + assert actual is None + + schema = { + "type": "object", + "properties": { + "foo": { + "type": "boolean" + } + } + } + actual = ItemTools.find_type_hint(['foo', 'bar'], schema) + assert actual is None + + actual = ItemTools.find_type_hint(['foo'], schema) + assert isinstance(actual, 
BoolHint) + + schema = { + "type": "object", + "properties": { + "foo": { + "type": "object", + "properties": { + "bar": { + "type": "boolean" + } + } + } + } + } + actual = ItemTools.find_type_hint(['foo', 'bar'], schema) + assert isinstance(actual, BoolHint) + + actual = ItemTools.find_type_hint(['foo'], schema) + assert actual is None + + SAMPLE_XLSX_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx') SAMPLE_XLSX_FILE_RAW_CONTENT = { @@ -295,3 +433,137 @@ def test_load_items(): with pytest.raises(ValueError): load_items("something.else") + + +SAMPLE_CSV_FILE2_SCHEMAS = { + "Person": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "sex": {"type": "string", "enum": ["Male", "Female"]}, + "member": {"type": "boolean"} + } + } +} + +SAMPLE_CSV_FILE2_CONTENT = { + CsvManager.DEFAULT_TAB_NAME: [ + {"name": "john", "sex": "M", "member": "false"}, + {"name": "juan", "sex": "male", "member": "true"}, + {"name": "igor", "sex": "unknown", "member": None}, + {"name": "mary", "sex": "Female", "member": "t"} + ] +} + +SAMPLE_CSV_FILE2_ITEM_CONTENT = { + ItemCsvManager.DEFAULT_TAB_NAME: [ + {"name": "john", "sex": "M", "member": False}, + {"name": "juan", "sex": "male", "member": True}, + {"name": "igor", "sex": "unknown", "member": None}, + {"name": "mary", "sex": "Female", "member": "t"} + ] +} + +SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED = { + "Person": [ + {"name": "john", "sex": "Male", "member": False}, + {"name": "juan", "sex": "Male", "member": True}, + {"name": "igor", "sex": "unknown", "member": None}, + {"name": "mary", "sex": "Female", "member": True} + ] +} + +SAMPLE_CSV_FILE2 = os.path.join(TEST_DIR, 'data_files/sample_items2.csv') + +SAMPLE_CSV_FILE3_SCHEMAS = { + "Person": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "sex": {"type": "string", "enum": ["Male", "Female"]}, + "children": {"type": "array", "items": {"type": "string"}}, + "parents": {"type": "array", "items": {"type": "string"}}, + "mother": {"type": "string"}, + "father": {"type": "string"}, + } + } +} + +SAMPLE_CSV_FILE3_PERSON_CONTENT_HINTED = { + "Person": [ + {"name": "John", "uuid": "#john", "sex": "Male", "father": "#igor", "mother": "#mary", "parents": None, "children": None}, + {"name": "Juan", "uuid": "#juan", "sex": "Male", "father": None, "mother": None, "parents": ["#igor", "#mary"], "children": None}, + {"name": "Igor", "uuid": "#igor", "sex": "Male", "father": None, "mother": None, "parents": None, "children": ["#john"]}, + {"name": "Mary", "uuid": "#mary", "sex": "Female", "father": None, "mother": None, "parents": None, "children": ["#john"]} + ] +} + +SAMPLE_CSV_FILE3 = os.path.join(TEST_DIR, 'data_files/sample_items3.csv') + + +def matches_template(json1: AnyJsonData, json2: AnyJsonData, *, previous_matches: Dict[str, str] = None) -> bool: + if previous_matches is None: + previous_matches = {} + if isinstance(json1, dict) and isinstance(json2, dict): + keys1 = set(json1.keys()) + keys2 = set(json2.keys()) + if keys1 != keys2: + print(f"Keys don't match: {keys1} vs {keys2}") + return False + return all(matches_template(json1[key], json2[key], previous_matches=previous_matches) for key in keys1) + elif isinstance(json1, list) and isinstance(json2, list): + n1 = len(json1) + n2 = len(json2) + if n1 != n2: + print(f"Length doesn't match: {n1} vs {n2}") + return False + return all(matches_template(json1[i], json2[i], previous_matches=previous_matches) for i in range(n1)) + elif isinstance(json1, str) and isinstance(json2, str) and is_uuid(json1) and 
json2.startswith("#"): + previously_matched = previous_matches.get(json2) + if previously_matched: + result = json1 == previously_matched + if not result: + print(f"Instaguid mismatch: {json1} vs {json2}") + return result + else: + # Remember the match + previous_matches[json2] = json1 + return True + else: # any other atomic items can be just directly compared + result = json1 == json2 + if not result: + print(f"Unequal: {json1} vs {json2}") + return result + + +def test_load_items_with_schema(): + + print("Case 1") + expected = SAMPLE_CSV_FILE2_CONTENT + actual = CsvManager.load(SAMPLE_CSV_FILE2) + assert actual == expected + + print("Case 2") + expected = SAMPLE_CSV_FILE2_ITEM_CONTENT + actual = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS) + assert actual == expected + + print("Case 3") + expected = SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED + actual = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') + assert actual == expected + + +@pytest.mark.parametrize('instaguids_enabled', [True, False]) +def test_load_items_with_schema_and_instaguids(instaguids_enabled): + + with local_attrs(ItemTools, INSTAGUIDS_ENABLED=instaguids_enabled): + + expected = SAMPLE_CSV_FILE3_PERSON_CONTENT_HINTED + print("expected=", json.dumps(expected, indent=2)) + actual = load_items(SAMPLE_CSV_FILE3, schemas=SAMPLE_CSV_FILE3_SCHEMAS, tab_name='Person') + print("actual=", json.dumps(actual, indent=2)) + if instaguids_enabled: + assert matches_template(actual, expected) + else: + assert actual == expected # no substitution performed From 56f702aaa381fe96f456f2a8e5558c0f41d40027 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 23 Aug 2023 22:04:34 -0400 Subject: [PATCH 024/101] Mark chardet as an acceptable license for use. --- dcicutils/license_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index 855fa5c80..db18fd7df 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -810,6 +810,12 @@ class C4InfrastructureLicenseChecker(LicenseChecker): 'pytest-timeout', # MIT Licensed ], + # Linking = With Restrictions, Private Use = Yes + # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + 'GNU Lesser General Public License v2 or later (LGPLv2+)': [ + 'chardet' # used at runtime during server operation (ingestion), but not modified or distributed + ], + # Linking = With Restrictions, Private Use = Yes # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses 'GNU Lesser General Public License v3 or later (LGPLv3+)': [ From 60ada3fb1f8c24fec7aa89eae0ee7640fa9b555c Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 23 Aug 2023 22:32:52 -0400 Subject: [PATCH 025/101] Backport some small fixes and cosmetics from the schemas branch. 
--- dcicutils/sheet_utils.py | 50 +++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 072e36e21..f98b5f755 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -5,6 +5,7 @@ import openpyxl from dcicutils.common import AnyJsonData +from dcicutils.lang_utils import conjoined_list, maybe_pluralize from dcicutils.misc_utils import ignored from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook @@ -21,6 +22,17 @@ CsvReader = type(csv.reader(TemporaryFile())) +def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): + if kwargs: + unwanted = [f"{argname}={value!r}" if detailed else argname + for argname, value in kwargs.items() + if value is not None] + if unwanted: + does_not = "don't" if context_plural else "doesn't" + raise ValueError(f"{context} {does_not} use" + f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.") + + def prefer_number(value: SheetCellValue): if isinstance(value, str): # the given value might be an int or float, in which case just fall through if not value: @@ -140,7 +152,12 @@ def parse_item_value(cls, value: SheetCellValue) -> AnyJsonData: elif lvalue == 'null' or lvalue == '': return None elif '|' in value: - return [cls.parse_item_value(subvalue) for subvalue in value.split('|')] + if value == '|': # Use '|' for [] + return [] + else: + if value.endswith("|"): # Use 'foo|' for ['foo'] + value = value[:-1] + return [cls.parse_item_value(subvalue) for subvalue in value.split('|')] else: return prefer_number(value) else: # presumably a number (int or float) @@ -188,8 +205,7 @@ class AbstractTableSetManager: """ def __init__(self, **kwargs): - if kwargs: - raise ValueError(f"Got unexpected keywords: {kwargs}") + unwanted_kwargs(context=self.__class__.__name__, kwargs=kwargs) # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.) 
@classmethod @@ -247,8 +263,8 @@ def load(cls, filename: str) -> AnyJsonData: table_set_manager: TableSetManager = cls(filename) return table_set_manager.load_content() - def __init__(self, filename: str): - super().__init__(filename=filename) + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) @property def tabnames(self) -> List[str]: @@ -338,12 +354,12 @@ class ItemManagerMixin(BasicTableSetManager): def __init__(self, filename: str, **kwargs): super().__init__(filename=filename, **kwargs) self.patch_prototypes_by_tabname: Dict[str, Dict] = {} - self.parsed_headers_by_tabname: Dict[str, List[List[Union[int, str]]]] = {} + self.parsed_headers_by_tabname: Dict[str, ParsedHeaders] = {} def sheet_patch_prototype(self, tabname: str) -> Dict: return self.patch_prototypes_by_tabname[tabname] - def sheet_parsed_headers(self, tabname: str) -> List[List[Union[int, str]]]: + def sheet_parsed_headers(self, tabname: str) -> ParsedHeaders: return self.parsed_headers_by_tabname[tabname] def _create_tab_processor_state(self, tabname: str) -> ParsedHeaders: @@ -387,8 +403,8 @@ class CsvManager(TableSetManager): DEFAULT_TAB_NAME = 'Sheet1' - def __init__(self, filename: str, tab_name=None): - super().__init__(filename=filename) + def __init__(self, filename: str, tab_name=None, **kwargs): + super().__init__(filename=filename, **kwargs) self.tab_name = tab_name or self.DEFAULT_TAB_NAME @property @@ -409,7 +425,6 @@ def _create_tab_processor_state(self, tabname: str) -> Headers: headers: Headers = self.headers_by_tabname.get(tabname) if headers is None: self.headers_by_tabname[tabname] = headers = self.reader_agent.__next__() - print(f"Headers={headers}") return headers def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: @@ -449,15 +464,18 @@ class ItemManager(AbstractTableSetManager): """ @classmethod - def create_implementation_manager(cls, filename: str, tab_name=None) -> BasicTableSetManager: + def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager: if filename.endswith(".xlsx"): - if tab_name is not None: - raise ValueError(f".xlsx files don't need tab_name={tab_name!r}") - reader_agent = ItemXlsxManager(filename) + # unwanted_kwargs(context="ItemManager for .xlsx files", kwargs=kwargs) + reader_agent = ItemXlsxManager(filename, **kwargs) elif filename.endswith(".csv"): - reader_agent = ItemCsvManager(filename, tab_name=tab_name) + tab_name = kwargs.pop('tab_name', None) + # unwanted_kwargs(context="ItemManager for .csv files", kwargs=kwargs) + reader_agent = ItemCsvManager(filename, tab_name=tab_name, **kwargs) elif filename.endswith(".tsv"): - reader_agent = ItemTsvManager(filename, tab_name=tab_name) + tab_name = kwargs.pop('tab_name', None) + # unwanted_kwargs(context="ItemManager for .tsv files", kwargs=kwargs) + reader_agent = ItemTsvManager(filename, tab_name=tab_name, **kwargs) else: raise ValueError(f"Unknown file type: {filename}") return reader_agent From 690a833efa3576d92660bef400793ea0087c56f5 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 23 Aug 2023 22:33:40 -0400 Subject: [PATCH 026/101] Cosmetic fix. 
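For orientation, a small usage sketch, separate from the patches, of the high-level entry points as they stand at this point in the series; the file names here are hypothetical.

from dcicutils.sheet_utils import ItemManager, load_items

workbook_items = ItemManager.load("my_workbook.xlsx")        # hypothetical .xlsx; tab names come from the sheet names
person_items = load_items("people.csv", tab_name="Person")   # hypothetical .csv; tab_name labels the single tab
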
--- dcicutils/sheet_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index b1ed8d604..dfcb1a30f 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -457,8 +457,7 @@ def __init__(self, filename: str, schemas=None, **kwargs): self._instaguid_context_table: Dict[str, str] = {} def sheet_patch_prototype(self, tabname: str) -> Dict: - result = self.patch_prototypes_by_tabname[tabname] - return result + return self.patch_prototypes_by_tabname[tabname] def sheet_parsed_headers(self, tabname: str) -> ParsedHeaders: return self.parsed_headers_by_tabname[tabname] From 946b9987273f2918357cc1d5c33e5451c10d9b41 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 23 Aug 2023 22:36:06 -0400 Subject: [PATCH 027/101] Add some missing newlines in data files. --- test/data_files/sample_items_sheet2.csv | 2 +- test/data_files/sample_items_sheet2.tsv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/data_files/sample_items_sheet2.csv b/test/data_files/sample_items_sheet2.csv index b1d3ec2da..95567c42a 100644 --- a/test/data_files/sample_items_sheet2.csv +++ b/test/data_files/sample_items_sheet2.csv @@ -1,3 +1,3 @@ name,age,mother.name,mother.age,father.name,father.age,friends#0.name,friends#0.age,friends#1.name,friends#1.age bill,23,mary,58,fred,63,sam,22,arthur,19 -joe,9,estrella,35,anthony,34,anders,9,, \ No newline at end of file +joe,9,estrella,35,anthony,34,anders,9,, diff --git a/test/data_files/sample_items_sheet2.tsv b/test/data_files/sample_items_sheet2.tsv index d2c9e0e47..e862bf36d 100644 --- a/test/data_files/sample_items_sheet2.tsv +++ b/test/data_files/sample_items_sheet2.tsv @@ -1,3 +1,3 @@ name age mother.name mother.age father.name father.age friends#0.name friends#0.age friends#1.name friends#1.age bill 23 mary 58 fred 63 sam 22 arthur 19 -joe 9 estrella 35 anthony 34 anders 9 \ No newline at end of file +joe 9 estrella 35 anthony 34 anders 9 From 36e7de064bc3988f8da946f2388de59505d3f33e Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 23 Aug 2023 23:57:55 -0400 Subject: [PATCH 028/101] Support for coping with .tsv files where trailing whitespace is 'helpfully' removed by an editor that doesn't understand such whitespace might be significant in TSVs. 
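A minimal sketch, separate from the diff below, of the padding idea it introduces: a row that comes back shorter than the header row (because an editor stripped trailing tabs) is padded with empty strings so there is one cell per header.

headers = ['name', 'age', 'friends#1.name', 'friends#1.age']
row_data = ['joe', '9']                                   # trailing blank fields lost to whitespace stripping
n_headers, n_cols = len(headers), len(row_data)
if n_cols < n_headers:
    row_data = row_data + [''] * (n_headers - n_cols)     # same padding the PAD_TRAILING_TABS code performs
assert row_data == ['joe', '9', '', '']
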
--- dcicutils/sheet_utils.py | 22 +++++++++++++------ .../sample_items_sheet_2.tsv.README.text | 4 ++++ 2 files changed, 19 insertions(+), 7 deletions(-) create mode 100644 test/data_files/sample_items_sheet_2.tsv.README.text diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index f98b5f755..fbd51194f 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -10,7 +10,7 @@ from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook from tempfile import TemporaryFile -from typing import Any, Dict, Iterable, List, Union +from typing import Any, Dict, Iterable, List, Optional, Union Header = str @@ -228,11 +228,11 @@ class BasicTableSetManager(AbstractTableSetManager): def __init__(self, filename: str, **kwargs): super().__init__(**kwargs) self.filename: str = filename - self.headers_by_tabname: Dict[str, List[str]] = {} + self.headers_by_tabname: Dict[str, Headers] = {} self.content_by_tabname: Dict[str, List[AnyJsonData]] = {} self.reader_agent: Any = self._get_reader_agent() - def tab_headers(self, tabname: str) -> List[str]: + def tab_headers(self, tabname: str) -> Headers: return self.headers_by_tabname[tabname] def tab_content(self, tabname: str) -> List[AnyJsonData]: @@ -334,8 +334,8 @@ def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: def _create_tab_processor_state(self, tabname: str) -> Headers: sheet = self.reader_agent[tabname] - headers: List[str] = [str(sheet.cell(row=1, column=col).value) - for col in self._all_cols(sheet)] + headers: Headers = [str(sheet.cell(row=1, column=col).value) + for col in self._all_cols(sheet)] self.headers_by_tabname[sheet.title] = headers return headers @@ -418,11 +418,19 @@ def _get_reader_agent(self) -> CsvReader: def _get_csv_reader(cls, filename) -> CsvReader: return csv.reader(open_text_input_file_respecting_byte_order_mark(filename)) + PAD_TRAILING_TABS = True + def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: - return self.reader_agent + headers = self.tab_headers(tabname) + n_headers = len(headers) + for row_data in self.reader_agent: + n_cols = len(row_data) + if self.PAD_TRAILING_TABS and n_cols < n_headers: + row_data = row_data + [''] * (n_headers - n_cols) + yield row_data def _create_tab_processor_state(self, tabname: str) -> Headers: - headers: Headers = self.headers_by_tabname.get(tabname) + headers: Optional[Headers] = self.headers_by_tabname.get(tabname) if headers is None: self.headers_by_tabname[tabname] = headers = self.reader_agent.__next__() return headers diff --git a/test/data_files/sample_items_sheet_2.tsv.README.text b/test/data_files/sample_items_sheet_2.tsv.README.text new file mode 100644 index 000000000..efefaf654 --- /dev/null +++ b/test/data_files/sample_items_sheet_2.tsv.README.text @@ -0,0 +1,4 @@ +Note that one of the lines in file sample_items_sheet_2.tsv has two blank fields at end of line. +PyCharm and perhaps other editors "helpfully" removes trailing whitespace from lines, +so the number of columns varies line-to-line. Instead of insisting on explicit tabs at end of line, +we pad such short lines with nulls when reading from the file. From 6f097a6284e4f58612d97c112c564e18cce37598 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 24 Aug 2023 00:22:22 -0400 Subject: [PATCH 029/101] Document our choice of why is_uuid is defined here as it is. 
--- dcicutils/misc_utils.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 0d4b63840..a13a79117 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -1349,12 +1349,21 @@ def capitalize1(s): return s[:1].upper() + s[1:] -uuid_re = re.compile(r'(?i)[{]?(?:[0-9a-f]{4}-?){8}[}]?') +""" +Python's UUID ignores all dashes, whereas Postgres is more strict +http://www.postgresql.org/docs/9.2/static/datatype-uuid.html +See also http://www.postgresql.org/docs/9.2/static/datatype-uuid.html +And, anyway, this pattern is what our portals have been doing +for quite a while, so it's the most stable choice for us now. +""" +uuid_re = re.compile(r'(?i)[{]?(?:[0-9a-f]{4}-?){8}[}]?') def is_uuid(instance): - # Python's UUID ignores all dashes, whereas Postgres is more strict - # http://www.postgresql.org/docs/9.2/static/datatype-uuid.html + """ + Predicate returns true for any group of 32 hex characters with optional hyphens every four characters. + We insist on lowercase to make matching faster. See other notes on this design choice above. + """ return bool(uuid_re.match(instance)) From 477c7a2bfccc8f8e71b07d507e8f52214c40c712 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 24 Aug 2023 00:24:46 -0400 Subject: [PATCH 030/101] PEP8 --- dcicutils/misc_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index a13a79117..aeca4a326 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -1359,6 +1359,7 @@ def capitalize1(s): uuid_re = re.compile(r'(?i)[{]?(?:[0-9a-f]{4}-?){8}[}]?') + def is_uuid(instance): """ Predicate returns true for any group of 32 hex characters with optional hyphens every four characters. From 09b4c436d7d0643e60c3ad88a0c0651cb60cc4bb Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 24 Aug 2023 04:16:54 -0400 Subject: [PATCH 031/101] Fix error handling to be clearer. --- dcicutils/sheet_utils.py | 76 ++++++++++++++++++------ test/test_sheet_utils.py | 122 ++++++++++++++++++++++++++++----------- 2 files changed, 144 insertions(+), 54 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index d5b6262e4..ba2282aa5 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -6,7 +6,7 @@ import uuid from dcicutils.common import AnyJsonData -from dcicutils.lang_utils import conjoined_list, maybe_pluralize +from dcicutils.lang_utils import conjoined_list, disjoined_list, maybe_pluralize from dcicutils.misc_utils import ignored from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook @@ -23,6 +23,30 @@ CsvReader = type(csv.reader(TemporaryFile())) +class LoadFailure(Exception): + """ + In general, we'd prefer to load up the spreadsheet with clumsy data that can then be validated in detail, + but some errors are so confusing or so problematic that we need to just fail the load right away. + """ + pass + + +class LoadArgumentsError(LoadFailure): + """ + Errors of this class represent situations where we can't get started because + there's a problem with the given arguments. + """ + pass + + +class LoadTableError(LoadFailure): + """ + Errors of this class represent situations where we can't get started because + there's a problem with some table's syntax, for example headers that don't make sense. 
+ """ + pass + + def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): if kwargs: unwanted = [f"{argname}={value!r}" if detailed else argname @@ -30,8 +54,8 @@ def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): if value is not None] if unwanted: does_not = "don't" if context_plural else "doesn't" - raise ValueError(f"{context} {does_not} use" - f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.") + raise LoadArgumentsError(f"{context} {does_not} use" + f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.") def prefer_number(value: SheetCellValue): @@ -165,7 +189,7 @@ def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): for parsed_header in parsed_headers: parsed_header0 = parsed_header[0] if isinstance(parsed_header0, int): - raise ValueError(f"A header cannot begin with a numeric ref: {parsed_header0}") + raise LoadTableError(f"A header cannot begin with a numeric ref: {parsed_header0}") cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header) return prototype @@ -184,7 +208,7 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: Parsed if key0 == n: parent.append(placeholder) elif key0 > n: - raise Exception("Numeric items must occur sequentially.") + raise LoadTableError("Numeric items must occur sequentially.") elif isinstance(key0, str): if key0 not in parent: parent[key0] = placeholder @@ -311,7 +335,7 @@ def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]: Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data. For more information, see documentation of AbstractTableSetManager. """ - raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.") + raise NotImplementedError(f".load(...) 
is not implemented for {cls.__name__}.") # noQA class BasicTableSetManager(AbstractTableSetManager): @@ -335,7 +359,8 @@ def tab_headers(self, tabname: str) -> Headers: def tab_content(self, tabname: str) -> List[AnyJsonData]: return self.content_by_tabname[tabname] - def _create_tab_processor_state(self, tabname: str) -> Any: + @classmethod + def _create_tab_processor_state(cls, tabname: str) -> Any: """ This method provides for the possibility that some parsers will want auxiliary state, (such as parsed headers or a line count or a table of temporary names for objects to cross-link @@ -347,16 +372,23 @@ def _create_tab_processor_state(self, tabname: str) -> Any: def _get_reader_agent(self) -> Any: """This function is responsible for opening the workbook and returning a workbook object.""" - raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") + raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") # noQA def load_content(self) -> Any: - raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") + raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") # noQA class TableSetManager(BasicTableSetManager): + ALLOWED_FILE_EXTENSIONS = None + @classmethod def load(cls, filename: str) -> AnyJsonData: + if cls.ALLOWED_FILE_EXTENSIONS: + if not any(filename.lower().endswith(suffix) for suffix in cls.ALLOWED_FILE_EXTENSIONS): + raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only" + f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}") + table_set_manager: TableSetManager = cls(filename) return table_set_manager.load_content() @@ -365,13 +397,13 @@ def __init__(self, filename: str, **kwargs): @property def tabnames(self) -> List[str]: - raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}..") + raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}..") # noQA def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: """ Given a tabname and a state (returned by _sheet_loader_state), return a generator for a set of row values. """ - raise NotImplementedError(f"._rows_for_tabname(...) is not implemented for {self.__class__.__name__}.") + raise NotImplementedError(f"._rows_for_tabname(...) is not implemented for {self.__class__.__name__}.") # noQA def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData: """ @@ -379,7 +411,7 @@ def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> A must return a list of objects representing column values. What constitutes a processed up to the class, but other than that the result must be a JSON dictionary. """ - raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.") + raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.") # noQA def load_content(self) -> AnyJsonData: for tabname in self.tabnames: @@ -401,6 +433,8 @@ class XlsxManager(TableSetManager): This implements the mechanism to get a series of rows out of the sheets in an XLSX file. 
""" + ALLOWED_FILE_EXTENSIONS = ['.xlsx'] + @classmethod def _all_rows(cls, sheet: Worksheet): row_max = sheet.max_row @@ -510,7 +544,7 @@ def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: return ItemTools.parse_item_value(value, context=self._instaguid_context_table) -class ItemXlsxManager(ItemManagerMixin, XlsxManager): +class XlsxItemManager(ItemManagerMixin, XlsxManager): """ This layers item-style row processing functionality on an XLSX file. """ @@ -523,6 +557,8 @@ class CsvManager(TableSetManager): returning a result that still looks like there could have been multiple tabs. """ + ALLOWED_FILE_EXTENSIONS = ['.csv'] + DEFAULT_TAB_NAME = 'Sheet1' def __init__(self, filename: str, tab_name: Optional[str] = None, escaping: bool = False, **kwargs): @@ -564,7 +600,7 @@ def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> An for i, row_datum in enumerate(row_data)} -class ItemCsvManager(ItemManagerMixin, CsvManager): +class CsvItemManager(ItemManagerMixin, CsvManager): """ This layers item-style row processing functionality on a CSV file. """ @@ -576,12 +612,14 @@ class TsvManager(CsvManager): TSV files are just CSV files with tabs instead of commas as separators. (We do not presently handle any escaping of strange characters. May need to add handling for backslash escaping.) """ + ALLOWED_FILE_EXTENSIONS = ['.tsv', '.tsv.txt'] + @classmethod def _get_csv_reader(cls, filename) -> CsvReader: return csv.reader(open_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') -class ItemTsvManager(ItemManagerMixin, TsvManager): +class TsvItemManager(ItemManagerMixin, TsvManager): """ This layers item-style row processing functionality on a TSV file. """ @@ -598,17 +636,17 @@ class ItemManager(AbstractTableSetManager): def create_implementation_manager(cls, filename: str, escaping=None, **kwargs) -> BasicTableSetManager: if filename.endswith(".xlsx"): # unwanted_kwargs(context="ItemManager for .xlsx files", kwargs=kwargs) - reader_agent = ItemXlsxManager(filename, escaping=escaping, **kwargs) + reader_agent = XlsxItemManager(filename, escaping=escaping, **kwargs) elif filename.endswith(".csv"): tab_name = kwargs.pop('tab_name', None) # unwanted_kwargs(context="ItemManager for .csv files", kwargs=kwargs) - reader_agent = ItemCsvManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) + reader_agent = CsvItemManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) elif filename.endswith(".tsv"): tab_name = kwargs.pop('tab_name', None) # unwanted_kwargs(context="ItemManager for .tsv files", kwargs=kwargs) - reader_agent = ItemTsvManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) + reader_agent = TsvItemManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) else: - raise ValueError(f"Unknown file type: {filename}") + raise LoadArgumentsError(f"Unknown file type: {filename}") return reader_agent @classmethod diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index cfd20207b..4834c9fdd 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -9,10 +9,13 @@ # High-level interfaces ItemManager, load_items, # Low-level implementation - ItemTools, XlsxManager, ItemXlsxManager, - CsvManager, ItemCsvManager, TsvManager, ItemTsvManager, + BasicTableSetManager, + ItemTools, XlsxManager, XlsxItemManager, + CsvManager, CsvItemManager, TsvManager, TsvItemManager, # TypeHint, EnumHint, BoolHint, + # Error handling + LoadFailure, LoadArgumentsError, LoadTableError, # Utilities prefer_number, 
unwanted_kwargs, ) @@ -20,6 +23,34 @@ from .conftest_settings import TEST_DIR +def test_load_failure(): + + sample_message = "This is a test." + + load_failure_object = LoadFailure(sample_message) + assert isinstance(load_failure_object, LoadFailure) + assert str(load_failure_object) == sample_message + + +def test_load_argument_error(): + + sample_message = "This is a test." + + load_failure_object = LoadArgumentsError(sample_message) + assert isinstance(load_failure_object, LoadArgumentsError) + assert str(load_failure_object) == sample_message + + +def test_load_table_error(): + + sample_message = "This is a test." + + load_failure_object = LoadTableError(sample_message) + assert isinstance(load_failure_object, LoadTableError) + assert str(load_failure_object) == sample_message + + + def test_prefer_number(): assert prefer_number('') is None @@ -61,11 +92,16 @@ def test_unwanted_kwargs_without_error(): ]) def test_unwanted_kwargs_with_error(context, context_plural, detailed, kwargs, message): - with pytest.raises(ValueError) as exc: + with pytest.raises(LoadArgumentsError) as exc: unwanted_kwargs(context=context, kwargs=kwargs, context_plural=context_plural, detailed=detailed) assert str(exc.value) == message +def test_back_table_set_create_state(): + + assert BasicTableSetManager._create_tab_processor_state('some-tab') is None + + def test_item_tools_parse_sheet_header(): assert ItemTools.parse_sheet_header('.a') == ['a'] assert ItemTools.parse_sheet_header('a') == ['a'] @@ -108,7 +144,7 @@ def test_item_tools_compute_patch_prototype(parsed_headers, expected_prototype): def test_item_tools_compute_patch_prototype_errors(headers): parsed_headers = ItemTools.parse_sheet_headers(headers) - with pytest.raises(ValueError) as exc: + with pytest.raises(LoadTableError) as exc: ItemTools.compute_patch_prototype(parsed_headers) assert str(exc.value) == "A header cannot begin with a numeric ref: 0" @@ -205,6 +241,8 @@ def test_item_tools_set_path_value(): def test_item_tools_find_type_hint(): + assert ItemTools.find_type_hint(None, 'anything') is None + assert ItemTools.find_type_hint(['foo', 'bar'], None) is None assert ItemTools.find_type_hint(['foo', 'bar'], "something") is None assert ItemTools.find_type_hint(['foo', 'bar'], {}) is None @@ -304,13 +342,13 @@ def test_item_tools_find_type_hint(): SAMPLE_CSV_FILE_RAW_CONTENT = {CsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} -SAMPLE_CSV_FILE_ITEM_CONTENT = {ItemCsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} +SAMPLE_CSV_FILE_ITEM_CONTENT = {CsvItemManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} SAMPLE_TSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.tsv') SAMPLE_TSV_FILE_RAW_CONTENT = {TsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} -SAMPLE_TSV_FILE_ITEM_CONTENT = {ItemTsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} +SAMPLE_TSV_FILE_ITEM_CONTENT = {TsvItemManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} def test_xlsx_manager_load_content(): @@ -326,26 +364,29 @@ def test_xlsx_manager_load(): def test_xlsx_manager_load_csv(): - with pytest.raises(Exception): + with pytest.raises(LoadArgumentsError) as exc: XlsxManager.load(SAMPLE_CSV_FILE) + assert str(exc.value).startswith('The TableSetManager subclass XlsxManager' + ' expects only .xlsx filenames:') -def test_item_xlsx_manager_load_content(): +def test_xlsx_item_manager_load_content(): - it = ItemXlsxManager(SAMPLE_XLSX_FILE) + it = 
XlsxItemManager(SAMPLE_XLSX_FILE) assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT -def test_item_xlsx_manager_load(): - - assert ItemXlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT +def test_xlsx_item_manager_load(): + assert XlsxItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT -def test_item_xlsx_manager_load_csv(): - with pytest.raises(Exception): - ItemXlsxManager.load(SAMPLE_CSV_FILE) +def test_xlsx_item_manager_load_csv(): + with pytest.raises(LoadArgumentsError) as exc: + XlsxItemManager.load(SAMPLE_CSV_FILE) + assert str(exc.value).startswith('The TableSetManager subclass XlsxItemManager' + ' expects only .xlsx filenames:') def test_csv_manager_load_content(): @@ -360,26 +401,29 @@ def test_csv_manager_load(): def test_csv_manager_load_csv(): - with pytest.raises(Exception): + with pytest.raises(LoadArgumentsError) as exc: CsvManager.load(SAMPLE_XLSX_FILE) + assert str(exc.value).startswith('The TableSetManager subclass CsvManager' + ' expects only .csv filenames:') -def test_item_csv_manager_load_content(): +def test_csv_item_manager_load_content(): - it = ItemCsvManager(SAMPLE_CSV_FILE) + it = CsvItemManager(SAMPLE_CSV_FILE) assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT -def test_item_csv_manager_load(): +def test_csv_item_manager_load(): - assert ItemCsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + assert CsvItemManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT -def test_item_csv_manager_load_csv(): - - with pytest.raises(Exception): - ItemCsvManager.load(SAMPLE_XLSX_FILE) +def test_csv_item_manager_load_csv(): + with pytest.raises(LoadArgumentsError) as exc: + CsvItemManager.load(SAMPLE_XLSX_FILE) + assert str(exc.value).startswith('The TableSetManager subclass CsvItemManager' + ' expects only .csv filenames:') def test_tsv_manager_load_content(): @@ -394,25 +438,29 @@ def test_tsv_manager_load(): def test_tsv_manager_load_csv(): - with pytest.raises(Exception): + with pytest.raises(LoadArgumentsError) as exc: TsvManager.load(SAMPLE_XLSX_FILE) + assert str(exc.value).startswith('The TableSetManager subclass TsvManager' + ' expects only .tsv or .tsv.txt filenames:') -def test_item_tsv_manager_load_content(): +def test_tsv_item_manager_load_content(): - it = ItemTsvManager(SAMPLE_TSV_FILE) + it = TsvItemManager(SAMPLE_TSV_FILE) assert it.load_content() == SAMPLE_TSV_FILE_ITEM_CONTENT -def test_item_tsv_manager_load(): +def test_tsv_item_manager_load(): - assert ItemTsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT + assert TsvItemManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT -def test_item_tsv_manager_load_csv(): +def test_tsv_item_manager_load_csv(): - with pytest.raises(Exception): - ItemTsvManager.load(SAMPLE_XLSX_FILE) + with pytest.raises(LoadArgumentsError) as exc: + TsvItemManager.load(SAMPLE_XLSX_FILE) + assert str(exc.value).startswith('The TableSetManager subclass TsvItemManager' + ' expects only .tsv or .tsv.txt filenames:') def test_item_manager_load(): @@ -421,8 +469,11 @@ def test_item_manager_load(): assert ItemManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT - with pytest.raises(ValueError): + assert ItemManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT + + with pytest.raises(LoadArgumentsError) as exc: ItemManager.load("something.else") + assert str(exc.value) == "Unknown file type: something.else" def test_load_items(): @@ -431,8 +482,9 @@ def test_load_items(): assert load_items(SAMPLE_CSV_FILE) == 
SAMPLE_CSV_FILE_ITEM_CONTENT - with pytest.raises(ValueError): + with pytest.raises(LoadArgumentsError) as exc: load_items("something.else") + assert str(exc.value) == "Unknown file type: something.else" SAMPLE_CSV_FILE2_SCHEMAS = { @@ -456,7 +508,7 @@ def test_load_items(): } SAMPLE_CSV_FILE2_ITEM_CONTENT = { - ItemCsvManager.DEFAULT_TAB_NAME: [ + CsvItemManager.DEFAULT_TAB_NAME: [ {"name": "john", "sex": "M", "member": False}, {"name": "juan", "sex": "male", "member": True}, {"name": "igor", "sex": "unknown", "member": None}, From f3bd81536a039a1bfaf585be6696b8db6271e97d Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 24 Aug 2023 05:08:07 -0400 Subject: [PATCH 032/101] Fix CHANGELOG to reflect recent renamings. --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index dd292f4d5..b88893ac8 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -24,7 +24,7 @@ Change Log * Classes ``XlsxManager``, ``CsvManager`` and ``TsvManager`` for loading raw data from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. - * Classes ``ItemXlsxManager``, ``ItemCsvManager``, and ``ItemTsvManager`` for loading Item-style data + * Classes ``XlsxItemManager``, ``CsvItemManager``, and ``TsvItemManager`` for loading Item-style data from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. * Contains a fix for a bug in ``ff_utils.get_schema_names`` (`C4-1086 `_). From 7627f6fa0ed5338d5e1f87c5cf7079d0211b2680 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 24 Aug 2023 05:10:18 -0400 Subject: [PATCH 033/101] Fix a type hint and some PEP8. --- dcicutils/sheet_utils.py | 2 +- test/test_sheet_utils.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index ba2282aa5..2de95e83c 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -265,7 +265,7 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any cls.set_path_value(datum[key], more_path, value) @classmethod - def find_type_hint(cls, parsed_header: ParsedHeader, schema: Any): + def find_type_hint(cls, parsed_header: Optional[ParsedHeader], schema: Any): def finder(subheader, subschema): if not parsed_header: diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 4834c9fdd..2bc5f594b 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -50,7 +50,6 @@ def test_load_table_error(): assert str(load_failure_object) == sample_message - def test_prefer_number(): assert prefer_number('') is None @@ -388,6 +387,7 @@ def test_xlsx_item_manager_load_csv(): assert str(exc.value).startswith('The TableSetManager subclass XlsxItemManager' ' expects only .xlsx filenames:') + def test_csv_manager_load_content(): wt = CsvManager(SAMPLE_CSV_FILE) @@ -425,6 +425,7 @@ def test_csv_item_manager_load_csv(): assert str(exc.value).startswith('The TableSetManager subclass CsvItemManager' ' expects only .csv filenames:') + def test_tsv_manager_load_content(): wt = TsvManager(SAMPLE_TSV_FILE) From 98cd37c7e5ab8aa0ce094b0ba11736d1afc4b0e8 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 24 Aug 2023 05:35:03 -0400 Subject: [PATCH 034/101] Implement a cut at escaping for tsv files. 
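A rough sketch of the intended behavior (illustrative only; the calls below
mirror the new TsvManager helper and its test further down, and the
escaping= flag is opt-in):

    from dcicutils.sheet_utils import TsvManager

    # Literal backslash sequences in .tsv cells are expanded only on request.
    assert TsvManager.expand_escape_sequences("foo\\tbar") == "foo\tbar"   # \t becomes a real tab
    assert TsvManager.expand_escape_sequences("foo\\fbar") == "foo\\fbar"  # unknown escapes pass through unchanged
    # Constructing TsvManager(..., escaping=True) applies the same expansion to each
    # cell via parse_cell_value; with escaping omitted, cell values are left alone.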
--- dcicutils/sheet_utils.py | 46 +++++++++++++++++++++++++++++++++------- test/test_sheet_utils.py | 8 +++++++ 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 2de95e83c..fcc69211c 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -561,10 +561,9 @@ class CsvManager(TableSetManager): DEFAULT_TAB_NAME = 'Sheet1' - def __init__(self, filename: str, tab_name: Optional[str] = None, escaping: bool = False, **kwargs): + def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs): super().__init__(filename=filename, **kwargs) self.tab_name = tab_name or self.DEFAULT_TAB_NAME - self.escaping = escaping @property def tabnames(self) -> List[str]: @@ -614,10 +613,43 @@ class TsvManager(CsvManager): """ ALLOWED_FILE_EXTENSIONS = ['.tsv', '.tsv.txt'] + def __init__(self, filename: str, escaping: Optional[bool] = None, **kwargs): + super().__init__(filename=filename, **kwargs) + self.escaping: bool = escaping or False + @classmethod def _get_csv_reader(cls, filename) -> CsvReader: return csv.reader(open_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') + def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: + if self.escaping and isinstance(value, str) and '\\' in value: + value = self.expand_escape_sequences(value) + return super().parse_cell_value(value) + + @classmethod + def expand_escape_sequences(cls, text: str) -> str: + s = io.StringIO() + escaping = False + for ch in text: + if escaping: + if ch == 'r': + s.write('\r') + elif ch == 't': + s.write('\t') + elif ch == 'n': + s.write('\n') + elif ch == '\\': + s.write('\\') + else: + # Rather than err, just leave other sequences as-is. + s.write(f"\\{ch}") + escaping = False + elif ch == '\\': + escaping = True + else: + s.write(ch) + return s.getvalue() + class TsvItemManager(ItemManagerMixin, TsvManager): """ @@ -633,17 +665,15 @@ class ItemManager(AbstractTableSetManager): """ @classmethod - def create_implementation_manager(cls, filename: str, escaping=None, **kwargs) -> BasicTableSetManager: + def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager: if filename.endswith(".xlsx"): - # unwanted_kwargs(context="ItemManager for .xlsx files", kwargs=kwargs) - reader_agent = XlsxItemManager(filename, escaping=escaping, **kwargs) + reader_agent = XlsxItemManager(filename, **kwargs) elif filename.endswith(".csv"): tab_name = kwargs.pop('tab_name', None) - # unwanted_kwargs(context="ItemManager for .csv files", kwargs=kwargs) - reader_agent = CsvItemManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) + reader_agent = CsvItemManager(filename, tab_name=tab_name, **kwargs) elif filename.endswith(".tsv"): + escaping = kwargs.pop('escaping', None) tab_name = kwargs.pop('tab_name', None) - # unwanted_kwargs(context="ItemManager for .tsv files", kwargs=kwargs) reader_agent = TsvItemManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) else: raise LoadArgumentsError(f"Unknown file type: {filename}") diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 2bc5f594b..d902084bd 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -432,6 +432,14 @@ def test_tsv_manager_load_content(): assert wt.load_content() == SAMPLE_TSV_FILE_RAW_CONTENT +def test_tsv_manager_expand_escape_sequences(): + + assert TsvManager.expand_escape_sequences("foo") == "foo" + assert TsvManager.expand_escape_sequences("foo\\tbar") == "foo\tbar" + assert 
TsvManager.expand_escape_sequences("\\r\\t\\n\\\\") == "\r\t\n\\" + assert TsvManager.expand_escape_sequences("foo\\fbar") == "foo\\fbar" + + def test_tsv_manager_load(): assert TsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_RAW_CONTENT From 3852e56e304755635b306b9ef4376537c5a2238f Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 24 Aug 2023 20:44:03 -0400 Subject: [PATCH 035/101] Add a test case for all of the pieces of parsing and schema hinting put together. --- dcicutils/sheet_utils.py | 69 +++++++-- .../sample_items_for_real_schemas.csv | 3 + test/test_sheet_utils.py | 140 +++++++++++++++++- 3 files changed, 200 insertions(+), 12 deletions(-) create mode 100644 test/data_files/sample_items_for_real_schemas.csv diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index fcc69211c..9a83bf6f9 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -6,8 +6,11 @@ import uuid from dcicutils.common import AnyJsonData +from dcicutils.env_utils import public_env_name, EnvUtils +from dcicutils.ff_utils import get_schema from dcicutils.lang_utils import conjoined_list, disjoined_list, maybe_pluralize -from dcicutils.misc_utils import ignored +from dcicutils.misc_utils import ignored, PRINT +from dcicutils.task_utils import pmap from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook from tempfile import TemporaryFile @@ -315,6 +318,7 @@ class AbstractTableSetManager: "Sheet2": [...], ..., } + It also needs some implementation of the .tabnames property. Note that at this level of abstraction, we take no position on what form of representation is used for the rows, as long as it is JSON data of some kind. It might be {"col1": "val1", "col2": "val2", ...} @@ -337,6 +341,10 @@ def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]: """ raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.") # noQA + @property + def tabnames(self) -> List[str]: + raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}..") # noQA + class BasicTableSetManager(AbstractTableSetManager): """ @@ -395,10 +403,6 @@ def load(cls, filename: str) -> AnyJsonData: def __init__(self, filename: str, **kwargs): super().__init__(filename=filename, **kwargs) - @property - def tabnames(self) -> List[str]: - raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}..") # noQA - def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: """ Given a tabname and a state (returned by _sheet_loader_state), return a generator for a set of row values. @@ -476,18 +480,62 @@ def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> An for i, row_datum in enumerate(row_data)} -class ItemManagerMixin(BasicTableSetManager): +class SchemaAutoloadMixin(AbstractTableSetManager): + + SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. + CACHE_SCHEMAS = True # Controls whether we're doing caching at all + AUTOLOAD_SCHEMAS_DEFAULT = False + + def __init__(self, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, **kwargs): + if portal_env is None: + portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) + PRINT(f"The portal_env was not explicitly supplied. 
Schemas will come from portal_env={portal_env!r}.") + super().__init__(**kwargs) + self.autoload_schemas: bool = self.AUTOLOAD_SCHEMAS_DEFAULT if autoload_schemas is None else autoload_schemas + self.portal_env: Optional[str] = portal_env + + def fetch_relevant_schemas(self, schema_names: List[str]): + # The schema_names argument is not normally given, but it is there for easier testing + def fetch_schema(schema_name): + schema = self.fetch_schema(schema_name, portal_env=self.portal_env) + return schema_name, schema + if self.autoload_schemas and self.portal_env: + autoloaded = {tabname: schema + for tabname, schema in pmap(fetch_schema, schema_names)} + return autoloaded + else: + return {} + + @classmethod + def fetch_schema(cls, schema_name: str, *, portal_env: str): + def just_fetch_it(): + return get_schema(schema_name, ff_env=portal_env) + if cls.CACHE_SCHEMAS: + schema: Optional[AnyJsonData] = cls.SCHEMA_CACHE.get(schema_name) + if schema is None: + cls.SCHEMA_CACHE[schema_name] = schema = just_fetch_it() + return schema + else: + return just_fetch_it() + + @classmethod + def clear_schema_cache(cls): + for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first + cls.SCHEMA_CACHE.pop(key, None) + + +class ItemManagerMixin(SchemaAutoloadMixin, BasicTableSetManager): """ This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows get handled like Items instead of just flat table rows. """ - def __init__(self, filename: str, schemas=None, **kwargs): + def __init__(self, filename: str, schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs): super().__init__(filename=filename, **kwargs) self.patch_prototypes_by_tabname: Dict[str, Dict] = {} self.parsed_headers_by_tabname: Dict[str, ParsedHeaders] = {} self.type_hints_by_tabname: Dict[str, OptionalTypeHints] = {} - self.schemas = schemas or {} + self.schemas = schemas or self.fetch_relevant_schemas(self.tabnames) self._instaguid_context_table: Dict[str, str] = {} def sheet_patch_prototype(self, tabname: str) -> Dict: @@ -681,8 +729,9 @@ def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSet @classmethod def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, - schemas: Optional[Dict] = None) -> AnyJsonData: - manager = cls.create_implementation_manager(filename, tab_name=tab_name, escaping=escaping, schemas=schemas) + schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None) -> AnyJsonData: + manager = cls.create_implementation_manager(filename, tab_name=tab_name, escaping=escaping, schemas=schemas, + autoload_schemas=autoload_schemas) return manager.load_content() diff --git a/test/data_files/sample_items_for_real_schemas.csv b/test/data_files/sample_items_for_real_schemas.csv new file mode 100644 index 000000000..29af47792 --- /dev/null +++ b/test/data_files/sample_items_for_real_schemas.csv @@ -0,0 +1,3 @@ +accession,fragment_size_selection_method +foo,spri +bar,blue diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index d902084bd..8557e1278 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -1,15 +1,18 @@ +import contextlib import json import os import pytest from collections import namedtuple +from dcicutils import sheet_utils as sheet_utils_module from dcicutils.common import AnyJsonData from dcicutils.misc_utils import is_uuid, local_attrs +from dcicutils.qa_utils import printed_output from dcicutils.sheet_utils import ( # 
High-level interfaces ItemManager, load_items, # Low-level implementation - BasicTableSetManager, + BasicTableSetManager, SchemaAutoloadMixin, ItemTools, XlsxManager, XlsxItemManager, CsvManager, CsvItemManager, TsvManager, TsvItemManager, # TypeHint, EnumHint, @@ -19,8 +22,10 @@ # Utilities prefer_number, unwanted_kwargs, ) -from typing import Dict +from typing import Dict, Optional +from unittest import mock from .conftest_settings import TEST_DIR +from .helpers import using_fresh_ff_state_for_testing def test_load_failure(): @@ -660,3 +665,134 @@ def test_load_items_with_schema_and_instaguids(instaguids_enabled): assert matches_template(actual, expected) else: assert actual == expected # no substitution performed + + +class SchemaAutoloaderForTesting(SchemaAutoloadMixin): + pass + + +@contextlib.contextmanager +def schema_autoloader_for_testing(**kwargs) -> SchemaAutoloadMixin: + autoloader: Optional[SchemaAutoloadMixin] = None + success = False + try: + autoloader: SchemaAutoloadMixin = SchemaAutoloaderForTesting(**kwargs) + assert autoloader.SCHEMA_CACHE == {}, "The schema cache is not clean." + yield autoloader + success = True + finally: + if autoloader is not None: + autoloader.clear_schema_cache() + assert autoloader.SCHEMA_CACHE == SchemaAutoloadMixin.SCHEMA_CACHE == {} + if not success: + raise + + +@using_fresh_ff_state_for_testing() +@pytest.mark.integrated +@pytest.mark.parametrize('portal_env', [None, 'data']) +def test_schema_autoload_mixin_caching(portal_env): + + with schema_autoloader_for_testing(portal_env=portal_env) as autoloader: + + assert autoloader.portal_env == 'data' # it should have defaulted even if we didn't supply it + + assert autoloader.SCHEMA_CACHE == SchemaAutoloadMixin.SCHEMA_CACHE == {} + + sample_schema_name = 'foo' + sample_schema = {'mock_schema_for': 'foo'} + + with mock.patch.object(sheet_utils_module, "get_schema") as mock_get_schema: + mock_get_schema.return_value = sample_schema + assert autoloader.fetch_schema(sample_schema_name, portal_env=autoloader.portal_env) == sample_schema + + schema_cache_with_sample_schema = {sample_schema_name: sample_schema} + assert SchemaAutoloadMixin.SCHEMA_CACHE == schema_cache_with_sample_schema + assert autoloader.SCHEMA_CACHE == schema_cache_with_sample_schema + + +@using_fresh_ff_state_for_testing() +@pytest.mark.integrated +@pytest.mark.parametrize('portal_env', [None, 'data']) +def test_schema_autoload_mixin_fetch_schema(portal_env): + + with schema_autoloader_for_testing(portal_env=portal_env) as autoloader: + + assert autoloader.portal_env == 'data' + + user_schema = autoloader.fetch_schema('user', portal_env=autoloader.portal_env) + + assert user_schema['$id'] == '/profiles/user.json' + assert user_schema['title'] == 'User' + assert 'properties' in user_schema + + +@using_fresh_ff_state_for_testing() +@pytest.mark.integrated +@pytest.mark.parametrize('autoload_schemas', [True, False]) +@pytest.mark.parametrize('cache_schemas', [True, False]) +@pytest.mark.parametrize('portal_env', [None, 'data']) +def test_schema_autoload_mixin_fetch_relevant_schemas(autoload_schemas, cache_schemas, portal_env): + + with printed_output() as printed: + with local_attrs(SchemaAutoloadMixin, CACHE_SCHEMAS=cache_schemas): + with schema_autoloader_for_testing(portal_env=portal_env, autoload_schemas=autoload_schemas) as autoloader: + + assert autoloader.portal_env == 'data' + + if autoload_schemas: + + schemas = autoloader.fetch_relevant_schemas(['User', 'Lab']) + assert isinstance(schemas, dict) + assert len(schemas) == 2 + 
assert set(schemas.keys()) == {'User', 'Lab'} + + else: + + assert autoloader.fetch_relevant_schemas(['User', 'Lab']) == {} + + if portal_env == 'data': + assert printed.lines == [] + else: + assert printed.lines == [ + "The portal_env was not explicitly supplied. Schemas will come from portal_env='data'." + ] + + +SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_for_real_schemas.csv') + + +@using_fresh_ff_state_for_testing() +@pytest.mark.integrated +def test_workbook_with_schemas(): + + actual_data = CsvManager(filename=SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, tab_name='ExperimentSeq').load_content() + expected_data = { + "ExperimentSeq": [ + { + "accession": "foo", + "fragment_size_selection_method": "spri" + }, + { + "accession": "bar", + "fragment_size_selection_method": "blue" + } + ] + } + assert actual_data == expected_data + + actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, + tab_name='ExperimentSeq', autoload_schemas=True) + expected_items = { + "ExperimentSeq": [ + { + "accession": "foo", + "fragment_size_selection_method": "SPRI beads" + }, + { + "accession": "bar", + "fragment_size_selection_method": "BluePippin" + } + ] + } + assert actual_items == expected_items From 660df9c0000ab4d7b8a41da1a8289052dc1e6525 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 25 Aug 2023 11:42:56 -0400 Subject: [PATCH 036/101] Small cosmetic changes and some additional support for upcoming work. --- CHANGELOG.rst | 8 ++- dcicutils/misc_utils.py | 41 ++++++++++++++ dcicutils/sheet_utils.py | 119 +++++++++++++++++++++++++++------------ pyproject.toml | 2 +- test/test_misc_utils.py | 97 ++++++++++++++++++++++++++++--- test/test_sheet_utils.py | 26 ++++----- 6 files changed, 233 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b88893ac8..79f60120f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -19,7 +19,7 @@ Change Log * Function ``load_items`` that does the same as ``ItemManager.load``. - * Various low-level implementation classes such as: + * Various lower-level implementation classes such as: * Classes ``XlsxManager``, ``CsvManager`` and ``TsvManager`` for loading raw data from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. @@ -27,7 +27,11 @@ Change Log * Classes ``XlsxItemManager``, ``CsvItemManager``, and ``TsvItemManager`` for loading Item-style data from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. -* Contains a fix for a bug in ``ff_utils.get_schema_names`` (`C4-1086 `_). 
+* New functionality in ``misc_utils``: + + * New function ``is_uuid`` (migrated from Fourfront) + * New function ``pad_to`` + * New class ``JsonLinesReader`` 7.9.0 diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index aeca4a326..de188b872 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -9,6 +9,7 @@ import inspect import math import io +import json import os import logging import pytz @@ -2329,3 +2330,43 @@ def parse_in_radix(text: str, *, radix: int): except Exception: pass raise ValueError(f"Unable to parse: {text!r}") + + +def pad_to(target_size: int, data: list, *, padding=None): + actual_size = len(data) + if actual_size < target_size: + data = data + [padding] * (target_size - actual_size) + return data + + +class JsonLinesReader: + + def __init__(self, fp, padded=False, padding=None): + self.fp = fp + self.padded: bool = padded + self.padding = padding + self.headers = None # Might change after we see first line + + def __iter__(self): + first_line = True + n_headers = 0 + for raw_line in self.fp: + line = json.loads(raw_line) + if first_line: + first_line = False + if isinstance(line, list): + self.headers = line + n_headers = len(line) + continue + # If length of line is mroe than we expect, ignore it. Let user put comments beyond our table + # But if length of line is less than we expect, extend the line with None + if self.headers: + if not isinstance(line, list): + raise Exception("If the first line is a list, all lines must be.") + if self.padded and len(line) < n_headers: + line = pad_to(n_headers, line, padding=self.padding) + yield dict(zip(self.headers, line)) + elif isinstance(line, dict): + yield line + else: + raise Exception(f"If the first line is not a list, all lines must be dictionaries: {line!r}") diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 9a83bf6f9..df3e16e43 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -3,18 +3,19 @@ import csv import io import openpyxl +import os import uuid from dcicutils.common import AnyJsonData from dcicutils.env_utils import public_env_name, EnvUtils from dcicutils.ff_utils import get_schema from dcicutils.lang_utils import conjoined_list, disjoined_list, maybe_pluralize -from dcicutils.misc_utils import ignored, PRINT +from dcicutils.misc_utils import ignored, PRINT, pad_to from dcicutils.task_utils import pmap from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook from tempfile import TemporaryFile -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Type, Union Header = str @@ -334,7 +335,7 @@ def __init__(self, **kwargs): # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.) @classmethod - def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]: + def load(cls, filename: str, **kwargs) -> Dict[str, List[AnyJsonData]]: """ Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data. For more information, see documentation of AbstractTableSetManager. @@ -354,6 +355,8 @@ class BasicTableSetManager(AbstractTableSetManager): of this where there's only one set of headers and only one block of content. 
""" + ALLOWED_FILE_EXTENSIONS: List[str] = [] + def __init__(self, filename: str, **kwargs): super().__init__(**kwargs) self.filename: str = filename @@ -387,17 +390,26 @@ def load_content(self) -> Any: class TableSetManager(BasicTableSetManager): - - ALLOWED_FILE_EXTENSIONS = None + """ + This is the base class for all things that read tablesets. Those may be: + * Excel workbook readers (.xlsx) + * Comma-separated file readers (.csv) + * Tab-separarated file readers (.tsv in most of the world, but Microsoft stupidly calls this .txt, outright + refusing to write a .tsv file, so many people seem to compromise and call this .tsv.txt) + Unimplemented formats that could easily be made to do the same thing: + * JSON files + * JSON lines files + * YAML files + """ @classmethod - def load(cls, filename: str) -> AnyJsonData: + def load(cls, filename: str, **kwargs) -> AnyJsonData: if cls.ALLOWED_FILE_EXTENSIONS: if not any(filename.lower().endswith(suffix) for suffix in cls.ALLOWED_FILE_EXTENSIONS): raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only" f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}") - table_set_manager: TableSetManager = cls(filename) + table_set_manager: TableSetManager = cls(filename, **kwargs) return table_set_manager.load_content() def __init__(self, filename: str, **kwargs): @@ -432,6 +444,33 @@ def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: return prefer_number(value) +class TableSetManagerRegistry: + + ALL_TABLE_SET_MANAGERS: Dict[str, Type[TableSetManager]] = {} + + @classmethod + def register(cls, class_to_register: Type[TableSetManager]): + for ext in class_to_register.ALLOWED_FILE_EXTENSIONS: + existing = cls.ALL_TABLE_SET_MANAGERS.get(ext) + if existing: + raise Exception(f"Tried to define {class_to_register} to extension {ext}," + f" but {existing} already claimed that.") + cls.ALL_TABLE_SET_MANAGERS[ext] = class_to_register + return class_to_register + + @classmethod + def manager_for_filename(cls, filename: str) -> Type[TableSetManager]: + base = os.path.basename(filename) + dotparts = base.split('.') + while dotparts: + suffix = f".{'.'.join(dotparts)}" + found = cls.ALL_TABLE_SET_MANAGERS.get(suffix) + if found: + return found + dotparts = dotparts[1:] + raise LoadArgumentsError(f"Unknown file type: {filename}") + + class XlsxManager(TableSetManager): """ This implements the mechanism to get a series of rows out of the sheets in an XLSX file. @@ -484,7 +523,7 @@ class SchemaAutoloadMixin(AbstractTableSetManager): SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. CACHE_SCHEMAS = True # Controls whether we're doing caching at all - AUTOLOAD_SCHEMAS_DEFAULT = False + AUTOLOAD_SCHEMAS_DEFAULT = True def __init__(self, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, **kwargs): if portal_env is None: @@ -592,6 +631,7 @@ def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: return ItemTools.parse_item_value(value, context=self._instaguid_context_table) +@TableSetManagerRegistry.register class XlsxItemManager(ItemManagerMixin, XlsxManager): """ This layers item-style row processing functionality on an XLSX file. @@ -599,29 +639,35 @@ class XlsxItemManager(ItemManagerMixin, XlsxManager): pass -class CsvManager(TableSetManager): - """ - This implements the mechanism to get a series of rows out of the sheet in a csv file, - returning a result that still looks like there could have been multiple tabs. 
- """ - - ALLOWED_FILE_EXTENSIONS = ['.csv'] +class SingleTableMixin(AbstractTableSetManager): DEFAULT_TAB_NAME = 'Sheet1' - def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs): - super().__init__(filename=filename, **kwargs) + def __init__(self, tab_name: Optional[str] = None, **kwargs): + super().__init__(**kwargs) self.tab_name = tab_name or self.DEFAULT_TAB_NAME @property def tabnames(self) -> List[str]: return [self.tab_name] + +class CsvManager(SingleTableMixin, TableSetManager): + """ + This implements the mechanism to get a series of rows out of the sheet in a csv file, + returning a result that still looks like there could have been multiple tabs. + """ + + ALLOWED_FILE_EXTENSIONS = ['.csv'] + + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) + def _get_reader_agent(self) -> CsvReader: - return self._get_csv_reader(self.filename) + return self._get_reader_agent_for_filename(self.filename) @classmethod - def _get_csv_reader(cls, filename) -> CsvReader: + def _get_reader_agent_for_filename(cls, filename) -> CsvReader: return csv.reader(open_text_input_file_respecting_byte_order_mark(filename)) PAD_TRAILING_TABS = True @@ -630,9 +676,8 @@ def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: headers = self.tab_headers(tabname) n_headers = len(headers) for row_data in self.reader_agent: - n_cols = len(row_data) - if self.PAD_TRAILING_TABS and n_cols < n_headers: - row_data = row_data + [''] * (n_headers - n_cols) + if self.PAD_TRAILING_TABS: + row_data = pad_to(n_headers, row_data, padding='') yield row_data def _create_tab_processor_state(self, tabname: str) -> Headers: @@ -647,6 +692,7 @@ def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> An for i, row_datum in enumerate(row_data)} +@TableSetManagerRegistry.register class CsvItemManager(ItemManagerMixin, CsvManager): """ This layers item-style row processing functionality on a CSV file. @@ -666,7 +712,7 @@ def __init__(self, filename: str, escaping: Optional[bool] = None, **kwargs): self.escaping: bool = escaping or False @classmethod - def _get_csv_reader(cls, filename) -> CsvReader: + def _get_reader_agent_for_filename(cls, filename) -> CsvReader: return csv.reader(open_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: @@ -699,6 +745,7 @@ def expand_escape_sequences(cls, text: str) -> str: return s.getvalue() +@TableSetManagerRegistry.register class TsvItemManager(ItemManagerMixin, TsvManager): """ This layers item-style row processing functionality on a TSV file. 
@@ -714,24 +761,22 @@ class ItemManager(AbstractTableSetManager): @classmethod def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager: - if filename.endswith(".xlsx"): - reader_agent = XlsxItemManager(filename, **kwargs) - elif filename.endswith(".csv"): - tab_name = kwargs.pop('tab_name', None) - reader_agent = CsvItemManager(filename, tab_name=tab_name, **kwargs) - elif filename.endswith(".tsv"): - escaping = kwargs.pop('escaping', None) - tab_name = kwargs.pop('tab_name', None) - reader_agent = TsvItemManager(filename, escaping=escaping, tab_name=tab_name, **kwargs) - else: - raise LoadArgumentsError(f"Unknown file type: {filename}") + reader_agent_class = TableSetManagerRegistry.manager_for_filename(filename) + reader_agent = reader_agent_class(filename, **kwargs) return reader_agent @classmethod - def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, - schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None) -> AnyJsonData: + def load(cls, filename: str, + tab_name: Optional[str] = None, + escaping: Optional[bool] = None, + schemas: Optional[Dict] = None, + autoload_schemas: Optional[bool] = None, + **kwargs) -> Dict[str, List[AnyJsonData]]: + """ + Given a filename and various options + """ manager = cls.create_implementation_manager(filename, tab_name=tab_name, escaping=escaping, schemas=schemas, - autoload_schemas=autoload_schemas) + autoload_schemas=autoload_schemas, **kwargs) return manager.load_content() diff --git a/pyproject.toml b/pyproject.toml index aaa4371f7..b3e907b9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.9.0.1b2" # to become "7.10.0" +version = "7.9.0.1b3" # to become "7.10.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index b940877f5..0017bd16e 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -30,12 +30,13 @@ classproperty, classproperty_cached, classproperty_cached_each_subclass, Singleton, NamedObject, obsolete, ObsoleteError, CycleError, TopologicalSorter, keys_and_values_to_dict, dict_to_keys_and_values, is_c4_arn, deduplicate_list, chunked, parse_in_radix, format_in_radix, managed_property, future_datetime, - MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, + MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, pad_to, JsonLinesReader, ) from dcicutils.qa_utils import ( Occasionally, ControlledTime, override_environ as qa_override_environ, MockFileSystem, printed_output, raises_regexp, MockId, MockLog, input_series, ) +from typing import Any, Dict, List from unittest import mock @@ -1094,7 +1095,7 @@ def test_lockout_manager(): protected_action = "simulated action" - # The function now() will get us the time. This assure us that binding datetime.datetime + # The function now() will get us the time. This assures us that binding datetime.datetime # will not be affecting us. now = datetime_module.datetime.now @@ -1197,7 +1198,7 @@ def test_rate_manager(): # PyCharm thinks this is not used. -kmp 26-Jul-2020 # r = RateManager(interval_seconds=60, safety_seconds=1, allowed_attempts=4) - # The function now() will get us the time. This assure us that binding datetime.datetime + # The function now() will get us the time. 
This assures us that binding datetime.datetime # will not be affecting us. now = datetime_module.datetime.now @@ -1885,7 +1886,7 @@ def test_cached_field_mocked(self): assert field.get() == val5 assert field.get() == val5 - dt.sleep(self.DEFAULT_TIMEOUT) # Fast forward to where we're going to refill again + dt.sleep(self.DEFAULT_TIMEOUT) # Fast-forward to where we're going to refill again val6 = field.get() assert val6 != val5 @@ -2077,7 +2078,7 @@ def test_copy_json(obj): def test_copy_json_side_effects(): - obj = {'foo': [1, 2, 3], 'bar': [{'x': 4, 'y': 5}, {'x': 2, 'y': 7}]} + obj: Dict[str, Any] = {'foo': [1, 2, 3], 'bar': [{'x': 4, 'y': 5}, {'x': 2, 'y': 7}]} obj_copy = copy_json(obj) obj['foo'][1] = 20 obj['bar'][0]['y'] = 500 # NoQA - PyCharm wrongly fears there are type errors in this line, that it will fail. @@ -2931,7 +2932,7 @@ class SubClock(Clock): assert str(exc.value) == ("The subclasses= argument to classproperty_cached.reset must not be False" " because classproperty_cached does not use per-subclass caches.") - # This will clear SubClock cache, bu that's shared with the Clock cache, so both will clear. + # This will clear SubClock cache, but that's shared with the Clock cache, so both will clear. assert classproperty_cached.reset(instance_class=SubClock, attribute_name='sample') is True c_t5 = Clock.sample # This should recompute Clock.sample cache, which is shared by SubCLock @@ -3285,7 +3286,7 @@ def test_deduplicate_list(): xlen = len(x) assert sorted(deduplicate_list(x)) == ['a', 'b', 'c'] - assert len(x) == xlen # make sure there was no side-effect to the original list + assert len(x) == xlen # make sure there was no side effect to the original list y = ['a'] y0 = deduplicate_list(y) @@ -3495,3 +3496,85 @@ def test_map_chunked(): res = map_chunked(lambda x: ''.join(x), "abcdefghij", chunk_size=4, reduce=lambda x: '.'.join(x)) assert res == 'abcd.efgh.ij' + + +def test_pad_to(): + + assert pad_to(5, []) == [None, None, None, None, None] + assert pad_to(5, [], padding='foo') == ['foo', 'foo', 'foo', 'foo', 'foo'] + + assert pad_to(5, ['x']) == ['x', None, None, None, None] + assert pad_to(5, ['x'], padding='foo') == ['x', 'foo', 'foo', 'foo', 'foo'] + + six_elements = ['a', 'b', 'c', 'd', 'e', 'f'] + + assert pad_to(5, six_elements) == six_elements + assert pad_to(5, six_elements, padding='foo') + + +def test_json_lines_reader_dicts(): + + print() # start on a fresh line + + mfs = MockFileSystem() + + with mfs.mock_exists_open_remove(): + + item1 = {"foo": 1, "bar": 2} + item2 = {"foo": 3, "bar": 4} + + item1_str = json.dumps(item1) + item2_str = json.dumps(item2) + + sample_lines = [item1_str, item2_str] + + sample_filename = "somefile.jsonl" + + with io.open(sample_filename, 'w') as fp: + for line in sample_lines: + print(line, file=fp) + + for file, content in mfs.files.items(): + print("=" * 20, file, "=" * 20) + print(content.decode('utf-8')) + print("=" * 80) + + with io.open(sample_filename) as fp: + assert [line for line in JsonLinesReader(fp)] == [item1, item2] + + +def test_json_lines_reader_lists(): + + print() # start on a fresh line + + mfs = MockFileSystem() + + with mfs.mock_exists_open_remove(): + + item1 = {"foo": 1, "bar": 2} + item2 = {"foo": 3, "bar": 4} + + headers: List[str] = list(item1.keys()) + + item1_str = json.dumps([item1[header] for header in headers]) + item2_str = json.dumps([item2[header] for header in headers]) + + sample_lines = [item1_str, item2_str] + + sample_filename = "somefile.jsonl" + + with io.open(sample_filename, 'w') as fp: 
+ + print(json.dumps(headers), file=fp) + for line in sample_lines: + print(line, file=fp) + + for file, content in mfs.files.items(): + print("=" * 20, file, "=" * 20) + print(content.decode('utf-8')) + print("=" * 80) + + with io.open(sample_filename) as fp: + parsed = [line for line in JsonLinesReader(fp)] + expected = [item1, item2] + assert parsed == expected diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 8557e1278..ae3096632 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -376,13 +376,13 @@ def test_xlsx_manager_load_csv(): def test_xlsx_item_manager_load_content(): - it = XlsxItemManager(SAMPLE_XLSX_FILE) + it = XlsxItemManager(SAMPLE_XLSX_FILE, autoload_schemas=False) assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT def test_xlsx_item_manager_load(): - assert XlsxItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + assert XlsxItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT def test_xlsx_item_manager_load_csv(): @@ -414,19 +414,19 @@ def test_csv_manager_load_csv(): def test_csv_item_manager_load_content(): - it = CsvItemManager(SAMPLE_CSV_FILE) + it = CsvItemManager(SAMPLE_CSV_FILE, autoload_schemas=False) assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT def test_csv_item_manager_load(): - assert CsvItemManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + assert CsvItemManager.load(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT def test_csv_item_manager_load_csv(): with pytest.raises(LoadArgumentsError) as exc: - CsvItemManager.load(SAMPLE_XLSX_FILE) + CsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) assert str(exc.value).startswith('The TableSetManager subclass CsvItemManager' ' expects only .csv filenames:') @@ -460,30 +460,30 @@ def test_tsv_manager_load_csv(): def test_tsv_item_manager_load_content(): - it = TsvItemManager(SAMPLE_TSV_FILE) + it = TsvItemManager(SAMPLE_TSV_FILE, autoload_schemas=False) assert it.load_content() == SAMPLE_TSV_FILE_ITEM_CONTENT def test_tsv_item_manager_load(): - assert TsvItemManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT + assert TsvItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT def test_tsv_item_manager_load_csv(): with pytest.raises(LoadArgumentsError) as exc: - TsvItemManager.load(SAMPLE_XLSX_FILE) + TsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) assert str(exc.value).startswith('The TableSetManager subclass TsvItemManager' ' expects only .tsv or .tsv.txt filenames:') def test_item_manager_load(): - assert ItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + assert ItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT - assert ItemManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + assert ItemManager.load(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT - assert ItemManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT + assert ItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT with pytest.raises(LoadArgumentsError) as exc: ItemManager.load("something.else") @@ -492,9 +492,9 @@ def test_item_manager_load(): def test_load_items(): - assert load_items(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + assert load_items(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT - assert load_items(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + 
assert load_items(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT with pytest.raises(LoadArgumentsError) as exc: load_items("something.else") From 34d528bbe70804f47231b9ab078874837bfcae1c Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 25 Aug 2023 11:53:57 -0400 Subject: [PATCH 037/101] Fix a unit test to conform to new google account name. --- test/test_s3_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_s3_utils.py b/test/test_s3_utils.py index 3cfd05c7e..0e98959cd 100644 --- a/test/test_s3_utils.py +++ b/test/test_s3_utils.py @@ -409,7 +409,7 @@ def test_s3utils_get_google_key(): keys = s3u.get_google_key() assert isinstance(keys, dict) assert keys['type'] == 'service_account' - assert keys["project_id"] == "fourdn-fourfront" + assert keys["project_id"] == "fourfront-396315" # yes, this is a magic constant for dict_key in ['private_key_id', 'private_key', 'client_email', 'client_id', 'auth_uri', 'client_x509_cert_url']: assert keys[dict_key] From 1c34ad02360497cebd68b8fe6032734f0ce22ab7 Mon Sep 17 00:00:00 2001 From: Kent M Pitman Date: Fri, 25 Aug 2023 16:28:46 -0400 Subject: [PATCH 038/101] Fix typo in comment (dcicutils/misc_utils.py) Co-authored-by: drio18 <58236592+drio18@users.noreply.github.com> --- dcicutils/misc_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index de188b872..cffabbfc6 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -2358,7 +2358,7 @@ def __iter__(self): self.headers = line n_headers = len(line) continue - # If length of line is mroe than we expect, ignore it. Let user put comments beyond our table + # If length of line is more than we expect, ignore it. Let user put comments beyond our table # But if length of line is less than we expect, extend the line with None if self.headers: if not isinstance(line, list): From 41fad79e06debc7be588ed73e202d72dbdffcefe Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 28 Aug 2023 05:32:21 -0400 Subject: [PATCH 039/101] Add some doc strings and comments. --- dcicutils/misc_utils.py | 30 ++++++++++++++++++++++++++++++ dcicutils/sheet_utils.py | 5 +++++ 2 files changed, 35 insertions(+) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index cffabbfc6..aaa503a8f 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -2333,6 +2333,10 @@ def parse_in_radix(text: str, *, radix: int): def pad_to(target_size: int, data: list, *, padding=None): + """ + This will pad to a given target size, a list of a potentially different actual size, using given padding. + e.g., pad_to(3, [1, 2]) will return [1, 2, None] + """ actual_size = len(data) if actual_size < target_size: data = data + [padding] * (target_size - actual_size) @@ -2342,6 +2346,32 @@ def pad_to(target_size: int, data: list, *, padding=None): class JsonLinesReader: def __init__(self, fp, padded=False, padding=None): + """ + Given an fp (the conventional name for a "file pointer", the thing a call to io.open returns, + this creates an object that can be used to iterate across the lines in the JSON lines file + that the fp is reading from. + + There are two possible formats that this will return. + + For files that contain a series of dictionaries, such as: + {"something": 1, "else": "a"} + {"something": 2, "else": "b"} + ...etc + this will just return thos those dictionaries one-by-one when iterated over. 
+ + The same set of dictionaries will also be yielded by a file containing: + ["something", "else"] + [1, "a"] + [2, "b"] + ...etc + this will just return thos those dictionaries one-by-one when iterated over. + + NOTES: + + * In the second case, shorter lists on subsequent lines return only partial dictionaries. + * In the second case, longer lists on subsequent lines will quietly drop any extra elements. + """ + self.fp = fp self.padded: bool = padded self.padding = padding diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index df3e16e43..50055acfe 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -224,6 +224,9 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: Parsed @classmethod def parse_item_value(cls, value: SheetCellValue, context=None) -> AnyJsonData: + # TODO: Remodularize this for easier testing and more Schema-driven effect + # Doug asks that this be broken up into different mechanisms, more modular and separately testable. + # I pretty much agree with that. I'm just waiting for suggestions on what kinds of features are desired. if isinstance(value, str): lvalue = value.lower() # TODO: We could consult a schema to make this less heuristic, but this may do for now @@ -244,6 +247,8 @@ def parse_item_value(cls, value: SheetCellValue, context=None) -> AnyJsonData: # Note that this clause MUST follow '|' clause above so '#foo|#bar' isn't seen as instaguid return cls.get_instaguid(value, context=context) else: + # Doug points out that the schema might not agree, might want a string representation of a number. + # At this semantic layer, this might be a bad choice. return prefer_number(value) else: # presumably a number (int or float) return value From 6e8ce2cafaf4cb536d62f7ba3686791788154f90 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 30 Aug 2023 13:38:32 -0400 Subject: [PATCH 040/101] Rename tabname to tab_name throughout the sheet_utils interfaces. --- dcicutils/sheet_utils.py | 130 +++++++++++++++++++-------------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 50055acfe..e3c7e7f0d 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -324,7 +324,7 @@ class AbstractTableSetManager: "Sheet2": [...], ..., } - It also needs some implementation of the .tabnames property. + It also needs some implementation of the .tab_names property. Note that at this level of abstraction, we take no position on what form of representation is used for the rows, as long as it is JSON data of some kind. It might be {"col1": "val1", "col2": "val2", ...} @@ -348,8 +348,8 @@ def load(cls, filename: str, **kwargs) -> Dict[str, List[AnyJsonData]]: raise NotImplementedError(f".load(...) 
is not implemented for {cls.__name__}.") # noQA @property - def tabnames(self) -> List[str]: - raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}..") # noQA + def tab_names(self) -> List[str]: + raise NotImplementedError(f".tab_names is not implemented for {self.__class__.__name__}..") # noQA class BasicTableSetManager(AbstractTableSetManager): @@ -365,25 +365,25 @@ class BasicTableSetManager(AbstractTableSetManager): def __init__(self, filename: str, **kwargs): super().__init__(**kwargs) self.filename: str = filename - self.headers_by_tabname: Dict[str, Headers] = {} - self.content_by_tabname: Dict[str, List[AnyJsonData]] = {} + self.headers_by_tab_name: Dict[str, Headers] = {} + self.content_by_tab_name: Dict[str, List[AnyJsonData]] = {} self.reader_agent: Any = self._get_reader_agent() - def tab_headers(self, tabname: str) -> Headers: - return self.headers_by_tabname[tabname] + def tab_headers(self, tab_name: str) -> Headers: + return self.headers_by_tab_name[tab_name] - def tab_content(self, tabname: str) -> List[AnyJsonData]: - return self.content_by_tabname[tabname] + def tab_content(self, tab_name: str) -> List[AnyJsonData]: + return self.content_by_tab_name[tab_name] @classmethod - def _create_tab_processor_state(cls, tabname: str) -> Any: + def _create_tab_processor_state(cls, tab_name: str) -> Any: """ This method provides for the possibility that some parsers will want auxiliary state, (such as parsed headers or a line count or a table of temporary names for objects to cross-link or some other such feature) that it carries with it as it moves from line to line parsing things. Subclasses might therefore want to make this do something more interesting. """ - ignored(tabname) # subclasses might need this, but we don't + ignored(tab_name) # subclasses might need this, but we don't return None def _get_reader_agent(self) -> Any: @@ -420,13 +420,13 @@ def load(cls, filename: str, **kwargs) -> AnyJsonData: def __init__(self, filename: str, **kwargs): super().__init__(filename=filename, **kwargs) - def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: + def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]: """ - Given a tabname and a state (returned by _sheet_loader_state), return a generator for a set of row values. + Given a tab_name and a state (returned by _sheet_loader_state), return a generator for a set of row values. """ - raise NotImplementedError(f"._rows_for_tabname(...) is not implemented for {self.__class__.__name__}.") # noQA + raise NotImplementedError(f"._rows_for_tab_name(...) is not implemented for {self.__class__.__name__}.") # noQA - def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData: + def _process_row(self, tab_name: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData: """ This needs to take a state and whatever represents a row and must return a list of objects representing column values. @@ -435,14 +435,14 @@ def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> A raise NotImplementedError(f"._process_row(...) 
is not implemented for {self.__class__.__name__}.") # noQA def load_content(self) -> AnyJsonData: - for tabname in self.tabnames: + for tab_name in self.tab_names: sheet_content = [] - state = self._create_tab_processor_state(tabname) - for row_data in self._raw_row_generator_for_tabname(tabname): - processed_row_data: AnyJsonData = self._process_row(tabname, state, row_data) + state = self._create_tab_processor_state(tab_name) + for row_data in self._raw_row_generator_for_tab_name(tab_name): + processed_row_data: AnyJsonData = self._process_row(tab_name, state, row_data) sheet_content.append(processed_row_data) - self.content_by_tabname[tabname] = sheet_content - return self.content_by_tabname + self.content_by_tab_name[tab_name] = sheet_content + return self.content_by_tab_name @classmethod def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: @@ -496,14 +496,14 @@ def _all_cols(cls, sheet: Worksheet): yield col @property - def tabnames(self) -> List[str]: + def tab_names(self) -> List[str]: return self.reader_agent.sheetnames def _get_reader_agent(self) -> Workbook: return openpyxl.load_workbook(self.filename) - def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: - sheet = self.reader_agent[tabname] + def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]: + sheet = self.reader_agent[tab_name] return (self._get_raw_row_content_tuple(sheet, row) for row in self._all_rows(sheet)) @@ -511,15 +511,15 @@ def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: return [sheet.cell(row=row, column=col).value for col in self._all_cols(sheet)] - def _create_tab_processor_state(self, tabname: str) -> Headers: - sheet = self.reader_agent[tabname] + def _create_tab_processor_state(self, tab_name: str) -> Headers: + sheet = self.reader_agent[tab_name] headers: Headers = [str(sheet.cell(row=1, column=col).value) for col in self._all_cols(sheet)] - self.headers_by_tabname[sheet.title] = headers + self.headers_by_tab_name[sheet.title] = headers return headers - def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: - ignored(tabname) + def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(tab_name) return {headers[i]: self.parse_cell_value(row_datum) for i, row_datum in enumerate(row_data)} @@ -544,8 +544,8 @@ def fetch_schema(schema_name): schema = self.fetch_schema(schema_name, portal_env=self.portal_env) return schema_name, schema if self.autoload_schemas and self.portal_env: - autoloaded = {tabname: schema - for tabname, schema in pmap(fetch_schema, schema_names)} + autoloaded = {tab_name: schema + for tab_name, schema in pmap(fetch_schema, schema_names)} return autoloaded else: return {} @@ -576,20 +576,20 @@ class ItemManagerMixin(SchemaAutoloadMixin, BasicTableSetManager): def __init__(self, filename: str, schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs): super().__init__(filename=filename, **kwargs) - self.patch_prototypes_by_tabname: Dict[str, Dict] = {} - self.parsed_headers_by_tabname: Dict[str, ParsedHeaders] = {} - self.type_hints_by_tabname: Dict[str, OptionalTypeHints] = {} - self.schemas = schemas or self.fetch_relevant_schemas(self.tabnames) + self.patch_prototypes_by_tab_name: Dict[str, Dict] = {} + self.parsed_headers_by_tab_name: Dict[str, ParsedHeaders] = {} + self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = {} + self.schemas = schemas or self.fetch_relevant_schemas(self.tab_names) 
self._instaguid_context_table: Dict[str, str] = {} - def sheet_patch_prototype(self, tabname: str) -> Dict: - return self.patch_prototypes_by_tabname[tabname] + def sheet_patch_prototype(self, tab_name: str) -> Dict: + return self.patch_prototypes_by_tab_name[tab_name] - def sheet_parsed_headers(self, tabname: str) -> ParsedHeaders: - return self.parsed_headers_by_tabname[tabname] + def sheet_parsed_headers(self, tab_name: str) -> ParsedHeaders: + return self.parsed_headers_by_tab_name[tab_name] - def sheet_type_hints(self, tabname: str) -> OptionalTypeHints: - return self.type_hints_by_tabname[tabname] + def sheet_type_hints(self, tab_name: str) -> OptionalTypeHints: + return self.type_hints_by_tab_name[tab_name] class SheetState: @@ -597,33 +597,33 @@ def __init__(self, parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints) self.parsed_headers = parsed_headers self.type_hints = type_hints - def _compile_type_hints(self, tabname: str): - parsed_headers = self.sheet_parsed_headers(tabname) - schema = self.schemas.get(tabname) + def _compile_type_hints(self, tab_name: str): + parsed_headers = self.sheet_parsed_headers(tab_name) + schema = self.schemas.get(tab_name) type_hints = [ItemTools.find_type_hint(parsed_header, schema) if schema else None for parsed_header in parsed_headers] - self.type_hints_by_tabname[tabname] = type_hints + self.type_hints_by_tab_name[tab_name] = type_hints - def _compile_sheet_headers(self, tabname: str): - headers = self.headers_by_tabname[tabname] + def _compile_sheet_headers(self, tab_name: str): + headers = self.headers_by_tab_name[tab_name] parsed_headers = ItemTools.parse_sheet_headers(headers) - self.parsed_headers_by_tabname[tabname] = parsed_headers + self.parsed_headers_by_tab_name[tab_name] = parsed_headers prototype = ItemTools.compute_patch_prototype(parsed_headers) - self.patch_prototypes_by_tabname[tabname] = prototype + self.patch_prototypes_by_tab_name[tab_name] = prototype - def _create_tab_processor_state(self, tabname: str) -> SheetState: - super()._create_tab_processor_state(tabname) + def _create_tab_processor_state(self, tab_name: str) -> SheetState: + super()._create_tab_processor_state(tab_name) # This will create state that allows us to efficiently assign values in the right place on each row # by setting up a prototype we can copy and then drop values into. 
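# (For example -- values are illustrative, following how ItemTools.parse_sheet_headers
#  and ItemTools.compute_patch_prototype behave: headers like ["name", "mother.age"]
#  parse to [['name'], ['mother', 'age']] and yield the prototype
#  {'name': None, 'mother': {'age': None}}, which each row is then deep-copied from.)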
- self._compile_sheet_headers(tabname) - self._compile_type_hints(tabname) - return self.SheetState(parsed_headers=self.sheet_parsed_headers(tabname), - type_hints=self.sheet_type_hints(tabname)) + self._compile_sheet_headers(tab_name) + self._compile_type_hints(tab_name) + return self.SheetState(parsed_headers=self.sheet_parsed_headers(tab_name), + type_hints=self.sheet_type_hints(tab_name)) - def _process_row(self, tabname: str, state: SheetState, row_data: SheetRow) -> AnyJsonData: + def _process_row(self, tab_name: str, state: SheetState, row_data: SheetRow) -> AnyJsonData: parsed_headers = state.parsed_headers type_hints = state.type_hints - patch_item = copy.deepcopy(self.sheet_patch_prototype(tabname)) + patch_item = copy.deepcopy(self.sheet_patch_prototype(tab_name)) for i, value in enumerate(row_data): parsed_value = self.parse_cell_value(value) type_hint = type_hints[i] @@ -653,7 +653,7 @@ def __init__(self, tab_name: Optional[str] = None, **kwargs): self.tab_name = tab_name or self.DEFAULT_TAB_NAME @property - def tabnames(self) -> List[str]: + def tab_names(self) -> List[str]: return [self.tab_name] @@ -677,22 +677,22 @@ def _get_reader_agent_for_filename(cls, filename) -> CsvReader: PAD_TRAILING_TABS = True - def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: - headers = self.tab_headers(tabname) + def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]: + headers = self.tab_headers(tab_name) n_headers = len(headers) for row_data in self.reader_agent: if self.PAD_TRAILING_TABS: row_data = pad_to(n_headers, row_data, padding='') yield row_data - def _create_tab_processor_state(self, tabname: str) -> Headers: - headers: Optional[Headers] = self.headers_by_tabname.get(tabname) + def _create_tab_processor_state(self, tab_name: str) -> Headers: + headers: Optional[Headers] = self.headers_by_tab_name.get(tab_name) if headers is None: - self.headers_by_tabname[tabname] = headers = self.reader_agent.__next__() + self.headers_by_tab_name[tab_name] = headers = self.reader_agent.__next__() return headers - def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: - ignored(tabname) + def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(tab_name) return {headers[i]: self.parse_cell_value(row_datum) for i, row_datum in enumerate(row_data)} From 04eb58c6623354162aad1ca307313d925801f2c2 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 31 Aug 2023 02:27:16 -0400 Subject: [PATCH 041/101] Add support for reading inserts dirs, .json, .jsonl (two formats), and .tabs.json --- dcicutils/sheet_utils.py | 201 ++++++++++++++---- test/data_files/sample_items.tabs.json | 74 +++++++ test/data_files/sample_items2.json | 6 + test/data_files/sample_items_sheet2a.jsonl | 3 + test/data_files/sample_items_sheet2b.jsonl | 2 + test/data_files/sample_items_sheet2b1.jsonl | 3 + test/data_files/sample_items_sheet2b2.jsonl | 2 + .../sample_items_sheet_2.tsv.README.text | 4 - test/test_sheet_utils.py | 70 +++++- 9 files changed, 306 insertions(+), 59 deletions(-) create mode 100644 test/data_files/sample_items.tabs.json create mode 100644 test/data_files/sample_items2.json create mode 100644 test/data_files/sample_items_sheet2a.jsonl create mode 100644 test/data_files/sample_items_sheet2b.jsonl create mode 100644 test/data_files/sample_items_sheet2b1.jsonl create mode 100644 test/data_files/sample_items_sheet2b2.jsonl delete mode 100644 
test/data_files/sample_items_sheet_2.tsv.README.text diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index e3c7e7f0d..4777c26b9 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -1,16 +1,19 @@ import chardet import copy import csv +import glob import io +import json import openpyxl import os +import re import uuid from dcicutils.common import AnyJsonData from dcicutils.env_utils import public_env_name, EnvUtils from dcicutils.ff_utils import get_schema from dcicutils.lang_utils import conjoined_list, disjoined_list, maybe_pluralize -from dcicutils.misc_utils import ignored, PRINT, pad_to +from dcicutils.misc_utils import ignored, PRINT, pad_to, JsonLinesReader from dcicutils.task_utils import pmap from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook @@ -82,16 +85,16 @@ def prefer_number(value: SheetCellValue): return value -def open_text_input_file_respecting_byte_order_mark(filename): +def open_unicode_text_input_file_respecting_byte_order_mark(filename): """ Opens a file for text input, respecting a byte-order mark (BOM). """ with io.open(filename, 'rb') as fp: leading_bytes = fp.read(4 * 8) # 4 bytes is all we need - bom_info = chardet.detect(leading_bytes) + bom_info = chardet.detect(leading_bytes, should_rename_legacy=True) detected_encoding = bom_info and bom_info.get('encoding') # tread lightly - - return io.open(filename, 'r', encoding=detected_encoding) + use_encoding = 'utf-8' if detected_encoding == 'ascii' else detected_encoding + return io.open(filename, 'r', encoding=use_encoding) class TypeHint: @@ -303,6 +306,10 @@ def finder(subheader, subschema): return finder(subheader=parsed_header, subschema=schema) + @classmethod + def infer_tab_name(cls, filename): + return os.path.basename(filename).split('.')[0] + # TODO: Consider whether this might want to be an abstract base class. Some change might be needed. 
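The first-dot split in infer_tab_name matters for multi-dot names; a quick, purely illustrative comparison against os.path.splitext shows why the simpler split is used.

import os

filename = "some/dir/ExperimentSet.tab.json"
print(os.path.splitext(os.path.basename(filename))[0])  # 'ExperimentSet.tab' (only the last suffix stripped)
print(os.path.basename(filename).split('.')[0])         # 'ExperimentSet' (everything before the first dot)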
# @@ -414,7 +421,7 @@ def load(cls, filename: str, **kwargs) -> AnyJsonData: raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only" f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}") - table_set_manager: TableSetManager = cls(filename, **kwargs) + table_set_manager: TableSetManager = cls(filename=filename, **kwargs) return table_set_manager.load_content() def __init__(self, filename: str, **kwargs): @@ -451,30 +458,46 @@ def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: class TableSetManagerRegistry: - ALL_TABLE_SET_MANAGERS: Dict[str, Type[TableSetManager]] = {} + ALL_TABLE_SET_MANAGERS: Dict[str, Type['ItemManagerMixin']] = {} + ALL_TABLE_SET_REGEXP_MAPPINGS = [] @classmethod - def register(cls, class_to_register: Type[TableSetManager]): - for ext in class_to_register.ALLOWED_FILE_EXTENSIONS: - existing = cls.ALL_TABLE_SET_MANAGERS.get(ext) - if existing: - raise Exception(f"Tried to define {class_to_register} to extension {ext}," - f" but {existing} already claimed that.") - cls.ALL_TABLE_SET_MANAGERS[ext] = class_to_register - return class_to_register + def register(cls, regexp=None): + def _wrapped_register(class_to_register: Type['ItemManagerMixin']): + if regexp: + cls.ALL_TABLE_SET_REGEXP_MAPPINGS.append((re.compile(regexp), class_to_register)) + for ext in class_to_register.ALLOWED_FILE_EXTENSIONS: + existing = cls.ALL_TABLE_SET_MANAGERS.get(ext) + if existing: + raise Exception(f"Tried to define {class_to_register} to extension {ext}," + f" but {existing} already claimed that.") + cls.ALL_TABLE_SET_MANAGERS[ext] = class_to_register + return class_to_register + return _wrapped_register @classmethod - def manager_for_filename(cls, filename: str) -> Type[TableSetManager]: - base = os.path.basename(filename) - dotparts = base.split('.') - while dotparts: - suffix = f".{'.'.join(dotparts)}" - found = cls.ALL_TABLE_SET_MANAGERS.get(suffix) - if found: - return found - dotparts = dotparts[1:] + def manager_for_filename(cls, filename: str) -> Type['ItemManagerMixin']: + base: str = os.path.basename(filename) + suffix_parts = base.split('.')[1:] + if suffix_parts: + for i in range(0, len(suffix_parts)): + suffix = f".{'.'.join(suffix_parts[i:])}" + found = cls.ALL_TABLE_SET_MANAGERS.get(suffix) + if found: + return found + else: + special_case: Optional[Type[ItemManagerMixin]] = cls.manager_for_special_filename(filename) + if special_case: + return special_case raise LoadArgumentsError(f"Unknown file type: {filename}") + @classmethod + def manager_for_special_filename(cls, filename: str) -> Optional[Type['ItemManagerMixin']]: + for pattern, manager_class in cls.ALL_TABLE_SET_REGEXP_MAPPINGS: + if pattern.match(filename): + return manager_class + return None + class XlsxManager(TableSetManager): """ @@ -530,13 +553,16 @@ class SchemaAutoloadMixin(AbstractTableSetManager): CACHE_SCHEMAS = True # Controls whether we're doing caching at all AUTOLOAD_SCHEMAS_DEFAULT = True - def __init__(self, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, **kwargs): - if portal_env is None: - portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) - PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.") - super().__init__(**kwargs) + def __init__(self, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, + **kwargs): + # This setup must be in place before the class initialization is done (via the super call). 
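The registry lookup added in this patch tries the longest dotted suffix first, so a ".tabs.json" handler wins over a plain ".json" one. A standalone sketch of that matching order follows; the HANDLERS table is invented for illustration and is not the real registry.

HANDLERS = {".tabs.json": "TabbedJsonInsertsItemManager", ".json": "SimpleJsonInsertsItemManager"}

def lookup(filename):
    parts = filename.split("/")[-1].split(".")[1:]   # all dotted suffix parts
    for i in range(len(parts)):
        suffix = "." + ".".join(parts[i:])           # longest suffix first, then shorter ones
        if suffix in HANDLERS:
            return HANDLERS[suffix]
    return None

print(lookup("xyz/sample_items.tabs.json"))  # TabbedJsonInsertsItemManager
print(lookup("xyz/sample_items2.json"))      # SimpleJsonInsertsItemManager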
self.autoload_schemas: bool = self.AUTOLOAD_SCHEMAS_DEFAULT if autoload_schemas is None else autoload_schemas + if self.autoload_schemas: + if portal_env is None: + portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) + PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.") self.portal_env: Optional[str] = portal_env + super().__init__(**kwargs) def fetch_relevant_schemas(self, schema_names: List[str]): # The schema_names argument is not normally given, but it is there for easier testing @@ -636,7 +662,7 @@ def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: return ItemTools.parse_item_value(value, context=self._instaguid_context_table) -@TableSetManagerRegistry.register +@TableSetManagerRegistry.register() class XlsxItemManager(ItemManagerMixin, XlsxManager): """ This layers item-style row processing functionality on an XLSX file. @@ -646,15 +672,103 @@ class XlsxItemManager(ItemManagerMixin, XlsxManager): class SingleTableMixin(AbstractTableSetManager): - DEFAULT_TAB_NAME = 'Sheet1' + def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs): + self._tab_name = tab_name or ItemTools.infer_tab_name(filename) + super().__init__(filename=filename, **kwargs) - def __init__(self, tab_name: Optional[str] = None, **kwargs): - super().__init__(**kwargs) - self.tab_name = tab_name or self.DEFAULT_TAB_NAME + @property + def tab_names(self) -> List[str]: + return [self._tab_name] + + +class _JsonInsertsDataItemManager(ItemManagerMixin, BasicTableSetManager): + + AUTOLOAD_SCHEMAS_DEFAULT = False + + ALLOWED_FILE_EXTENSIONS = [] + + def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: + raise NotImplementedError(f"._load_json_data() is not implemented for {cls.__name__}.") # noQA @property def tab_names(self) -> List[str]: - return [self.tab_name] + return list(self.content_by_tab_name.keys()) + + def _get_reader_agent(self) -> Any: + return self + + def load_content(self) -> Dict[str, AnyJsonData]: + data = self._load_json_data(self.filename) + for tab_name, tab_content in data.items(): + self.content_by_tab_name[tab_name] = tab_content + if not tab_content: + self.headers_by_tab_name[tab_name] = [] + else: + self.headers_by_tab_name[tab_name] = list(tab_content[0].keys()) + return self.content_by_tab_name + + +@TableSetManagerRegistry.register() +class TabbedJsonInsertsItemManager(_JsonInsertsDataItemManager): + + ALLOWED_FILE_EXTENSIONS = [".tabs.json"] # If you want them all in one family, use this extension + + def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: + data = json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + if (not isinstance(data, dict) + or not all(isinstance(tab_name, str) for tab_name in data.keys()) + or not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) + for content in data.values())): + raise ValueError(f"Data in {filename} is not of type Dict[str, List[dict]].") + return data + + +@TableSetManagerRegistry.register() +class SimpleJsonInsertsItemManager(SingleTableMixin, _JsonInsertsDataItemManager): + + ALLOWED_FILE_EXTENSIONS = [".json"] # If you want them all in one family, use this extension + + def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: + data = {self._tab_name: json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename))} + if not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) + for content in data.values()): + raise 
ValueError(f"Data in {filename} is not of type List[dict].") + return data + + +@TableSetManagerRegistry.register() +class SimpleJsonLinesInsertsItemManager(SingleTableMixin, _JsonInsertsDataItemManager): + + ALLOWED_FILE_EXTENSIONS = [".jsonl"] # If you want them all in one family, use this extension + + def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: + content = [line for line in JsonLinesReader(open_unicode_text_input_file_respecting_byte_order_mark(filename))] + data = {self._tab_name: content} + if not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) + for content in data.values()): + raise ValueError(f"Data in {filename} is not of type List[dict].") + return data + + +@TableSetManagerRegistry.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") +class InsertsItemManager(_JsonInsertsDataItemManager): + + ALLOWED_FILE_EXTENSIONS = [] + + def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: + if not os.path.isdir(filename): + raise LoadArgumentsError(f"{filename} is not the name of an inserts directory.") + tab_files = glob.glob(os.path.join(filename, "*.json")) + data = {} + for tab_file in tab_files: + tab_content = json.load(open_unicode_text_input_file_respecting_byte_order_mark(tab_file)) + # Here we don't use os.path.splitext because we want to split on the first dot. + # e.g., for foo.bar.baz, return just foo + # this allows names like ExperimentSet.tab.json that might need to use multi-dot suffixes + # for things unrelated to the tab name. + tab_name = os.path.basename(tab_file).split('.')[0] + data[tab_name] = tab_content + return data class CsvManager(SingleTableMixin, TableSetManager): @@ -665,15 +779,12 @@ class CsvManager(SingleTableMixin, TableSetManager): ALLOWED_FILE_EXTENSIONS = ['.csv'] - def __init__(self, filename: str, **kwargs): - super().__init__(filename=filename, **kwargs) - def _get_reader_agent(self) -> CsvReader: return self._get_reader_agent_for_filename(self.filename) @classmethod def _get_reader_agent_for_filename(cls, filename) -> CsvReader: - return csv.reader(open_text_input_file_respecting_byte_order_mark(filename)) + return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename)) PAD_TRAILING_TABS = True @@ -697,7 +808,7 @@ def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> A for i, row_datum in enumerate(row_data)} -@TableSetManagerRegistry.register +@TableSetManagerRegistry.register() class CsvItemManager(ItemManagerMixin, CsvManager): """ This layers item-style row processing functionality on a CSV file. @@ -718,7 +829,7 @@ def __init__(self, filename: str, escaping: Optional[bool] = None, **kwargs): @classmethod def _get_reader_agent_for_filename(cls, filename) -> CsvReader: - return csv.reader(open_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') + return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: if self.escaping and isinstance(value, str) and '\\' in value: @@ -750,7 +861,7 @@ def expand_escape_sequences(cls, text: str) -> str: return s.getvalue() -@TableSetManagerRegistry.register +@TableSetManagerRegistry.register() class TsvItemManager(ItemManagerMixin, TsvManager): """ This layers item-style row processing functionality on a TSV file. 
@@ -767,7 +878,7 @@ class ItemManager(AbstractTableSetManager): @classmethod def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager: reader_agent_class = TableSetManagerRegistry.manager_for_filename(filename) - reader_agent = reader_agent_class(filename, **kwargs) + reader_agent = reader_agent_class(filename=filename, **kwargs) return reader_agent @classmethod @@ -780,8 +891,8 @@ def load(cls, filename: str, """ Given a filename and various options """ - manager = cls.create_implementation_manager(filename, tab_name=tab_name, escaping=escaping, schemas=schemas, - autoload_schemas=autoload_schemas, **kwargs) + manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, + schemas=schemas, autoload_schemas=autoload_schemas, **kwargs) return manager.load_content() diff --git a/test/data_files/sample_items.tabs.json b/test/data_files/sample_items.tabs.json new file mode 100644 index 000000000..f972245f0 --- /dev/null +++ b/test/data_files/sample_items.tabs.json @@ -0,0 +1,74 @@ +{ + "Sheet1": [ + { + "x": 1, + "y": { + "a": 1, + "z": 1 + } + }, + { + "x": 1, + "y": { + "a": 2, + "z": 3 + } + }, + { + "x": "alpha", + "y": { + "a": "beta", + "z": [ + "gamma", + "delta" + ] + } + } + ], + "Sheet2": [ + { + "name": "bill", + "age": 23, + "mother": { + "name": "mary", + "age": 58 + }, + "father": { + "name": "fred", + "age": 63 + }, + "friends": [ + { + "name": "sam", + "age": 22 + }, + { + "name": "arthur", + "age": 19 + } + ] + }, + { + "name": "joe", + "age": 9, + "mother": { + "name": "estrella", + "age": 35 + }, + "father": { + "name": "anthony", + "age": 34 + }, + "friends": [ + { + "name": "anders", + "age": 9 + }, + { + "name": null, + "age": null + } + ] + } + ] +} diff --git a/test/data_files/sample_items2.json b/test/data_files/sample_items2.json new file mode 100644 index 000000000..7e084f908 --- /dev/null +++ b/test/data_files/sample_items2.json @@ -0,0 +1,6 @@ +[ + {"name": "john", "sex": "Male", "member": false}, + {"name": "juan", "sex": "Male", "member": true}, + {"name": "igor", "sex": "unknown", "member": null}, + {"name": "mary", "sex": "Female", "member": true} +] diff --git a/test/data_files/sample_items_sheet2a.jsonl b/test/data_files/sample_items_sheet2a.jsonl new file mode 100644 index 000000000..a0e96e83e --- /dev/null +++ b/test/data_files/sample_items_sheet2a.jsonl @@ -0,0 +1,3 @@ +["name", "age", "mother.name", "mother.age", "father.name", "father.age", "friends#0.name", "friends#0.age", "friends#1.name", "friends#1.age"] +["bill", 23, "mary", 58, "fred", 63, "sam", 22, "arthur", 19] +["joe", 9, "estrella", 35, "anthony", 34, "anders", 9] diff --git a/test/data_files/sample_items_sheet2b.jsonl b/test/data_files/sample_items_sheet2b.jsonl new file mode 100644 index 000000000..c044bfe18 --- /dev/null +++ b/test/data_files/sample_items_sheet2b.jsonl @@ -0,0 +1,2 @@ +{"name": "bill", "age": 23, "mother.name": "mary", "mother.age": 58, "father.name": "fred", "father.age": 63, "friends#0.name": "sam", "friends#0.age": 22, "friends#1.name": "arthur", "friends#1.age": 19} +{"name": "joe", "age": 9, "mother.name": "estrella", "mother.age": 35, "father.name": "anthony", "father.age": 34, "friends#0.name": "anders", "friends#0.age": 9} diff --git a/test/data_files/sample_items_sheet2b1.jsonl b/test/data_files/sample_items_sheet2b1.jsonl new file mode 100644 index 000000000..8f5c3345b --- /dev/null +++ b/test/data_files/sample_items_sheet2b1.jsonl @@ -0,0 +1,3 @@ +["name", "age", "mother", "father", "friends"] 
+["bill", 23, {"name": "mary", "age": 58}, {"name": "fred", "age": 63}, [{"name": "sam", "age": 22}, {"name": "arthur", "age": 19}]] +["joe", 9, {"name": "estrella", "age": 35}, {"name": "anthony", "age": 34}, [{"name": "anders", "age": 9}]] diff --git a/test/data_files/sample_items_sheet2b2.jsonl b/test/data_files/sample_items_sheet2b2.jsonl new file mode 100644 index 000000000..1ef8d9f11 --- /dev/null +++ b/test/data_files/sample_items_sheet2b2.jsonl @@ -0,0 +1,2 @@ +{"name": "bill", "age": 23, "mother": {"name": "mary", "age": 58}, "father": {"name": "fred", "age": 63}, "friends": [{"name": "sam", "age": 22}, {"name": "arthur", "age": 19}]} +{"name": "joe", "age": 9, "mother": {"name": "estrella", "age": 35}, "father": {"name": "anthony", "age": 34}, "friends": [{"name": "anders", "age": 9}]} \ No newline at end of file diff --git a/test/data_files/sample_items_sheet_2.tsv.README.text b/test/data_files/sample_items_sheet_2.tsv.README.text deleted file mode 100644 index efefaf654..000000000 --- a/test/data_files/sample_items_sheet_2.tsv.README.text +++ /dev/null @@ -1,4 +0,0 @@ -Note that one of the lines in file sample_items_sheet_2.tsv has two blank fields at end of line. -PyCharm and perhaps other editors "helpfully" removes trailing whitespace from lines, -so the number of columns varies line-to-line. Instead of insisting on explicit tabs at end of line, -we pad such short lines with nulls when reading from the file. diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index ae3096632..df30905db 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -10,7 +10,7 @@ from dcicutils.qa_utils import printed_output from dcicutils.sheet_utils import ( # High-level interfaces - ItemManager, load_items, + ItemManager, load_items, TableSetManagerRegistry, # Low-level implementation BasicTableSetManager, SchemaAutoloadMixin, ItemTools, XlsxManager, XlsxItemManager, @@ -28,6 +28,9 @@ from .helpers import using_fresh_ff_state_for_testing +TEST_SHEET_1 = 'Sheet1' + + def test_load_failure(): sample_message = "This is a test." 
@@ -127,6 +130,13 @@ def test_item_tools_parse_sheet_headers(): assert ItemTools.parse_sheet_headers(input) == expected +def test_item_tools_infer_tab_name(): + + assert ItemTools.infer_tab_name('some/dir/some') == 'some' + assert ItemTools.infer_tab_name('some/dir/some.file') == 'some' + assert ItemTools.infer_tab_name('some/dir/some.file.name') == 'some' + + @pytest.mark.parametrize('parsed_headers,expected_prototype', [ (['a'], {'a': None}), @@ -288,6 +298,15 @@ def test_item_tools_find_type_hint(): assert actual is None +def test_table_set_manager_registry_manager_for_filename(): + + assert TableSetManagerRegistry.manager_for_filename("xyz/foo.csv") == CsvItemManager + + with pytest.raises(Exception) as exc: + TableSetManagerRegistry.manager_for_filename("xyz/foo.something.missing") + assert str(exc.value) == "Unknown file type: xyz/foo.something.missing" + + SAMPLE_XLSX_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx') SAMPLE_XLSX_FILE_RAW_CONTENT = { @@ -344,15 +363,23 @@ def test_item_tools_find_type_hint(): SAMPLE_CSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.csv') -SAMPLE_CSV_FILE_RAW_CONTENT = {CsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} +SAMPLE_CSV_FILE_SHEET_NAME = ItemTools.infer_tab_name(SAMPLE_CSV_FILE) + +SAMPLE_CSV_FILE_RAW_CONTENT = {SAMPLE_CSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} -SAMPLE_CSV_FILE_ITEM_CONTENT = {CsvItemManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} +SAMPLE_CSV_FILE_ITEM_CONTENT = {SAMPLE_CSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} SAMPLE_TSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.tsv') -SAMPLE_TSV_FILE_RAW_CONTENT = {TsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} +SAMPLE_TSV_FILE_SHEET_NAME = ItemTools.infer_tab_name(SAMPLE_TSV_FILE) + +SAMPLE_TSV_FILE_RAW_CONTENT = {SAMPLE_TSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} + +SAMPLE_TSV_FILE_ITEM_CONTENT = {SAMPLE_TSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} + +SAMPLE_JSON_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.tabs.json') -SAMPLE_TSV_FILE_ITEM_CONTENT = {TsvItemManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} +SAMPLE_JSON_FILE_ITEM_CONTENT = SAMPLE_XLSX_FILE_ITEM_CONTENT def test_xlsx_manager_load_content(): @@ -485,6 +512,12 @@ def test_item_manager_load(): assert ItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT + loaded = ItemManager.load(SAMPLE_JSON_FILE, autoload_schemas=False) + print("loaded=", json.dumps(loaded, indent=2)) + expected = SAMPLE_JSON_FILE_ITEM_CONTENT + print("expected=", json.dumps(expected, indent=2)) + assert loaded == expected + with pytest.raises(LoadArgumentsError) as exc: ItemManager.load("something.else") assert str(exc.value) == "Unknown file type: something.else" @@ -501,6 +534,10 @@ def test_load_items(): assert str(exc.value) == "Unknown file type: something.else" +SAMPLE_CSV_FILE2 = os.path.join(TEST_DIR, 'data_files/sample_items2.csv') + +SAMPLE_CSV_FILE2_SHEET_NAME = ItemTools.infer_tab_name(SAMPLE_CSV_FILE2) + SAMPLE_CSV_FILE2_SCHEMAS = { "Person": { "type": "object", @@ -513,7 +550,7 @@ def test_load_items(): } SAMPLE_CSV_FILE2_CONTENT = { - CsvManager.DEFAULT_TAB_NAME: [ + SAMPLE_CSV_FILE2_SHEET_NAME: [ {"name": "john", "sex": "M", "member": "false"}, {"name": "juan", "sex": "male", "member": "true"}, {"name": "igor", "sex": "unknown", "member": None}, @@ -522,7 +559,7 @@ def test_load_items(): } 
SAMPLE_CSV_FILE2_ITEM_CONTENT = { - CsvItemManager.DEFAULT_TAB_NAME: [ + SAMPLE_CSV_FILE2_SHEET_NAME: [ {"name": "john", "sex": "M", "member": False}, {"name": "juan", "sex": "male", "member": True}, {"name": "igor", "sex": "unknown", "member": None}, @@ -539,7 +576,11 @@ def test_load_items(): ] } -SAMPLE_CSV_FILE2 = os.path.join(TEST_DIR, 'data_files/sample_items2.csv') + +SAMPLE_JSON_FILE2 = os.path.join(TEST_DIR, 'data_files/sample_items2.json') + +SAMPLE_JSON_FILE2_SHEET_NAME = ItemTools.infer_tab_name(SAMPLE_JSON_FILE2) + SAMPLE_CSV_FILE3_SCHEMAS = { "Person": { @@ -652,6 +693,15 @@ def test_load_items_with_schema(): assert actual == expected +def test_sample_items_csv_vs_json(): + + csv_content = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') + + json_content = load_items(SAMPLE_JSON_FILE2, tab_name="Person") + + assert csv_content == json_content + + @pytest.mark.parametrize('instaguids_enabled', [True, False]) def test_load_items_with_schema_and_instaguids(instaguids_enabled): @@ -738,7 +788,7 @@ def test_schema_autoload_mixin_fetch_relevant_schemas(autoload_schemas, cache_sc with local_attrs(SchemaAutoloadMixin, CACHE_SCHEMAS=cache_schemas): with schema_autoloader_for_testing(portal_env=portal_env, autoload_schemas=autoload_schemas) as autoloader: - assert autoloader.portal_env == 'data' + assert autoloader.portal_env == ('data' if autoload_schemas or portal_env else None) if autoload_schemas: @@ -751,7 +801,7 @@ def test_schema_autoload_mixin_fetch_relevant_schemas(autoload_schemas, cache_sc assert autoloader.fetch_relevant_schemas(['User', 'Lab']) == {} - if portal_env == 'data': + if portal_env == 'data' or not autoload_schemas: assert printed.lines == [] else: assert printed.lines == [ From ce9f9bc830d1405fa25bce68ce85845babe2ed1d Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 31 Aug 2023 02:28:21 -0400 Subject: [PATCH 042/101] Bump beta version. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b3e907b9d..3da73b78b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.9.0.1b3" # to become "7.10.0" +version = "7.9.0.1b4" # to become "7.10.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 0ea5b62ed40ef9f9b47cd7190987dd6bc5e51f05 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 31 Aug 2023 09:22:42 -0400 Subject: [PATCH 043/101] Add yaml formats. 
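YAML can carry the same tabbed structure as the .tabs.json format, which is why the new loaders only need to swap the parser; a quick standalone check with invented data:

import json
import yaml

tabs = {"Sheet1": [{"x": 1, "y": {"a": 1, "z": 1}}]}
assert yaml.safe_load(json.dumps(tabs)) == tabs      # for data like this, JSON reads back as YAML
print(yaml.safe_dump(tabs, default_flow_style=False))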
--- dcicutils/sheet_utils.py | 30 +++++++++++++++--- test/data_files/sample_items.tabs.yaml | 42 ++++++++++++++++++++++++++ test/test_sheet_utils.py | 19 +++++++++--- 3 files changed, 83 insertions(+), 8 deletions(-) create mode 100644 test/data_files/sample_items.tabs.yaml diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 4777c26b9..3ca07e098 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -8,6 +8,7 @@ import os import re import uuid +import yaml from dcicutils.common import AnyJsonData from dcicutils.env_utils import public_env_name, EnvUtils @@ -687,6 +688,9 @@ class _JsonInsertsDataItemManager(ItemManagerMixin, BasicTableSetManager): ALLOWED_FILE_EXTENSIONS = [] + def _parser(self, filename): + return json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: raise NotImplementedError(f"._load_json_data() is not implemented for {cls.__name__}.") # noQA @@ -714,7 +718,7 @@ class TabbedJsonInsertsItemManager(_JsonInsertsDataItemManager): ALLOWED_FILE_EXTENSIONS = [".tabs.json"] # If you want them all in one family, use this extension def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: - data = json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + data = self._parser(filename) if (not isinstance(data, dict) or not all(isinstance(tab_name, str) for tab_name in data.keys()) or not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) @@ -723,23 +727,41 @@ def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: return data +@TableSetManagerRegistry.register() +class TabbedYamlInsertsItemManager(TabbedJsonInsertsItemManager): + + ALLOWED_FILE_EXTENSIONS = [".tabs.yaml"] + + def _parser(self, filename): + return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + + @TableSetManagerRegistry.register() class SimpleJsonInsertsItemManager(SingleTableMixin, _JsonInsertsDataItemManager): - ALLOWED_FILE_EXTENSIONS = [".json"] # If you want them all in one family, use this extension + ALLOWED_FILE_EXTENSIONS = [".json"] def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: - data = {self._tab_name: json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename))} + data = {self._tab_name: self._parser(filename)} if not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) for content in data.values()): raise ValueError(f"Data in {filename} is not of type List[dict].") return data +@TableSetManagerRegistry.register() +class SimpleYamlInsertsItemManager(SimpleJsonInsertsItemManager): + + ALLOWED_FILE_EXTENSIONS = [".yaml"] + + def _parser(self, filename): + return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + + @TableSetManagerRegistry.register() class SimpleJsonLinesInsertsItemManager(SingleTableMixin, _JsonInsertsDataItemManager): - ALLOWED_FILE_EXTENSIONS = [".jsonl"] # If you want them all in one family, use this extension + ALLOWED_FILE_EXTENSIONS = [".jsonl"] def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: content = [line for line in JsonLinesReader(open_unicode_text_input_file_respecting_byte_order_mark(filename))] diff --git a/test/data_files/sample_items.tabs.yaml b/test/data_files/sample_items.tabs.yaml new file mode 100644 index 000000000..f98d9259b --- /dev/null +++ b/test/data_files/sample_items.tabs.yaml @@ -0,0 +1,42 @@ +Sheet1: +- x: 1 
+ y: + a: 1 + z: 1 +- x: 1 + y: + a: 2 + z: 3 +- x: alpha + y: + a: beta + z: + - gamma + - delta +Sheet2: +- age: 23 + father: + age: 63 + name: fred + friends: + - age: 22 + name: sam + - age: 19 + name: arthur + mother: + age: 58 + name: mary + name: bill +- age: 9 + father: + age: 34 + name: anthony + friends: + - age: 9 + name: anders + - age: null + name: null + mother: + age: 35 + name: estrella + name: joe diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index df30905db..78c9816eb 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -377,9 +377,13 @@ def test_table_set_manager_registry_manager_for_filename(): SAMPLE_TSV_FILE_ITEM_CONTENT = {SAMPLE_TSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} -SAMPLE_JSON_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.tabs.json') +SAMPLE_JSON_TABS_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.tabs.json') -SAMPLE_JSON_FILE_ITEM_CONTENT = SAMPLE_XLSX_FILE_ITEM_CONTENT +SAMPLE_JSON_TABS_FILE_ITEM_CONTENT = SAMPLE_XLSX_FILE_ITEM_CONTENT + +SAMPLE_YAML_TABS_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.tabs.yaml') + +SAMPLE_YAML_TABS_FILE_ITEM_CONTENT = SAMPLE_XLSX_FILE_ITEM_CONTENT def test_xlsx_manager_load_content(): @@ -512,9 +516,9 @@ def test_item_manager_load(): assert ItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT - loaded = ItemManager.load(SAMPLE_JSON_FILE, autoload_schemas=False) + loaded = ItemManager.load(SAMPLE_JSON_TABS_FILE, autoload_schemas=False) print("loaded=", json.dumps(loaded, indent=2)) - expected = SAMPLE_JSON_FILE_ITEM_CONTENT + expected = SAMPLE_JSON_TABS_FILE_ITEM_CONTENT print("expected=", json.dumps(expected, indent=2)) assert loaded == expected @@ -702,6 +706,13 @@ def test_sample_items_csv_vs_json(): assert csv_content == json_content +def test_sample_items_json_vs_yaml(): + + tabs_data_from_json = load_items(SAMPLE_JSON_TABS_FILE) + tabs_data_from_yaml = load_items(SAMPLE_YAML_TABS_FILE) + assert tabs_data_from_json == tabs_data_from_yaml + + @pytest.mark.parametrize('instaguids_enabled', [True, False]) def test_load_items_with_schema_and_instaguids(instaguids_enabled): From bcc1128094ba97a51c1b15809967b14ac925ae43 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 1 Sep 2023 12:09:35 -0400 Subject: [PATCH 044/101] Add class AbstractItemManager. Rename InsertsItemManager to InsertsDirectoryItemManager. Rename _JsonInsertsDataItemManager to InsertsItemManager. --- dcicutils/sheet_utils.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 3ca07e098..5611cf2be 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -343,6 +343,8 @@ class AbstractTableSetManager: happen is not constrained by this class. """ + ALLOWED_FILE_EXTENSIONS: List[str] = [] + def __init__(self, **kwargs): unwanted_kwargs(context=self.__class__.__name__, kwargs=kwargs) @@ -368,8 +370,6 @@ class BasicTableSetManager(AbstractTableSetManager): of this where there's only one set of headers and only one block of content. 
""" - ALLOWED_FILE_EXTENSIONS: List[str] = [] - def __init__(self, filename: str, **kwargs): super().__init__(**kwargs) self.filename: str = filename @@ -457,14 +457,19 @@ def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: return prefer_number(value) +class AbstractItemManager(AbstractTableSetManager): + + pass + + class TableSetManagerRegistry: - ALL_TABLE_SET_MANAGERS: Dict[str, Type['ItemManagerMixin']] = {} + ALL_TABLE_SET_MANAGERS: Dict[str, Type[AbstractItemManager]] = {} ALL_TABLE_SET_REGEXP_MAPPINGS = [] @classmethod def register(cls, regexp=None): - def _wrapped_register(class_to_register: Type['ItemManagerMixin']): + def _wrapped_register(class_to_register: Type[AbstractItemManager]): if regexp: cls.ALL_TABLE_SET_REGEXP_MAPPINGS.append((re.compile(regexp), class_to_register)) for ext in class_to_register.ALLOWED_FILE_EXTENSIONS: @@ -477,7 +482,7 @@ def _wrapped_register(class_to_register: Type['ItemManagerMixin']): return _wrapped_register @classmethod - def manager_for_filename(cls, filename: str) -> Type['ItemManagerMixin']: + def manager_for_filename(cls, filename: str) -> Type[AbstractItemManager]: base: str = os.path.basename(filename) suffix_parts = base.split('.')[1:] if suffix_parts: @@ -487,13 +492,13 @@ def manager_for_filename(cls, filename: str) -> Type['ItemManagerMixin']: if found: return found else: - special_case: Optional[Type[ItemManagerMixin]] = cls.manager_for_special_filename(filename) + special_case: Optional[Type[AbstractItemManager]] = cls.manager_for_special_filename(filename) if special_case: return special_case raise LoadArgumentsError(f"Unknown file type: {filename}") @classmethod - def manager_for_special_filename(cls, filename: str) -> Optional[Type['ItemManagerMixin']]: + def manager_for_special_filename(cls, filename: str) -> Optional[Type[AbstractItemManager]]: for pattern, manager_class in cls.ALL_TABLE_SET_REGEXP_MAPPINGS: if pattern.match(filename): return manager_class @@ -595,7 +600,7 @@ def clear_schema_cache(cls): cls.SCHEMA_CACHE.pop(key, None) -class ItemManagerMixin(SchemaAutoloadMixin, BasicTableSetManager): +class ItemManagerMixin(SchemaAutoloadMixin, AbstractItemManager, BasicTableSetManager): """ This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows get handled like Items instead of just flat table rows. 
@@ -682,7 +687,7 @@ def tab_names(self) -> List[str]: return [self._tab_name] -class _JsonInsertsDataItemManager(ItemManagerMixin, BasicTableSetManager): +class InsertsItemManager(ItemManagerMixin, BasicTableSetManager): AUTOLOAD_SCHEMAS_DEFAULT = False @@ -713,7 +718,7 @@ def load_content(self) -> Dict[str, AnyJsonData]: @TableSetManagerRegistry.register() -class TabbedJsonInsertsItemManager(_JsonInsertsDataItemManager): +class TabbedJsonInsertsItemManager(InsertsItemManager): ALLOWED_FILE_EXTENSIONS = [".tabs.json"] # If you want them all in one family, use this extension @@ -737,7 +742,7 @@ def _parser(self, filename): @TableSetManagerRegistry.register() -class SimpleJsonInsertsItemManager(SingleTableMixin, _JsonInsertsDataItemManager): +class SimpleJsonInsertsItemManager(SingleTableMixin, InsertsItemManager): ALLOWED_FILE_EXTENSIONS = [".json"] @@ -759,7 +764,7 @@ def _parser(self, filename): @TableSetManagerRegistry.register() -class SimpleJsonLinesInsertsItemManager(SingleTableMixin, _JsonInsertsDataItemManager): +class SimpleJsonLinesInsertsItemManager(SingleTableMixin, InsertsItemManager): ALLOWED_FILE_EXTENSIONS = [".jsonl"] @@ -773,7 +778,7 @@ def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: @TableSetManagerRegistry.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") -class InsertsItemManager(_JsonInsertsDataItemManager): +class InsertsDirectoryItemManager(InsertsItemManager): ALLOWED_FILE_EXTENSIONS = [] From 7de093a7ef5146ee2026b58fe8a2f4b123c71352 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 1 Sep 2023 12:54:43 -0400 Subject: [PATCH 045/101] Rename ._parser() to ._parse_json_data(). Factor type checks out of ._load_json_data() into ._check_json_data(). --- dcicutils/sheet_utils.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 5611cf2be..70ef94779 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -724,12 +724,15 @@ class TabbedJsonInsertsItemManager(InsertsItemManager): def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: data = self._parser(filename) + self._check_json_data(filename, data) + return data + + def _check_json_data(self, filename: str, data): if (not isinstance(data, dict) or not all(isinstance(tab_name, str) for tab_name in data.keys()) or not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) for content in data.values())): raise ValueError(f"Data in {filename} is not of type Dict[str, List[dict]].") - return data @TableSetManagerRegistry.register() @@ -748,10 +751,13 @@ class SimpleJsonInsertsItemManager(SingleTableMixin, InsertsItemManager): def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: data = {self._tab_name: self._parser(filename)} + self._check_json_data(filename, data) + return data + + def _check_json_data(self, filename: str, data): if not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) for content in data.values()): raise ValueError(f"Data in {filename} is not of type List[dict].") - return data @TableSetManagerRegistry.register() @@ -759,7 +765,7 @@ class SimpleYamlInsertsItemManager(SimpleJsonInsertsItemManager): ALLOWED_FILE_EXTENSIONS = [".yaml"] - def _parser(self, filename): + def _parse_json_data(self, filename): return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) @@ -768,13 +774,20 @@ class SimpleJsonLinesInsertsItemManager(SingleTableMixin, 
InsertsItemManager): ALLOWED_FILE_EXTENSIONS = [".jsonl"] - def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: + def _parse_json_data(self, filename: str) -> Dict[str, AnyJsonData]: content = [line for line in JsonLinesReader(open_unicode_text_input_file_respecting_byte_order_mark(filename))] data = {self._tab_name: content} + return data + + def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: + data = self._parse_json_data(filename) + self._check_json_data(filename, data) + return data + + def _check_json_data(self, filename: str, data): if not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) for content in data.values()): raise ValueError(f"Data in {filename} is not of type List[dict].") - return data @TableSetManagerRegistry.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") @@ -782,7 +795,7 @@ class InsertsDirectoryItemManager(InsertsItemManager): ALLOWED_FILE_EXTENSIONS = [] - def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: + def _parse_json_data(self, filename: str) -> Dict[str, AnyJsonData]: if not os.path.isdir(filename): raise LoadArgumentsError(f"{filename} is not the name of an inserts directory.") tab_files = glob.glob(os.path.join(filename, "*.json")) From b01e34be64f6fa37de2d897a9bc27300d3fe7f9c Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 1 Sep 2023 13:04:51 -0400 Subject: [PATCH 046/101] Rename _parse_json_data, _load_json_data, and _check_json_data, respectively, to _parse_inserts_data, _load_inserts_data, and _check_inserts_data. --- dcicutils/sheet_utils.py | 44 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 70ef94779..8e2e60b64 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -693,11 +693,16 @@ class InsertsItemManager(ItemManagerMixin, BasicTableSetManager): ALLOWED_FILE_EXTENSIONS = [] - def _parser(self, filename): + def _parse_inserts_data(self, filename): # by default, we assume inserts files are JSON data return json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) - def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: - raise NotImplementedError(f"._load_json_data() is not implemented for {cls.__name__}.") # noQA + def _load_inserts_data(self, filename: str) -> Dict[str, AnyJsonData]: + data = self._parse_inserts_data(filename) + self._check_inserts_data(filename, data) + return data + + def _check_inserts_data(self, filename, data): # by default, we do no specific error checking + pass @property def tab_names(self) -> List[str]: @@ -707,7 +712,7 @@ def _get_reader_agent(self) -> Any: return self def load_content(self) -> Dict[str, AnyJsonData]: - data = self._load_json_data(self.filename) + data = self._load_inserts_data(self.filename) for tab_name, tab_content in data.items(): self.content_by_tab_name[tab_name] = tab_content if not tab_content: @@ -722,12 +727,7 @@ class TabbedJsonInsertsItemManager(InsertsItemManager): ALLOWED_FILE_EXTENSIONS = [".tabs.json"] # If you want them all in one family, use this extension - def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: - data = self._parser(filename) - self._check_json_data(filename, data) - return data - - def _check_json_data(self, filename: str, data): + def _check_inserts_data(self, filename: str, data): if (not isinstance(data, dict) or not all(isinstance(tab_name, str) for tab_name in data.keys()) or not 
all(isinstance(content, list) and all(isinstance(item, dict) for item in content) @@ -740,7 +740,7 @@ class TabbedYamlInsertsItemManager(TabbedJsonInsertsItemManager): ALLOWED_FILE_EXTENSIONS = [".tabs.yaml"] - def _parser(self, filename): + def _parse_inserts_data(self, filename): return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) @@ -749,12 +749,12 @@ class SimpleJsonInsertsItemManager(SingleTableMixin, InsertsItemManager): ALLOWED_FILE_EXTENSIONS = [".json"] - def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: - data = {self._tab_name: self._parser(filename)} - self._check_json_data(filename, data) + def _load_inserts_data(self, filename: str) -> Dict[str, AnyJsonData]: + data = {self._tab_name: self._parse_inserts_data(filename)} + self._check_inserts_data(filename, data) return data - def _check_json_data(self, filename: str, data): + def _check_inserts_data(self, filename: str, data): if not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) for content in data.values()): raise ValueError(f"Data in {filename} is not of type List[dict].") @@ -765,7 +765,7 @@ class SimpleYamlInsertsItemManager(SimpleJsonInsertsItemManager): ALLOWED_FILE_EXTENSIONS = [".yaml"] - def _parse_json_data(self, filename): + def _parse_inserts_data(self, filename): return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) @@ -774,17 +774,17 @@ class SimpleJsonLinesInsertsItemManager(SingleTableMixin, InsertsItemManager): ALLOWED_FILE_EXTENSIONS = [".jsonl"] - def _parse_json_data(self, filename: str) -> Dict[str, AnyJsonData]: + def _parse_inserts_data(self, filename: str) -> Dict[str, AnyJsonData]: content = [line for line in JsonLinesReader(open_unicode_text_input_file_respecting_byte_order_mark(filename))] data = {self._tab_name: content} return data - def _load_json_data(self, filename: str) -> Dict[str, AnyJsonData]: - data = self._parse_json_data(filename) - self._check_json_data(filename, data) + def _load_inserts_data(self, filename: str) -> Dict[str, AnyJsonData]: + data = self._parse_inserts_data(filename) + self._check_inserts_data(filename, data) return data - def _check_json_data(self, filename: str, data): + def _check_inserts_data(self, filename: str, data): if not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) for content in data.values()): raise ValueError(f"Data in {filename} is not of type List[dict].") @@ -795,7 +795,7 @@ class InsertsDirectoryItemManager(InsertsItemManager): ALLOWED_FILE_EXTENSIONS = [] - def _parse_json_data(self, filename: str) -> Dict[str, AnyJsonData]: + def _parse_inserts_data(self, filename: str) -> Dict[str, AnyJsonData]: if not os.path.isdir(filename): raise LoadArgumentsError(f"{filename} is not the name of an inserts directory.") tab_files = glob.glob(os.path.join(filename, "*.json")) From 0ae48ee6039c80b6aef629c549c451e62351588c Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 1 Sep 2023 13:30:33 -0400 Subject: [PATCH 047/101] WIP. Testing good. 
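The checks being reorganized across these patches all enforce the same shape, Dict[str, List[dict]]; a standalone predicate capturing that rule (illustrative only, not the module's code):

def looks_like_tabbed_inserts(data):
    return (isinstance(data, dict)
            and all(isinstance(tab_name, str) for tab_name in data)
            and all(isinstance(rows, list) and all(isinstance(item, dict) for item in rows)
                    for rows in data.values()))

print(looks_like_tabbed_inserts({"Sheet1": [{"x": 1}]}))   # True
print(looks_like_tabbed_inserts({"Sheet1": {"x": 1}}))     # False: each tab must hold a list of dicts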
--- dcicutils/sheet_utils.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 8e2e60b64..b210b7f2d 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -687,9 +687,7 @@ def tab_names(self) -> List[str]: return [self._tab_name] -class InsertsItemManager(ItemManagerMixin, BasicTableSetManager): - - AUTOLOAD_SCHEMAS_DEFAULT = False +class InsertsManager(BasicTableSetManager): # ItemManagerMixin isn't really appropriate here ALLOWED_FILE_EXTENSIONS = [] @@ -701,7 +699,7 @@ def _load_inserts_data(self, filename: str) -> Dict[str, AnyJsonData]: self._check_inserts_data(filename, data) return data - def _check_inserts_data(self, filename, data): # by default, we do no specific error checking + def _check_inserts_data(self, filename: str, data): # by default, we do no specific error checking pass @property @@ -722,11 +720,27 @@ def load_content(self) -> Dict[str, AnyJsonData]: return self.content_by_tab_name -@TableSetManagerRegistry.register() -class TabbedJsonInsertsItemManager(InsertsItemManager): +class InsertsItemManager(AbstractItemManager, InsertsManager): # ItemManagerMixin isn't really appropriate here + + AUTOLOAD_SCHEMAS_DEFAULT = False # Has no effect, but someone might inspect the value. + + def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, + schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs): + ignored(portal_env) # Would only be used if autoload_schemas was requested, and we don't allow that. + if schemas not in [None, {}]: + raise ValueError(f"{self.__class__.__name__} does not allow schemas={schemas!r}.") + if autoload_schemas not in [None, False]: + raise ValueError(f"{self.__class__.__name__} does not allow autoload_schemas={autoload_schemas!r}.") + super().__init__(filename, **kwargs) + + +class TabbedJsonInsertsManager(InsertsItemManager): ALLOWED_FILE_EXTENSIONS = [".tabs.json"] # If you want them all in one family, use this extension + +@TableSetManagerRegistry.register() +class TabbedJsonInsertsItemManager(TabbedJsonInsertsManager): def _check_inserts_data(self, filename: str, data): if (not isinstance(data, dict) or not all(isinstance(tab_name, str) for tab_name in data.keys()) @@ -745,7 +759,7 @@ def _parse_inserts_data(self, filename): @TableSetManagerRegistry.register() -class SimpleJsonInsertsItemManager(SingleTableMixin, InsertsItemManager): +class SimpleJsonInsertsItemManager(SingleTableMixin, InsertsManager): ALLOWED_FILE_EXTENSIONS = [".json"] @@ -770,7 +784,7 @@ def _parse_inserts_data(self, filename): @TableSetManagerRegistry.register() -class SimpleJsonLinesInsertsItemManager(SingleTableMixin, InsertsItemManager): +class SimpleJsonLinesInsertsItemManager(SingleTableMixin, InsertsManager): ALLOWED_FILE_EXTENSIONS = [".jsonl"] @@ -791,7 +805,7 @@ def _check_inserts_data(self, filename: str, data): @TableSetManagerRegistry.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") -class InsertsDirectoryItemManager(InsertsItemManager): +class InsertsDirectoryItemManager(InsertsManager): ALLOWED_FILE_EXTENSIONS = [] @@ -916,7 +930,7 @@ class ItemManager(AbstractTableSetManager): """ @classmethod - def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager: + def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractItemManager: reader_agent_class = TableSetManagerRegistry.manager_for_filename(filename) reader_agent = 
reader_agent_class(filename=filename, **kwargs) return reader_agent From 1e2c5a92d5ef37e11e35b1168af11804ab132efd Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 1 Sep 2023 23:24:39 -0400 Subject: [PATCH 048/101] WIP. Tests passing. --- dcicutils/sheet_utils.py | 296 ++++++++++++++++++++++++--------------- test/test_sheet_utils.py | 22 +-- 2 files changed, 194 insertions(+), 124 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index b210b7f2d..01cf319c0 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -29,6 +29,8 @@ SheetCellValue = Union[int, float, str] SheetRow = List[SheetCellValue] CsvReader = type(csv.reader(TemporaryFile())) +SheetData = List[dict] +TabbedSheetData = Dict[str, SheetData] class LoadFailure(Exception): @@ -86,6 +88,30 @@ def prefer_number(value: SheetCellValue): return value +def expand_string_escape_sequences(text: str) -> str: + s = io.StringIO() + escaping = False + for ch in text: + if escaping: + if ch == 'r': + s.write('\r') + elif ch == 't': + s.write('\t') + elif ch == 'n': + s.write('\n') + elif ch == '\\': + s.write('\\') + else: + # Rather than err, just leave other sequences as-is. + s.write(f"\\{ch}") + escaping = False + elif ch == '\\': + escaping = True + else: + s.write(ch) + return s.getvalue() + + def open_unicode_text_input_file_respecting_byte_order_mark(filename): """ Opens a file for text input, respecting a byte-order mark (BOM). @@ -345,12 +371,13 @@ class AbstractTableSetManager: ALLOWED_FILE_EXTENSIONS: List[str] = [] - def __init__(self, **kwargs): + def __init__(self, filename: str, **kwargs): + self.filename: str = filename unwanted_kwargs(context=self.__class__.__name__, kwargs=kwargs) # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.) @classmethod - def load(cls, filename: str, **kwargs) -> Dict[str, List[AnyJsonData]]: + def load(cls, filename: str, **kwargs) -> TabbedSheetData: """ Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data. For more information, see documentation of AbstractTableSetManager. 
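Assuming this revision of the module is importable, the new module-level escape expansion behaves as the small example below shows: the sequences \r, \t, \n, and \\ are translated, and anything else is left as typed.

from dcicutils.sheet_utils import expand_string_escape_sequences

print(repr(expand_string_escape_sequences(r"alpha\tbeta\ngamma")))  # 'alpha\tbeta\ngamma' with a real tab and newline
print(repr(expand_string_escape_sequences(r"50\% done")))           # '50\\% done' (unrecognized escape kept as-is)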
@@ -361,6 +388,9 @@ def load(cls, filename: str, **kwargs) -> Dict[str, List[AnyJsonData]]: def tab_names(self) -> List[str]: raise NotImplementedError(f".tab_names is not implemented for {self.__class__.__name__}..") # noQA + def load_content(self) -> Any: + raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") # noQA + class BasicTableSetManager(AbstractTableSetManager): """ @@ -371,10 +401,9 @@ class BasicTableSetManager(AbstractTableSetManager): """ def __init__(self, filename: str, **kwargs): - super().__init__(**kwargs) - self.filename: str = filename + super().__init__(filename=filename, **kwargs) self.headers_by_tab_name: Dict[str, Headers] = {} - self.content_by_tab_name: Dict[str, List[AnyJsonData]] = {} + self.content_by_tab_name: Dict[str, SheetData] = {} self.reader_agent: Any = self._get_reader_agent() def tab_headers(self, tab_name: str) -> Headers: @@ -398,21 +427,21 @@ def _get_reader_agent(self) -> Any: """This function is responsible for opening the workbook and returning a workbook object.""" raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") # noQA - def load_content(self) -> Any: - raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") # noQA - -class TableSetManager(BasicTableSetManager): +class SemanticTableSetManager(BasicTableSetManager): """ - This is the base class for all things that read tablesets. Those may be: + This is the base class for all workbook-like things, which read tablesets with possible semantic processing. + Those may be: * Excel workbook readers (.xlsx) * Comma-separated file readers (.csv) * Tab-separarated file readers (.tsv in most of the world, but Microsoft stupidly calls this .txt, outright refusing to write a .tsv file, so many people seem to compromise and call this .tsv.txt) - Unimplemented formats that could easily be made to do the same thing: - * JSON files - * JSON lines files - * YAML files + This is NOT a parent class of these kinds of files, which we always take literally as if semantic processing + were already done (in part so they can be used to test the results of other formats): + * Json files + * Yaml files + * Inserts directories + * JsonLines files """ @classmethod @@ -422,7 +451,7 @@ def load(cls, filename: str, **kwargs) -> AnyJsonData: raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only" f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}") - table_set_manager: TableSetManager = cls(filename=filename, **kwargs) + table_set_manager: SemanticTableSetManager = cls(filename=filename, **kwargs) return table_set_manager.load_content() def __init__(self, filename: str, **kwargs): @@ -464,23 +493,56 @@ class AbstractItemManager(AbstractTableSetManager): class TableSetManagerRegistry: - ALL_TABLE_SET_MANAGERS: Dict[str, Type[AbstractItemManager]] = {} + ALL_TABLE_SET_MANAGERS: Dict[str, Type[AbstractTableSetManager]] = {} + ALL_TABLE_SET_ITEM_MANAGERS: Dict[str, Type[AbstractItemManager]] = {} ALL_TABLE_SET_REGEXP_MAPPINGS = [] + ALL_TABLE_SET_REGEXP_ITEM_MAPPINGS = [] + + @classmethod + def describe_for_debugging(cls): + me = cls.__name__ + for attr in ["ALL_TABLE_SET_MANAGERS", "ALL_TABLE_SET_ITEM_MANAGERS"]: + print(f"{me}.{attr}:") + for suffix, manager_class in getattr(cls, attr).items() or [(None, None)]: + suffix: str + print(f" {('---' if suffix is None else suffix).rjust(50)}" + f" :: {manager_class.__name__ if manager_class else 
'---'}") + for attr in ["ALL_TABLE_SET_REGEXP_MAPPINGS", "ALL_TABLE_SET_REGEXP_ITEM_MAPPINGS"]: + print(f"{me}.{attr}:") + for regexp, manager_class in getattr(cls, attr) or [(None, None)]: + regexp: str + print(f" {('---' if regexp is None else str(regexp)).rjust(50)}" + f" :: {manager_class.__name__ if regexp else '---'}") @classmethod def register(cls, regexp=None): - def _wrapped_register(class_to_register: Type[AbstractItemManager]): + def _wrapped_register(class_to_register: Type[AbstractTableSetManager]): + is_item_class = issubclass(class_to_register, AbstractItemManager) + print(f"The class {class_to_register.__name__} {'IS' if is_item_class else 'is NOT'} an item class.") + manager_table: Dict[str, Type[AbstractTableSetManager]] = ( + cls.ALL_TABLE_SET_ITEM_MANAGERS + if is_item_class + else cls.ALL_TABLE_SET_MANAGERS + ) + regexp_mapping = ( + cls.ALL_TABLE_SET_REGEXP_ITEM_MAPPINGS + if is_item_class + else cls.ALL_TABLE_SET_REGEXP_MAPPINGS + ) + if regexp: - cls.ALL_TABLE_SET_REGEXP_MAPPINGS.append((re.compile(regexp), class_to_register)) + regexp_mapping.append((re.compile(regexp), class_to_register)) for ext in class_to_register.ALLOWED_FILE_EXTENSIONS: - existing = cls.ALL_TABLE_SET_MANAGERS.get(ext) + existing = manager_table.get(ext) if existing: raise Exception(f"Tried to define {class_to_register} to extension {ext}," f" but {existing} already claimed that.") - cls.ALL_TABLE_SET_MANAGERS[ext] = class_to_register + manager_table[ext] = class_to_register return class_to_register return _wrapped_register + register1 = register + @classmethod def manager_for_filename(cls, filename: str) -> Type[AbstractItemManager]: base: str = os.path.basename(filename) @@ -488,7 +550,7 @@ def manager_for_filename(cls, filename: str) -> Type[AbstractItemManager]: if suffix_parts: for i in range(0, len(suffix_parts)): suffix = f".{'.'.join(suffix_parts[i:])}" - found = cls.ALL_TABLE_SET_MANAGERS.get(suffix) + found = cls.ALL_TABLE_SET_ITEM_MANAGERS.get(suffix) if found: return found else: @@ -499,13 +561,14 @@ def manager_for_filename(cls, filename: str) -> Type[AbstractItemManager]: @classmethod def manager_for_special_filename(cls, filename: str) -> Optional[Type[AbstractItemManager]]: - for pattern, manager_class in cls.ALL_TABLE_SET_REGEXP_MAPPINGS: + for pattern, manager_class in cls.ALL_TABLE_SET_REGEXP_ITEM_MAPPINGS: if pattern.match(filename): return manager_class return None -class XlsxManager(TableSetManager): +@TableSetManagerRegistry.register1() +class XlsxManager(SemanticTableSetManager): """ This implements the mechanism to get a series of rows out of the sheets in an XLSX file. """ @@ -559,7 +622,7 @@ class SchemaAutoloadMixin(AbstractTableSetManager): CACHE_SCHEMAS = True # Controls whether we're doing caching at all AUTOLOAD_SCHEMAS_DEFAULT = True - def __init__(self, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, + def __init__(self, filename: str, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, **kwargs): # This setup must be in place before the class initialization is done (via the super call). self.autoload_schemas: bool = self.AUTOLOAD_SCHEMAS_DEFAULT if autoload_schemas is None else autoload_schemas @@ -568,7 +631,7 @@ def __init__(self, autoload_schemas: Optional[bool] = None, portal_env: Optional portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) PRINT(f"The portal_env was not explicitly supplied. 
Schemas will come from portal_env={portal_env!r}.") self.portal_env: Optional[str] = portal_env - super().__init__(**kwargs) + super().__init__(filename=filename, **kwargs) def fetch_relevant_schemas(self, schema_names: List[str]): # The schema_names argument is not normally given, but it is there for easier testing @@ -691,16 +754,22 @@ class InsertsManager(BasicTableSetManager): # ItemManagerMixin isn't really app ALLOWED_FILE_EXTENSIONS = [] - def _parse_inserts_data(self, filename): # by default, we assume inserts files are JSON data - return json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + def _parse_inserts_data(self, filename: str) -> AnyJsonData: + raise NotImplementedError(f"._parse_inserts_dataa(...) is not implemented for {self.__class__.__name__}.") # noQA - def _load_inserts_data(self, filename: str) -> Dict[str, AnyJsonData]: - data = self._parse_inserts_data(filename) - self._check_inserts_data(filename, data) - return data + def _load_inserts_data(self, filename: str) -> TabbedSheetData: + data: AnyJsonData = self._parse_inserts_data(filename) + tabbed_inserts: AnyJsonData = self._wrap_inserts_data(filename, data) + if (not isinstance(tabbed_inserts, dict) + or not all(isinstance(tab_name, str) for tab_name in tabbed_inserts.keys()) + or not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) + for content in tabbed_inserts.values())): + raise ValueError(f"Data in {filename} is not of type TabbedSheetData (Dict[str, List[dict]]).") + tabbed_inserts: TabbedSheetData # we've just checked that + return tabbed_inserts - def _check_inserts_data(self, filename: str, data): # by default, we do no specific error checking - pass + def _wrap_inserts_data(self, filename: str, data: AnyJsonData) -> AnyJsonData: + return data @property def tab_names(self) -> List[str]: @@ -720,96 +789,108 @@ def load_content(self) -> Dict[str, AnyJsonData]: return self.content_by_tab_name -class InsertsItemManager(AbstractItemManager, InsertsManager): # ItemManagerMixin isn't really appropriate here +class SimpleInsertsMixin(SingleTableMixin): - AUTOLOAD_SCHEMAS_DEFAULT = False # Has no effect, but someone might inspect the value. + def _wrap_inserts_data(self, filename: str, data: AnyJsonData) -> TabbedSheetData: + if (not isinstance(data, list) + or not all(isinstance(item, dict) for item in data)): + raise ValueError(f"Data in {filename} is not of type SheetData (List[dict]).") + return {self._tab_name: data} - def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, - schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs): - ignored(portal_env) # Would only be used if autoload_schemas was requested, and we don't allow that. 
- if schemas not in [None, {}]: - raise ValueError(f"{self.__class__.__name__} does not allow schemas={schemas!r}.") - if autoload_schemas not in [None, False]: - raise ValueError(f"{self.__class__.__name__} does not allow autoload_schemas={autoload_schemas!r}.") - super().__init__(filename, **kwargs) +class JsonInsertsMixin: -class TabbedJsonInsertsManager(InsertsItemManager): + def _parse_inserts_data(self, filename: str) -> AnyJsonData: + return json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + + +@TableSetManagerRegistry.register1() +class TabbedJsonInsertsManager(JsonInsertsMixin, InsertsManager): ALLOWED_FILE_EXTENSIONS = [".tabs.json"] # If you want them all in one family, use this extension -@TableSetManagerRegistry.register() -class TabbedJsonInsertsItemManager(TabbedJsonInsertsManager): - def _check_inserts_data(self, filename: str, data): - if (not isinstance(data, dict) - or not all(isinstance(tab_name, str) for tab_name in data.keys()) - or not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) - for content in data.values())): - raise ValueError(f"Data in {filename} is not of type Dict[str, List[dict]].") +@TableSetManagerRegistry.register1() +class SimpleJsonInsertsManager(SimpleInsertsMixin, JsonInsertsMixin, InsertsManager): + ALLOWED_FILE_EXTENSIONS = [".json"] + + +class YamlInsertsMixin: + + def _parse_inserts_data(self, filename) -> AnyJsonData: + return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) -@TableSetManagerRegistry.register() -class TabbedYamlInsertsItemManager(TabbedJsonInsertsItemManager): + +@TableSetManagerRegistry.register1() +class TabbedYamlInsertsManager(YamlInsertsMixin, InsertsManager): ALLOWED_FILE_EXTENSIONS = [".tabs.yaml"] - def _parse_inserts_data(self, filename): + def _parse_inserts_data(self, filename) -> AnyJsonData: return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) -@TableSetManagerRegistry.register() -class SimpleJsonInsertsItemManager(SingleTableMixin, InsertsManager): +@TableSetManagerRegistry.register1() +class SimpleYamlInsertsManager(SimpleInsertsMixin, YamlInsertsMixin, InsertsManager): - ALLOWED_FILE_EXTENSIONS = [".json"] + ALLOWED_FILE_EXTENSIONS = [".yaml"] - def _load_inserts_data(self, filename: str) -> Dict[str, AnyJsonData]: - data = {self._tab_name: self._parse_inserts_data(filename)} - self._check_inserts_data(filename, data) - return data - def _check_inserts_data(self, filename: str, data): - if not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) - for content in data.values()): - raise ValueError(f"Data in {filename} is not of type List[dict].") +class InsertsItemMixin(AbstractItemManager): # ItemManagerMixin isn't really appropriate here + + AUTOLOAD_SCHEMAS_DEFAULT = False # Has no effect, but someone might inspect the value. + + def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, + schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs): + ignored(portal_env) # Would only be used if autoload_schemas was requested, and we don't allow that. 
+ if schemas not in [None, {}]: + raise ValueError(f"{self.__class__.__name__} does not allow schemas={schemas!r}.") + if autoload_schemas not in [None, False]: + raise ValueError(f"{self.__class__.__name__} does not allow autoload_schemas={autoload_schemas!r}.") + super().__init__(filename=filename, **kwargs) @TableSetManagerRegistry.register() -class SimpleYamlInsertsItemManager(SimpleJsonInsertsItemManager): +class TabbedJsonInsertsItemManager(InsertsItemMixin, TabbedJsonInsertsManager): + pass - ALLOWED_FILE_EXTENSIONS = [".yaml"] - def _parse_inserts_data(self, filename): - return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) +@TableSetManagerRegistry.register() +class SimpleJsonInsertsItemManager(InsertsItemMixin, SimpleJsonInsertsManager): + pass + + +@TableSetManagerRegistry.register() +class TabbedYamlInsertsItemManager(InsertsItemMixin, TabbedYamlInsertsManager): + pass @TableSetManagerRegistry.register() -class SimpleJsonLinesInsertsItemManager(SingleTableMixin, InsertsManager): +class SimpleYamlInsertsItemManager(InsertsItemMixin, SimpleYamlInsertsManager): + pass + + +@TableSetManagerRegistry.register1() +class SimpleJsonLinesInsertsManager(SimpleInsertsMixin, InsertsManager): ALLOWED_FILE_EXTENSIONS = [".jsonl"] - def _parse_inserts_data(self, filename: str) -> Dict[str, AnyJsonData]: - content = [line for line in JsonLinesReader(open_unicode_text_input_file_respecting_byte_order_mark(filename))] - data = {self._tab_name: content} - return data + def _parse_inserts_data(self, filename: str) -> AnyJsonData: + return [line for line in JsonLinesReader(open_unicode_text_input_file_respecting_byte_order_mark(filename))] - def _load_inserts_data(self, filename: str) -> Dict[str, AnyJsonData]: - data = self._parse_inserts_data(filename) - self._check_inserts_data(filename, data) - return data - def _check_inserts_data(self, filename: str, data): - if not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) - for content in data.values()): - raise ValueError(f"Data in {filename} is not of type List[dict].") +@TableSetManagerRegistry.register() +class SimpleJsonLinesInsertsItemManager(InsertsItemMixin, SimpleJsonLinesInsertsManager): + pass -@TableSetManagerRegistry.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") -class InsertsDirectoryItemManager(InsertsManager): +@TableSetManagerRegistry.register1() +class InsertsDirectoryManager(InsertsManager): ALLOWED_FILE_EXTENSIONS = [] - def _parse_inserts_data(self, filename: str) -> Dict[str, AnyJsonData]: + def _parse_inserts_data(self, filename: str) -> AnyJsonData: if not os.path.isdir(filename): raise LoadArgumentsError(f"{filename} is not the name of an inserts directory.") tab_files = glob.glob(os.path.join(filename, "*.json")) @@ -825,7 +906,13 @@ def _parse_inserts_data(self, filename: str) -> Dict[str, AnyJsonData]: return data -class CsvManager(SingleTableMixin, TableSetManager): +@TableSetManagerRegistry.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") +class InsertsDirectoryItemManager(InsertsItemMixin, InsertsDirectoryManager): + pass + + +@TableSetManagerRegistry.register1() +class CsvManager(SingleTableMixin, SemanticTableSetManager): """ This implements the mechanism to get a series of rows out of the sheet in a csv file, returning a result that still looks like there could have been multiple tabs. 
@@ -870,6 +957,7 @@ class CsvItemManager(ItemManagerMixin, CsvManager): pass +@TableSetManagerRegistry.register1() class TsvManager(CsvManager): """ TSV files are just CSV files with tabs instead of commas as separators. @@ -887,32 +975,8 @@ def _get_reader_agent_for_filename(cls, filename) -> CsvReader: def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: if self.escaping and isinstance(value, str) and '\\' in value: - value = self.expand_escape_sequences(value) - return super().parse_cell_value(value) - - @classmethod - def expand_escape_sequences(cls, text: str) -> str: - s = io.StringIO() - escaping = False - for ch in text: - if escaping: - if ch == 'r': - s.write('\r') - elif ch == 't': - s.write('\t') - elif ch == 'n': - s.write('\n') - elif ch == '\\': - s.write('\\') - else: - # Rather than err, just leave other sequences as-is. - s.write(f"\\{ch}") - escaping = False - elif ch == '\\': - escaping = True - else: - s.write(ch) - return s.getvalue() + value = expand_string_escape_sequences(value) + return super().parse_cell_value(value) # noQA - PyCharm wrongly thinks this method call is improper @TableSetManagerRegistry.register() @@ -941,7 +1005,7 @@ def load(cls, filename: str, escaping: Optional[bool] = None, schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None, - **kwargs) -> Dict[str, List[AnyJsonData]]: + **kwargs) -> TabbedSheetData: """ Given a filename and various options """ @@ -951,3 +1015,7 @@ def load(cls, filename: str, load_items = ItemManager.load + + +# Uncommenting this will cause this library, upon loading, to print out debugging data about what got defined. +TableSetManagerRegistry.describe_for_debugging() diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 78c9816eb..26d402cb1 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -20,7 +20,7 @@ # Error handling LoadFailure, LoadArgumentsError, LoadTableError, # Utilities - prefer_number, unwanted_kwargs, + prefer_number, unwanted_kwargs, expand_string_escape_sequences, ) from typing import Dict, Optional from unittest import mock @@ -72,6 +72,14 @@ def test_prefer_number(): assert prefer_number('123e-1') == 12.3 +def test_expand_string_escape_sequences(): + + assert expand_string_escape_sequences("foo") == "foo" + assert expand_string_escape_sequences("foo\\tbar") == "foo\tbar" + assert expand_string_escape_sequences("\\r\\t\\n\\\\") == "\r\t\n\\" + assert expand_string_escape_sequences("foo\\fbar") == "foo\\fbar" + + def test_unwanted_kwargs_without_error(): unwanted_kwargs(context="Function foo", kwargs={}) unwanted_kwargs(context="Function foo", kwargs={}, context_plural=True, detailed=True) @@ -468,14 +476,6 @@ def test_tsv_manager_load_content(): assert wt.load_content() == SAMPLE_TSV_FILE_RAW_CONTENT -def test_tsv_manager_expand_escape_sequences(): - - assert TsvManager.expand_escape_sequences("foo") == "foo" - assert TsvManager.expand_escape_sequences("foo\\tbar") == "foo\tbar" - assert TsvManager.expand_escape_sequences("\\r\\t\\n\\\\") == "\r\t\n\\" - assert TsvManager.expand_escape_sequences("foo\\fbar") == "foo\\fbar" - - def test_tsv_manager_load(): assert TsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_RAW_CONTENT @@ -729,7 +729,9 @@ def test_load_items_with_schema_and_instaguids(instaguids_enabled): class SchemaAutoloaderForTesting(SchemaAutoloadMixin): - pass + + def __init__(self, **kwargs): + super().__init__(filename='ignored.file.name', **kwargs) @contextlib.contextmanager From b8a4c3965089cd5a937a7f06503b81ebc13b6a00 
Mon Sep 17 00:00:00 2001
From: Kent Pitman
Date: Sat, 2 Sep 2023 00:17:11 -0400
Subject: [PATCH 049/101] Rearrange the way escaping= works so both csv and tsv files can use that argument.

---
 dcicutils/sheet_utils.py            | 32 ++++++++------
 test/data_files/escaping-false.json | 67 +++++++++++++++++++++++++++++
 test/data_files/escaping-true.json  | 67 +++++++++++++++++++++++++++++
 test/data_files/escaping.csv        | 11 +++++
 test/test_sheet_utils.py            | 11 +++++
 5 files changed, 176 insertions(+), 12 deletions(-)
 create mode 100644 test/data_files/escaping-false.json
 create mode 100644 test/data_files/escaping-true.json
 create mode 100644 test/data_files/escaping.csv

diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
index 01cf319c0..9562e92af 100644
--- a/dcicutils/sheet_utils.py
+++ b/dcicutils/sheet_utils.py
@@ -430,12 +430,15 @@ def _get_reader_agent(self) -> Any:
 
 class SemanticTableSetManager(BasicTableSetManager):
     """
-    This is the base class for all workbook-like things, which read tablesets with possible semantic processing.
+    This is the base class for all workbook-like data sources, i.e., that may need to apply semantic processing.
     Those may be:
     * Excel workbook readers (.xlsx)
     * Comma-separated file readers (.csv)
    * Tab-separated file readers (.tsv in most of the world, but Microsoft stupidly calls this .txt,
      outright refusing to write a .tsv file, so many people seem to compromise and call this .tsv.txt)
+    There are two levels to each of these: a class that is not semantically interpreted,
+    and a class that is semantically interpreted as an "item".
+
     This is NOT a parent class of these kinds of files, which we always take literally as if semantic processing
     were already done (in part so they can be used to test the results of other formats):
     * Json files
@@ -920,6 +923,10 @@ class CsvManager(SingleTableMixin, SemanticTableSetManager):
 
     ALLOWED_FILE_EXTENSIONS = ['.csv']
 
+    def __init__(self, filename: str, escaping: Optional[bool] = None, **kwargs):
+        super().__init__(filename=filename, **kwargs)
+        self.escaping: bool = escaping or False
+
     def _get_reader_agent(self) -> CsvReader:
         return self._get_reader_agent_for_filename(self.filename)
 
@@ -943,10 +950,20 @@ def _create_tab_processor_state(self, tab_name: str) -> Headers:
             self.headers_by_tab_name[tab_name] = headers = self.reader_agent.__next__()
         return headers
 
+    def _escape_cell_text(self, cell_text):
+        if '\\' in cell_text:
+            return expand_string_escape_sequences(cell_text)
+        else:
+            return cell_text
+
     def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData:
         ignored(tab_name)
-        return {headers[i]: self.parse_cell_value(row_datum)
-                for i, row_datum in enumerate(row_data)}
+        if self.escaping:
+            return {headers[i]: self.parse_cell_value(self._escape_cell_text(cell_text))
+                    for i, cell_text in enumerate(row_data)}
+        else:
+            return {headers[i]: self.parse_cell_value(cell_text)
+                    for i, cell_text in enumerate(row_data)}
 
 
 @TableSetManagerRegistry.register()
@@ -965,19 +982,10 @@ class TsvManager(CsvManager):
     """
     TSV files are just CSV files with tabs instead of commas as separators.
     """
     ALLOWED_FILE_EXTENSIONS = ['.tsv', '.tsv.txt']
 
-    def __init__(self, filename: str, escaping: Optional[bool] = None, **kwargs):
-        super().__init__(filename=filename, **kwargs)
-        self.escaping: bool = escaping or False
-
     @classmethod
     def _get_reader_agent_for_filename(cls, filename) -> CsvReader:
         return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename), delimiter='\t')
 
-    def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData:
-        if self.escaping and
isinstance(value, str) and '\\' in value: - value = expand_string_escape_sequences(value) - return super().parse_cell_value(value) # noQA - PyCharm wrongly thinks this method call is improper - @TableSetManagerRegistry.register() class TsvItemManager(ItemManagerMixin, TsvManager): diff --git a/test/data_files/escaping-false.json b/test/data_files/escaping-false.json new file mode 100644 index 000000000..84ab06993 --- /dev/null +++ b/test/data_files/escaping-false.json @@ -0,0 +1,67 @@ +{ + "escaping": [ + { + "name": "backslash", + "unquoted": "\\\\", + "doublequoted": "\\\\", + "singlequoted": "'\\\\'", + "overflow": null + }, + { + "name": "formfeed", + "unquoted": "\\f", + "doublequoted": "\\f", + "singlequoted": "'\\f'", + "overflow": null + }, + { + "name": "newline", + "unquoted": "\\n", + "doublequoted": "\\n", + "singlequoted": "'\\n'", + "overflow": null + }, + { + "name": "return", + "unquoted": "\\r", + "doublequoted": "\\r", + "singlequoted": "'\\r'", + "overflow": null + }, + { + "name": "tab", + "unquoted": "\\t", + "doublequoted": "\\t", + "singlequoted": "'\\t'", + "overflow": null + }, + { + "name": "misc", + "unquoted": "\\m", + "doublequoted": "\\m", + "singlequoted": "'\\m'", + "overflow": null + }, + { + "name": "quote1", + "unquoted": "N/A", + "doublequoted": "x,,z", + "singlequoted": "N/A", + "overflow": null + }, + { + "name": "quotelong", + "unquoted": "N/A", + "doublequoted": "x,,z,N/A\nquotlongcontinued,", + "singlequoted": "N/A", + "overflow": null + }, + { + "name": "comma", + "unquoted": "N/A", + "doublequoted": ",", + "singlequoted": "'", + "overflow": "'" + } + ] +} diff --git a/test/data_files/escaping-true.json b/test/data_files/escaping-true.json new file mode 100644 index 000000000..5d6c837a6 --- /dev/null +++ b/test/data_files/escaping-true.json @@ -0,0 +1,67 @@ +{ + "escaping": [ + { + "name": "backslash", + "unquoted": "\\", + "doublequoted": "\\", + "singlequoted": "'\\'", + "overflow": null + }, + { + "name": "formfeed", + "unquoted": "\\f", + "doublequoted": "\\f", + "singlequoted": "'\\f'", + "overflow": null + }, + { + "name": "newline", + "unquoted": "\n", + "doublequoted": "\n", + "singlequoted": "'\n'", + "overflow": null + }, + { + "name": "return", + "unquoted": "\r", + "doublequoted": "\r", + "singlequoted": "'\r'", + "overflow": null + }, + { + "name": "tab", + "unquoted": "\t", + "doublequoted": "\t", + "singlequoted": "'\t'", + "overflow": null + }, + { + "name": "misc", + "unquoted": "\\m", + "doublequoted": "\\m", + "singlequoted": "'\\m'", + "overflow": null + }, + { + "name": "quote1", + "unquoted": "N/A", + "doublequoted": "x,,z", + "singlequoted": "N/A", + "overflow": null + }, + { + "name": "quotelong", + "unquoted": "N/A", + "doublequoted": "x,,z,N/A\nquotlongcontinued,", + "singlequoted": "N/A", + "overflow": null + }, + { + "name": "comma", + "unquoted": "N/A", + "doublequoted": ",", + "singlequoted": "'", + "overflow": "'" + } + ] +} diff --git a/test/data_files/escaping.csv b/test/data_files/escaping.csv new file mode 100644 index 000000000..ec04defbd --- /dev/null +++ b/test/data_files/escaping.csv @@ -0,0 +1,11 @@ +name,unquoted,doublequoted,singlequoted,overflow +backslash,\\,"\\",'\\' +formfeed,\f,"\f",'\f' +newline,\n,"\n",'\n' +return,\r,"\r",'\r' +tab,\t,"\t",'\t' +misc,\m,"\m",'\m' +quote1,N/A,"x,,z",N/A +quotelong,N/A,"x,,z,N/A +quotlongcontinued,",N/A +comma,N/A,",",',' diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 26d402cb1..e5bd77939 100644 --- a/test/test_sheet_utils.py +++ 
b/test/test_sheet_utils.py @@ -470,6 +470,17 @@ def test_csv_item_manager_load_csv(): ' expects only .csv filenames:') +def test_csv_escaping(): + + actual = CsvManager.load("test/data_files/escaping.csv", escaping=False) + expected = json.load(open("test/data_files/escaping-false.json")) + assert actual == expected + + actual = CsvManager.load("test/data_files/escaping.csv", escaping=True) + expected = json.load(open("test/data_files/escaping-true.json")) + assert actual == expected + + def test_tsv_manager_load_content(): wt = TsvManager(SAMPLE_TSV_FILE) From a2fe079d3e4f4763f96aba022f7015129e04c1be Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Sat, 2 Sep 2023 04:52:28 -0400 Subject: [PATCH 050/101] Separate registration of regular table set managers from registration of item managers. --- dcicutils/sheet_utils.py | 142 ++++++++++++++++++--------------------- test/test_sheet_utils.py | 12 +++- 2 files changed, 76 insertions(+), 78 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 9562e92af..41d5f0ff7 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -19,7 +19,7 @@ from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook from tempfile import TemporaryFile -from typing import Any, Dict, Iterable, List, Optional, Type, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union Header = str @@ -31,7 +31,7 @@ CsvReader = type(csv.reader(TemporaryFile())) SheetData = List[dict] TabbedSheetData = Dict[str, SheetData] - +Regexp = type(re.compile("sample")) class LoadFailure(Exception): """ @@ -496,81 +496,52 @@ class AbstractItemManager(AbstractTableSetManager): class TableSetManagerRegistry: - ALL_TABLE_SET_MANAGERS: Dict[str, Type[AbstractTableSetManager]] = {} - ALL_TABLE_SET_ITEM_MANAGERS: Dict[str, Type[AbstractItemManager]] = {} - ALL_TABLE_SET_REGEXP_MAPPINGS = [] - ALL_TABLE_SET_REGEXP_ITEM_MAPPINGS = [] - - @classmethod - def describe_for_debugging(cls): - me = cls.__name__ - for attr in ["ALL_TABLE_SET_MANAGERS", "ALL_TABLE_SET_ITEM_MANAGERS"]: - print(f"{me}.{attr}:") - for suffix, manager_class in getattr(cls, attr).items() or [(None, None)]: - suffix: str - print(f" {('---' if suffix is None else suffix).rjust(50)}" - f" :: {manager_class.__name__ if manager_class else '---'}") - for attr in ["ALL_TABLE_SET_REGEXP_MAPPINGS", "ALL_TABLE_SET_REGEXP_ITEM_MAPPINGS"]: - print(f"{me}.{attr}:") - for regexp, manager_class in getattr(cls, attr) or [(None, None)]: - regexp: str - print(f" {('---' if regexp is None else str(regexp)).rjust(50)}" - f" :: {manager_class.__name__ if regexp else '---'}") + def __init__(self): + self.manager_table: Dict[str, Type[AbstractTableSetManager]] = {} + self.regexp_mappings: List[Tuple[Regexp, Type[AbstractTableSetManager]]] = [] - @classmethod - def register(cls, regexp=None): + def register(self, regexp: Optional[str] = None): def _wrapped_register(class_to_register: Type[AbstractTableSetManager]): - is_item_class = issubclass(class_to_register, AbstractItemManager) - print(f"The class {class_to_register.__name__} {'IS' if is_item_class else 'is NOT'} an item class.") - manager_table: Dict[str, Type[AbstractTableSetManager]] = ( - cls.ALL_TABLE_SET_ITEM_MANAGERS - if is_item_class - else cls.ALL_TABLE_SET_MANAGERS - ) - regexp_mapping = ( - cls.ALL_TABLE_SET_REGEXP_ITEM_MAPPINGS - if is_item_class - else cls.ALL_TABLE_SET_REGEXP_MAPPINGS - ) - if regexp: - regexp_mapping.append((re.compile(regexp), class_to_register)) + 
self.regexp_mappings.append((re.compile(regexp), class_to_register)) for ext in class_to_register.ALLOWED_FILE_EXTENSIONS: - existing = manager_table.get(ext) + existing = self.manager_table.get(ext) if existing: raise Exception(f"Tried to define {class_to_register} to extension {ext}," f" but {existing} already claimed that.") - manager_table[ext] = class_to_register + self.manager_table[ext] = class_to_register return class_to_register return _wrapped_register register1 = register - @classmethod - def manager_for_filename(cls, filename: str) -> Type[AbstractItemManager]: + def manager_for_filename(self, filename: str) -> Type[AbstractTableSetManager]: base: str = os.path.basename(filename) suffix_parts = base.split('.')[1:] if suffix_parts: for i in range(0, len(suffix_parts)): suffix = f".{'.'.join(suffix_parts[i:])}" - found = cls.ALL_TABLE_SET_ITEM_MANAGERS.get(suffix) + found: Optional[Type[AbstractTableSetManager]] = self.manager_table.get(suffix) if found: return found else: - special_case: Optional[Type[AbstractItemManager]] = cls.manager_for_special_filename(filename) + special_case: Optional[Type[AbstractItemManager]] = self.manager_for_special_filename(filename) if special_case: return special_case raise LoadArgumentsError(f"Unknown file type: {filename}") - @classmethod - def manager_for_special_filename(cls, filename: str) -> Optional[Type[AbstractItemManager]]: - for pattern, manager_class in cls.ALL_TABLE_SET_REGEXP_ITEM_MAPPINGS: + def manager_for_special_filename(self, filename: str) -> Optional[Type[AbstractTableSetManager]]: + for pattern, manager_class in self.regexp_mappings: if pattern.match(filename): return manager_class return None -@TableSetManagerRegistry.register1() +TABLE_SET_MANAGER_REGISTRY = TableSetManagerRegistry() +ITEM_MANAGER_REGISTRY = TableSetManagerRegistry() + + +@TABLE_SET_MANAGER_REGISTRY.register() class XlsxManager(SemanticTableSetManager): """ This implements the mechanism to get a series of rows out of the sheets in an XLSX file. @@ -734,7 +705,7 @@ def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: return ItemTools.parse_item_value(value, context=self._instaguid_context_table) -@TableSetManagerRegistry.register() +@ITEM_MANAGER_REGISTRY.register() class XlsxItemManager(ItemManagerMixin, XlsxManager): """ This layers item-style row processing functionality on an XLSX file. 
@@ -807,13 +778,13 @@ def _parse_inserts_data(self, filename: str) -> AnyJsonData: return json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) -@TableSetManagerRegistry.register1() +@TABLE_SET_MANAGER_REGISTRY.register() class TabbedJsonInsertsManager(JsonInsertsMixin, InsertsManager): ALLOWED_FILE_EXTENSIONS = [".tabs.json"] # If you want them all in one family, use this extension -@TableSetManagerRegistry.register1() +@TABLE_SET_MANAGER_REGISTRY.register() class SimpleJsonInsertsManager(SimpleInsertsMixin, JsonInsertsMixin, InsertsManager): ALLOWED_FILE_EXTENSIONS = [".json"] @@ -825,7 +796,7 @@ def _parse_inserts_data(self, filename) -> AnyJsonData: return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) -@TableSetManagerRegistry.register1() +@TABLE_SET_MANAGER_REGISTRY.register() class TabbedYamlInsertsManager(YamlInsertsMixin, InsertsManager): ALLOWED_FILE_EXTENSIONS = [".tabs.yaml"] @@ -834,7 +805,7 @@ def _parse_inserts_data(self, filename) -> AnyJsonData: return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) -@TableSetManagerRegistry.register1() +@TABLE_SET_MANAGER_REGISTRY.register() class SimpleYamlInsertsManager(SimpleInsertsMixin, YamlInsertsMixin, InsertsManager): ALLOWED_FILE_EXTENSIONS = [".yaml"] @@ -854,27 +825,27 @@ def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, po super().__init__(filename=filename, **kwargs) -@TableSetManagerRegistry.register() +@ITEM_MANAGER_REGISTRY.register() class TabbedJsonInsertsItemManager(InsertsItemMixin, TabbedJsonInsertsManager): pass -@TableSetManagerRegistry.register() +@ITEM_MANAGER_REGISTRY.register() class SimpleJsonInsertsItemManager(InsertsItemMixin, SimpleJsonInsertsManager): pass -@TableSetManagerRegistry.register() +@ITEM_MANAGER_REGISTRY.register() class TabbedYamlInsertsItemManager(InsertsItemMixin, TabbedYamlInsertsManager): pass -@TableSetManagerRegistry.register() +@ITEM_MANAGER_REGISTRY.register() class SimpleYamlInsertsItemManager(InsertsItemMixin, SimpleYamlInsertsManager): pass -@TableSetManagerRegistry.register1() +@TABLE_SET_MANAGER_REGISTRY.register() class SimpleJsonLinesInsertsManager(SimpleInsertsMixin, InsertsManager): ALLOWED_FILE_EXTENSIONS = [".jsonl"] @@ -883,12 +854,12 @@ def _parse_inserts_data(self, filename: str) -> AnyJsonData: return [line for line in JsonLinesReader(open_unicode_text_input_file_respecting_byte_order_mark(filename))] -@TableSetManagerRegistry.register() +@ITEM_MANAGER_REGISTRY.register() class SimpleJsonLinesInsertsItemManager(InsertsItemMixin, SimpleJsonLinesInsertsManager): pass -@TableSetManagerRegistry.register1() +@TABLE_SET_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") class InsertsDirectoryManager(InsertsManager): ALLOWED_FILE_EXTENSIONS = [] @@ -909,12 +880,12 @@ def _parse_inserts_data(self, filename: str) -> AnyJsonData: return data -@TableSetManagerRegistry.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") +@ITEM_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") class InsertsDirectoryItemManager(InsertsItemMixin, InsertsDirectoryManager): pass -@TableSetManagerRegistry.register1() +@TABLE_SET_MANAGER_REGISTRY.register() class CsvManager(SingleTableMixin, SemanticTableSetManager): """ This implements the mechanism to get a series of rows out of the sheet in a csv file, @@ -966,7 +937,7 @@ def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> A for i, cell_text in enumerate(row_data)} 
-@TableSetManagerRegistry.register() +@ITEM_MANAGER_REGISTRY.register() class CsvItemManager(ItemManagerMixin, CsvManager): """ This layers item-style row processing functionality on a CSV file. @@ -974,7 +945,7 @@ class CsvItemManager(ItemManagerMixin, CsvManager): pass -@TableSetManagerRegistry.register1() +@TABLE_SET_MANAGER_REGISTRY.register() class TsvManager(CsvManager): """ TSV files are just CSV files with tabs instead of commas as separators. @@ -987,7 +958,7 @@ def _get_reader_agent_for_filename(cls, filename) -> CsvReader: return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') -@TableSetManagerRegistry.register() +@ITEM_MANAGER_REGISTRY.register() class TsvItemManager(ItemManagerMixin, TsvManager): """ This layers item-style row processing functionality on a TSV file. @@ -995,6 +966,30 @@ class TsvItemManager(ItemManagerMixin, TsvManager): pass +class TableSetManager(AbstractTableSetManager): + """ + This class will open a .xlsx or .csv file and load its content in our standard format. + (See more detailed description in AbstractTableManager.) + """ + + @classmethod + def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractTableSetManager: + reader_agent_class = TABLE_SET_MANAGER_REGISTRY.manager_for_filename(filename) + if issubclass(reader_agent_class, AbstractItemManager): + raise ValueError(f"TableSetManager unexpectedly found reader agent class {reader_agent_class}.") + reader_agent = reader_agent_class(filename=filename, **kwargs) + return reader_agent + + @classmethod + def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, + **kwargs) -> TabbedSheetData: + """ + Given a filename and various options + """ + manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, **kwargs) + return manager.load_content() + + class ItemManager(AbstractTableSetManager): """ This class will open a .xlsx or .csv file and load its content in our standard format. @@ -1003,16 +998,16 @@ class ItemManager(AbstractTableSetManager): @classmethod def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractItemManager: - reader_agent_class = TableSetManagerRegistry.manager_for_filename(filename) + reader_agent_class: Type[AbstractTableSetManager] = ITEM_MANAGER_REGISTRY.manager_for_filename(filename) + if not issubclass(reader_agent_class, AbstractItemManager): + raise ValueError(f"ItemManager unexpectedly found reader agent class {reader_agent_class}.") + reader_agent_class: Type[AbstractItemManager] reader_agent = reader_agent_class(filename=filename, **kwargs) return reader_agent @classmethod - def load(cls, filename: str, - tab_name: Optional[str] = None, - escaping: Optional[bool] = None, - schemas: Optional[Dict] = None, - autoload_schemas: Optional[bool] = None, + def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, + schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None, **kwargs) -> TabbedSheetData: """ Given a filename and various options @@ -1022,8 +1017,5 @@ def load(cls, filename: str, return manager.load_content() +load_table_set = TableSetManager.load load_items = ItemManager.load - - -# Uncommenting this will cause this library, upon loading, to print out debugging data about what got defined. 
-TableSetManagerRegistry.describe_for_debugging() diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index e5bd77939..9a672d695 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -10,7 +10,7 @@ from dcicutils.qa_utils import printed_output from dcicutils.sheet_utils import ( # High-level interfaces - ItemManager, load_items, TableSetManagerRegistry, + ItemManager, load_items, TABLE_SET_MANAGER_REGISTRY, ITEM_MANAGER_REGISTRY, # Low-level implementation BasicTableSetManager, SchemaAutoloadMixin, ItemTools, XlsxManager, XlsxItemManager, @@ -308,10 +308,16 @@ def test_item_tools_find_type_hint(): def test_table_set_manager_registry_manager_for_filename(): - assert TableSetManagerRegistry.manager_for_filename("xyz/foo.csv") == CsvItemManager + assert TABLE_SET_MANAGER_REGISTRY.manager_for_filename("xyz/foo.csv") == CsvManager with pytest.raises(Exception) as exc: - TableSetManagerRegistry.manager_for_filename("xyz/foo.something.missing") + TABLE_SET_MANAGER_REGISTRY.manager_for_filename("xyz/foo.something.missing") + assert str(exc.value) == "Unknown file type: xyz/foo.something.missing" + + assert ITEM_MANAGER_REGISTRY.manager_for_filename("xyz/foo.csv") == CsvItemManager + + with pytest.raises(Exception) as exc: + ITEM_MANAGER_REGISTRY.manager_for_filename("xyz/foo.something.missing") assert str(exc.value) == "Unknown file type: xyz/foo.something.missing" From 91ddce0649f0529f2df99e11b8a54b763b6ec679 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Sat, 2 Sep 2023 05:30:11 -0400 Subject: [PATCH 051/101] Stub in checking of required headers. --- dcicutils/sheet_utils.py | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 41d5f0ff7..9f182881c 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -1,4 +1,5 @@ import chardet +import contextlib import copy import csv import glob @@ -13,7 +14,7 @@ from dcicutils.common import AnyJsonData from dcicutils.env_utils import public_env_name, EnvUtils from dcicutils.ff_utils import get_schema -from dcicutils.lang_utils import conjoined_list, disjoined_list, maybe_pluralize +from dcicutils.lang_utils import conjoined_list, disjoined_list, maybe_pluralize, there_are from dcicutils.misc_utils import ignored, PRINT, pad_to, JsonLinesReader from dcicutils.task_utils import pmap from openpyxl.worksheet.worksheet import Worksheet @@ -57,6 +58,21 @@ class LoadTableError(LoadFailure): pass +@contextlib.contextmanager +def deferred_problems(): + problems = [] + + def note_problems(problem): + problems.append(problem) + + yield note_problems + + if problems: + for problem in problems: + PRINT(f"Problem: {problem}") + raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False)) + + def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): if kwargs: unwanted = [f"{argname}={value!r}" if detailed else argname @@ -440,7 +456,7 @@ class SemanticTableSetManager(BasicTableSetManager): and a class that is semantically interpreted as an "item". 
This is NOT a parent class of these kinds of files, which we always take literally as if semantic processing - were already done (in part so they can be used to test the results of other formats): + were already done (in part so that they can be used to test the results of other formats): * Json files * Yaml files * Inserts directories @@ -669,10 +685,19 @@ def __init__(self, parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints) def _compile_type_hints(self, tab_name: str): parsed_headers = self.sheet_parsed_headers(tab_name) schema = self.schemas.get(tab_name) + with deferred_problems() as note_problem: + for required_header in self._schema_required_headers(schema): + if required_header not in parsed_headers: + note_problem("Missing required header") type_hints = [ItemTools.find_type_hint(parsed_header, schema) if schema else None for parsed_header in parsed_headers] self.type_hints_by_tab_name[tab_name] = type_hints + @classmethod + def _schema_required_headers(cls, schema): + ignored(schema) + return [] # TODO: Make this compute a list of required headers (in parsed header form) + def _compile_sheet_headers(self, tab_name: str): headers = self.headers_by_tab_name[tab_name] parsed_headers = ItemTools.parse_sheet_headers(headers) @@ -742,7 +767,9 @@ def _load_inserts_data(self, filename: str) -> TabbedSheetData: tabbed_inserts: TabbedSheetData # we've just checked that return tabbed_inserts - def _wrap_inserts_data(self, filename: str, data: AnyJsonData) -> AnyJsonData: + @classmethod + def _wrap_inserts_data(cls, filename: str, data: AnyJsonData) -> AnyJsonData: + ignored(filename) return data @property @@ -774,7 +801,8 @@ def _wrap_inserts_data(self, filename: str, data: AnyJsonData) -> TabbedSheetDat class JsonInsertsMixin: - def _parse_inserts_data(self, filename: str) -> AnyJsonData: + @classmethod + def _parse_inserts_data(cls, filename: str) -> AnyJsonData: return json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) @@ -921,7 +949,8 @@ def _create_tab_processor_state(self, tab_name: str) -> Headers: self.headers_by_tab_name[tab_name] = headers = self.reader_agent.__next__() return headers - def _escape_cell_text(self, cell_text): + @classmethod + def _escape_cell_text(cls, cell_text): if '\\' in cell_text: return expand_string_escape_sequences(cell_text) else: From 142a20b13e1ed52b1248ab986f0e0099b1f73ddb Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 5 Sep 2023 12:29:25 -0400 Subject: [PATCH 052/101] Bump beta version. 
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3da73b78b..022a2dbf7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
-version = "7.9.0.1b4"  # to become "7.10.0"
+version = "7.9.0.1b5"  # to become "7.10.0"
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["4DN-DCIC Team "]
 license = "MIT"

From e09af07e6128e29aa87ab10f29042634288033a1 Mon Sep 17 00:00:00 2001
From: Kent Pitman
Date: Tue, 5 Sep 2023 16:43:57 -0400
Subject: [PATCH 053/101] PEP8

---
 dcicutils/sheet_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py
index 9f182881c..befabe342 100644
--- a/dcicutils/sheet_utils.py
+++ b/dcicutils/sheet_utils.py
@@ -34,6 +34,7 @@
 TabbedSheetData = Dict[str, SheetData]
 Regexp = type(re.compile("sample"))
 
+
 class LoadFailure(Exception):
     """
     In general, we'd prefer to load up the spreadsheet with clumsy data that can then be validated in detail,

From 7d2ecaab3d64c5f86cf426a4dc30125fc5e42853 Mon Sep 17 00:00:00 2001
From: Kent Pitman
Date: Thu, 7 Sep 2023 05:19:41 -0400
Subject: [PATCH 054/101] Fix a bug in newly proposed ff_utils.get_schemas with vapp.

---
 dcicutils/ff_utils.py | 4 ++--
 test/test_ff_utils.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py
index b0e3baa84..949d715e7 100644
--- a/dcicutils/ff_utils.py
+++ b/dcicutils/ff_utils.py
@@ -991,8 +991,8 @@ def get_schema(name, key=None, ff_env: Optional[str] = None, portal_env: Optiona
     add_on = 'frame=raw'
     if portal_vapp:
         full_url = f"{base_url}?{add_on}"
-        schema = portal_vapp.get(full_url)
-        return schema
+        res = portal_vapp.get(full_url)
+        return get_response_json(res)
     else:
         auth = get_authentication_with_server(auth=key, ff_env=portal_env)
         schema = get_metadata(obj_id=base_url, key=auth, add_on=add_on)
         return schema

diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py
index 940820e57..633772bab 100644
--- a/test/test_ff_utils.py
+++ b/test/test_ff_utils.py
@@ -1359,7 +1359,7 @@ def test_get_schema_with_vapp():
     with mock.patch.object(ff_utils, "get_metadata") as mock_get_metadata:
         with mock.patch.object(ff_utils, "get_authentication_with_server") as mock_get_authentication_with_server:
 
-            sample_vapp.get.return_value = sample_schema_metadata
+            sample_vapp.get.return_value = MockResponse(200, json=sample_schema_metadata)
 
             assert ff_utils.get_schema('User', portal_vapp=sample_vapp) == sample_schema_metadata

From 5e4627363ad9f6b6b681130cd62354fa43b5acf2 Mon Sep 17 00:00:00 2001
From: Kent Pitman
Date: Thu, 7 Sep 2023 05:20:17 -0400
Subject: [PATCH 055/101] Extend VirtualApp to make it easier to test by adding an AbstractVirtualApp.

---
 dcicutils/misc_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index aaa503a8f..afa8850e8 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -192,7 +192,11 @@ class _VirtualAppHelper(webtest.TestApp):
     pass
 
 
-class VirtualApp:
+class AbstractVirtualApp:
+    pass
+
+
+class VirtualApp(AbstractVirtualApp):
     """
     Wrapper class for TestApp, to allow custom control over submitting Encoded requests,
     simulating a number of conditions, including permissions.
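
[Editorial aside, not part of the patch series: the two patches above add a portal_vapp= path to ff_utils.get_schema and an AbstractVirtualApp base class that test doubles can build on. As a minimal, hedged sketch of how those pieces fit together (StubVapp and the canned schema below are hypothetical names made up for illustration; MockResponse is the same test helper the patches' own tests use):

    from dcicutils.ff_utils import get_schema
    from dcicutils.misc_utils import AbstractVirtualApp
    from dcicutils.qa_utils import MockResponse

    class StubVapp(AbstractVirtualApp):
        # Stands in for a real portal virtual app. get_schema will call .get() on it
        # with a URL like 'profiles/User.json?frame=raw' and read JSON from the response.
        def get(self, path_url):
            assert path_url.startswith('profiles/User.json?')
            return MockResponse(200, json={'title': 'User'})

    schema = get_schema('User', portal_vapp=StubVapp())  # no portal_env or auth lookup needed
    assert schema == {'title': 'User'}

When a vapp is supplied, no credentials or portal_env are consulted, which is what the mocked assertions in test_get_schema_with_vapp verify.]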
From 53de60a546f7fe0d9dbfa13093e8816247b96dd2 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 7 Sep 2023 05:24:10 -0400 Subject: [PATCH 056/101] Implement portal_vapp= in sheet_utils. --- dcicutils/sheet_utils.py | 63 ++++++++++++++++++++++++++++------------ test/test_sheet_utils.py | 63 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 105 insertions(+), 21 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index befabe342..a3c6e02d5 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -11,16 +11,16 @@ import uuid import yaml -from dcicutils.common import AnyJsonData -from dcicutils.env_utils import public_env_name, EnvUtils -from dcicutils.ff_utils import get_schema -from dcicutils.lang_utils import conjoined_list, disjoined_list, maybe_pluralize, there_are -from dcicutils.misc_utils import ignored, PRINT, pad_to, JsonLinesReader -from dcicutils.task_utils import pmap from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook from tempfile import TemporaryFile from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union +from .common import AnyJsonData +from .env_utils import public_env_name, EnvUtils +from .ff_utils import get_schema +from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize, there_are +from .misc_utils import ignored, PRINT, pad_to, JsonLinesReader, AbstractVirtualApp +from .task_utils import pmap Header = str @@ -614,22 +614,23 @@ class SchemaAutoloadMixin(AbstractTableSetManager): AUTOLOAD_SCHEMAS_DEFAULT = True def __init__(self, filename: str, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, - **kwargs): + portal_vapp: Optional[AbstractVirtualApp] = None, **kwargs): # This setup must be in place before the class initialization is done (via the super call). self.autoload_schemas: bool = self.AUTOLOAD_SCHEMAS_DEFAULT if autoload_schemas is None else autoload_schemas - if self.autoload_schemas: - if portal_env is None: + if self.autoload_schemas: # If autoload_schemas is False, we don't care about doing this defaulting. + if portal_env is None and portal_vapp is None: portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) PRINT(f"The portal_env was not explicitly supplied. 
Schemas will come from portal_env={portal_env!r}.") self.portal_env: Optional[str] = portal_env + self.portal_vapp: Optional[AbstractVirtualApp] = portal_vapp super().__init__(filename=filename, **kwargs) def fetch_relevant_schemas(self, schema_names: List[str]): # The schema_names argument is not normally given, but it is there for easier testing def fetch_schema(schema_name): - schema = self.fetch_schema(schema_name, portal_env=self.portal_env) + schema = self.fetch_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) return schema_name, schema - if self.autoload_schemas and self.portal_env: + if self.autoload_schemas and (self.portal_env or self.portal_vapp): autoloaded = {tab_name: schema for tab_name, schema in pmap(fetch_schema, schema_names)} return autoloaded @@ -637,9 +638,10 @@ def fetch_schema(schema_name): return {} @classmethod - def fetch_schema(cls, schema_name: str, *, portal_env: str): + def fetch_schema(cls, schema_name: str, *, portal_env: Optional[str] = None, + portal_vapp: Optional[AbstractVirtualApp] = None): def just_fetch_it(): - return get_schema(schema_name, ff_env=portal_env) + return get_schema(schema_name, portal_env=portal_env, portal_vapp=portal_vapp) if cls.CACHE_SCHEMAS: schema: Optional[AnyJsonData] = cls.SCHEMA_CACHE.get(schema_name) if schema is None: @@ -665,9 +667,16 @@ def __init__(self, filename: str, schemas: Optional[Dict[str, AnyJsonData]] = No self.patch_prototypes_by_tab_name: Dict[str, Dict] = {} self.parsed_headers_by_tab_name: Dict[str, ParsedHeaders] = {} self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = {} - self.schemas = schemas or self.fetch_relevant_schemas(self.tab_names) + self._schemas = schemas self._instaguid_context_table: Dict[str, str] = {} + @property + def schemas(self): + schemas = self._schemas + if schemas is None: + self._schemas = schemas = self.fetch_relevant_schemas(self.tab_names) + return schemas + def sheet_patch_prototype(self, tab_name: str) -> Dict: return self.patch_prototypes_by_tab_name[tab_name] @@ -841,12 +850,18 @@ class SimpleYamlInsertsManager(SimpleInsertsMixin, YamlInsertsMixin, InsertsMana class InsertsItemMixin(AbstractItemManager): # ItemManagerMixin isn't really appropriate here + """ + This class is used for inserts directories and other JSON-like data that will be literally used as an Item + without semantic pre-processing. In other words, these classes will not be pre-checked for semantic correctness + but instead assumed to have been checked by other means. + """ AUTOLOAD_SCHEMAS_DEFAULT = False # Has no effect, but someone might inspect the value. def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, - schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs): - ignored(portal_env) # Would only be used if autoload_schemas was requested, and we don't allow that. + portal_vapp: Optional[AbstractVirtualApp] = None, schemas: Optional[Dict[str, AnyJsonData]] = None, + **kwargs): + ignored(portal_env, portal_vapp) # Would only be used if autoload_schemas was true, and we don't allow that. 
if schemas not in [None, {}]: raise ValueError(f"{self.__class__.__name__} does not allow schemas={schemas!r}.") if autoload_schemas not in [None, False]: @@ -1038,12 +1053,24 @@ def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractItemM @classmethod def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None, + portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None, **kwargs) -> TabbedSheetData: """ - Given a filename and various options + Given a filename and various options, loads the items associated with that filename. + + :param filename: The name of the file to load. + :param tab_name: For files that lack multiple tabs (such as .csv or .tsv), + the tab name to associate with the data. + :param escaping: Whether to perform escape processing on backslashes. + :param schemas: A set of schemas to use instead of trying to load them. + :param autoload_schemas: Whether to try autoloading schemas. + :param portal_env: A portal to consult to find schemas (usually if calling from the outside of a portal). + :param portal_vapp: A vapp to use (usually if calling from within a portal). """ manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, - schemas=schemas, autoload_schemas=autoload_schemas, **kwargs) + schemas=schemas, autoload_schemas=autoload_schemas, + portal_env=portal_env, portal_vapp=portal_vapp, + **kwargs) return manager.load_content() diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 9a672d695..b9472a94c 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -4,10 +4,11 @@ import pytest from collections import namedtuple -from dcicutils import sheet_utils as sheet_utils_module +from dcicutils import sheet_utils as sheet_utils_module, ff_utils as ff_utils_module from dcicutils.common import AnyJsonData -from dcicutils.misc_utils import is_uuid, local_attrs -from dcicutils.qa_utils import printed_output +from dcicutils.env_utils import EnvUtils, public_env_name +from dcicutils.misc_utils import is_uuid, local_attrs, NamedObject, AbstractVirtualApp +from dcicutils.qa_utils import printed_output, mock_not_called, MockResponse from dcicutils.sheet_utils import ( # High-level interfaces ItemManager, load_items, TABLE_SET_MANAGER_REGISTRY, ITEM_MANAGER_REGISTRY, @@ -846,6 +847,10 @@ def test_schema_autoload_mixin_fetch_relevant_schemas(autoload_schemas, cache_sc @pytest.mark.integrated def test_workbook_with_schemas(): + print() # start o a fresh line + + SchemaAutoloadMixin.clear_schema_cache() + actual_data = CsvManager(filename=SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, tab_name='ExperimentSeq').load_content() expected_data = { "ExperimentSeq": [ @@ -876,3 +881,55 @@ def test_workbook_with_schemas(): ] } assert actual_items == expected_items + + +@using_fresh_ff_state_for_testing() +@pytest.mark.integrated +def test_workbook_with_schemas_and_portal_vapp(): + + print() # start on a fresh line + + SchemaAutoloadMixin.clear_schema_cache() + + portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) + + experiment_seq_schema = ff_utils_module.get_schema('ExperimentSeq', portal_env=portal_env) + + expected_items = { + "ExperimentSeq": [ + { + "accession": "foo", + "fragment_size_selection_method": "SPRI beads" + }, + { + "accession": "bar", + "fragment_size_selection_method": "BluePippin" + } + ] + } + + class MockVapp(NamedObject, AbstractVirtualApp): + + def 
__init__(self, name): + super().__init__(name=name) + self.call_count = 0 + + def get(self, path_url): + assert path_url.startswith('profiles/ExperimentSeq.json?') + self.call_count += 1 + response = MockResponse(200, json=experiment_seq_schema) + return response + + portal_vapp = MockVapp(name=f'MockVapp[{portal_env}]') + + old_count = portal_vapp.call_count + + with mock.patch.object(ff_utils_module, "get_authentication_with_server", + mock_not_called("get_authentication_with_server")): + with mock.patch.object(ff_utils_module, "get_metadata", + mock_not_called("get_metadata")): + actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, + tab_name='ExperimentSeq', autoload_schemas=True, portal_vapp=portal_vapp) + + assert portal_vapp.call_count == old_count + 1 + assert actual_items == expected_items From 630720f83da9c4be44c2aa9ab1393ac10b1ec6b2 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 7 Sep 2023 12:29:22 -0400 Subject: [PATCH 057/101] Simplifications per Will's code review. --- dcicutils/ff_utils.py | 6 ++---- test/test_ff_utils.py | 14 +++++++------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index 949d715e7..280bdc0df 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -994,8 +994,7 @@ def get_schema(name, key=None, ff_env: Optional[str] = None, portal_env: Optiona res = portal_vapp.get(full_url) return get_response_json(res) else: - auth = get_authentication_with_server(auth=key, ff_env=portal_env) - schema = get_metadata(obj_id=base_url, key=auth, add_on=add_on) + schema = get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on) return schema @@ -1026,8 +1025,7 @@ def get_schemas(key=None, ff_env: Optional[str] = None, *, allow_abstract: bool full_url = f"{base_url}?{add_on}" schemas: Dict[str, Dict] = portal_vapp.get(full_url) else: - auth = get_authentication_with_server(auth=key, ff_env=portal_env) - schemas: Dict[str, Dict] = get_metadata(obj_id=base_url, key=auth, add_on=add_on) + schemas: Dict[str, Dict] = get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on) filtered_schemas = {} for schema_name, schema in schemas.items(): if allow_abstract or not schema.get('isAbstract'): diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py index 633772bab..16413a519 100644 --- a/test/test_ff_utils.py +++ b/test/test_ff_utils.py @@ -1348,8 +1348,8 @@ def test_get_schema_with_vapp(): # When called with no vapp, get_metadata is consulted (after getting auth info) assert ff_utils.get_schema('User', **env_args) == sample_schema_metadata - mock_get_authentication_with_server.assert_called_once_with(auth=None, ff_env=expected_env) - mock_get_metadata.assert_called_once_with(obj_id='profiles/User.json', key=sample_auth, + mock_get_authentication_with_server.assert_not_called() + mock_get_metadata.assert_called_once_with(obj_id='profiles/User.json', key=None, ff_env=expected_env, add_on='frame=raw') sample_vapp.get.assert_not_called() @@ -1398,11 +1398,10 @@ def test_get_schemas_with_vapp(): mock_get_metadata.return_value = sample_schema_metadata mock_get_authentication_with_server.return_value = sample_auth - # When called with no vapp, get_metadata is consulted (after getting auth info) assert ff_utils.get_schemas(**env_args) == sample_schema_metadata - mock_get_authentication_with_server.assert_called_once_with(auth=None, ff_env=expected_env) - mock_get_metadata.assert_called_once_with(obj_id='profiles/', key=sample_auth, + 
mock_get_authentication_with_server.assert_not_called() + mock_get_metadata.assert_called_once_with(obj_id='profiles/', key=None, ff_env=expected_env, add_on='frame=raw') sample_vapp.get.assert_not_called() @@ -1456,9 +1455,10 @@ def mocked_schemas_subset(keys): with mock.patch.object(ff_utils, "get_metadata") as mock_get_metadata: - def mocked_get_metadata(obj_id, key, add_on): + def mocked_get_metadata(obj_id, key, ff_env, add_on): assert obj_id == "profiles/" # this is the web API to ask for all profiles - assert key == 'some-auth' # we assume auth is tested elsewhere + assert key is None # it would get looked up + assert ff_env is None # it would get looked up, too assert add_on == "frame=raw" # we presently always send this return mocked_schemas From cb5125ca6a33027fc3d3d7132bce120cc24bd8dd Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 11 Sep 2023 16:02:33 -0400 Subject: [PATCH 058/101] Refactor to have a separate bundle_utils.py --- dcicutils/bundle_utils.py | 503 +++++++++++++++++++++++++++ dcicutils/common.py | 6 + dcicutils/sheet_utils.py | 509 ++------------------------- test/test_bundle_utils.py | 702 +++++++++++++++++++++++++++++++++++++ test/test_common.py | 24 +- test/test_sheet_utils.py | 704 ++------------------------------------ 6 files changed, 1291 insertions(+), 1157 deletions(-) create mode 100644 dcicutils/bundle_utils.py create mode 100644 test/test_bundle_utils.py diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py new file mode 100644 index 000000000..d6d35d00d --- /dev/null +++ b/dcicutils/bundle_utils.py @@ -0,0 +1,503 @@ +import contextlib +import copy +# import os +import uuid + +from typing import Any, Dict, List, Optional, Type, Union +from .common import AnyJsonData # , Regexp, CsvReader +from .env_utils import EnvUtils, public_env_name +from .ff_utils import get_schema +from .lang_utils import there_are +from .misc_utils import AbstractVirtualApp, ignored, PRINT +from .sheet_utils import ( + Header, Headers, ParsedHeader, ParsedHeaders, SheetCellValue, SheetRow, TabbedSheetData, # SheetData, + prefer_number, + LoadTableError, + TableSetManagerRegistry, AbstractTableSetManager, BasicTableSetManager, + CsvManager, TsvManager, XlsxManager, + SimpleJsonInsertsManager, SimpleYamlInsertsManager, SimpleJsonLinesInsertsManager, + TabbedJsonInsertsManager, TabbedYamlInsertsManager, + InsertsDirectoryManager, +) +from .task_utils import pmap + + +@contextlib.contextmanager +def deferred_problems(): + problems = [] + + def note_problems(problem): + problems.append(problem) + + yield note_problems + + if problems: + for problem in problems: + PRINT(f"Problem: {problem}") + raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False)) + + +class AbstractItemManager(AbstractTableSetManager): + + pass + + +class TypeHint: + def apply_hint(self, value): + return value + + def __str__(self): + return f"<{self.__class__.__name__}>" + + def __repr__(self): + return self.__str__() + + +class BoolHint(TypeHint): + + def apply_hint(self, value): + if isinstance(value, str) and value: + if 'true'.startswith(value.lower()): + return True + elif 'false'.startswith(value.lower()): + return False + return super().apply_hint(value) + + +class EnumHint(TypeHint): + + def __str__(self): + return f"" + + def __init__(self, value_map): + self.value_map = value_map + + def apply_hint(self, value): + if isinstance(value, str): + if value in self.value_map: + result = self.value_map[value] + return result + else: + lvalue = 
value.lower() + found = [] + for lkey, key in self.value_map.items(): + if lkey.startswith(lvalue): + found.append(lkey) + if len(found) == 1: + [only_found] = found + result = self.value_map[only_found] + return result + return super().apply_hint(value) + + +OptionalTypeHints = List[Optional[TypeHint]] + + +class ItemTools: + """ + Implements operations on table-related data without pre-supposing the specific representation of the table. + It is assumed this can be used for data that was obtained from .json, .csv, .tsv, and .xlsx files because + it does not presuppose the source of the data nor where it will be written to. + + For the purpose of this class: + + * a 'header' is a string representing the top of a column. + + * a 'parsed header' is a list of strings and/or ints, after splitting at uses of '#' or '.', so that + "a.b.c" is represented as ["a", "b", "c"], and "x.y#0" is represented as ["x", "y", 0], and representing + each numeric token as an int instead of a string. + + * a 'headers' object is just a list of strings, each of which is a 'header'. + + * a 'parsed headers' object is a non-empty list of lists, each of which is a 'parsed header'. + e..g., the headers ["a.b.c", "x.y#0"] is represented as parsed hearders [["a", "b", "c"], ["x", "y", 0]]. + + """ + + @classmethod + def parse_sheet_header(cls, header: Header) -> ParsedHeader: + result = [] + token = "" + for i in range(len(header)): + ch = header[i] + if ch == '.' or ch == '#': + if token: + result.append(int(token) if token.isdigit() else token) + token = "" + else: + token += ch + if token: + result.append(int(token) if token.isdigit() else token) + return result + + @classmethod + def parse_sheet_headers(cls, headers: Headers): + return [cls.parse_sheet_header(header) + for header in headers] + + @classmethod + def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): + prototype = {} + for parsed_header in parsed_headers: + parsed_header0 = parsed_header[0] + if isinstance(parsed_header0, int): + raise LoadTableError(f"A header cannot begin with a numeric ref: {parsed_header0}") + cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header) + return prototype + + @classmethod + def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader): + [key0, *more_keys] = keys + key1 = more_keys[0] if more_keys else None + if isinstance(key1, int): + placeholder = [] + elif isinstance(key1, str): + placeholder = {} + else: + placeholder = None + if isinstance(key0, int): + n = len(parent) + if key0 == n: + parent.append(placeholder) + elif key0 > n: + raise LoadTableError("Numeric items must occur sequentially.") + elif isinstance(key0, str): + if key0 not in parent: + parent[key0] = placeholder + if key1 is not None: + cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys) + return parent + + INSTAGUIDS_ENABLED = False # Experimental feature not enabled by default + + @classmethod + def parse_item_value(cls, value: SheetCellValue, context=None) -> AnyJsonData: + # TODO: Remodularize this for easier testing and more Schema-driven effect + # Doug asks that this be broken up into different mechanisms, more modular and separately testable. + # I pretty much agree with that. I'm just waiting for suggestions on what kinds of features are desired. 
+ if isinstance(value, str): + lvalue = value.lower() + # TODO: We could consult a schema to make this less heuristic, but this may do for now + if lvalue == 'true': + return True + elif lvalue == 'false': + return False + elif lvalue == 'null' or lvalue == '': + return None + elif '|' in value: + if value == '|': # Use '|' for [] + return [] + else: + if value.endswith("|"): # Use 'foo|' for ['foo'] + value = value[:-1] + return [cls.parse_item_value(subvalue, context=context) for subvalue in value.split('|')] + elif cls.INSTAGUIDS_ENABLED and context is not None and value.startswith('#'): + # Note that this clause MUST follow '|' clause above so '#foo|#bar' isn't seen as instaguid + return cls.get_instaguid(value, context=context) + else: + # Doug points out that the schema might not agree, might want a string representation of a number. + # At this semantic layer, this might be a bad choice. + return prefer_number(value) + else: # presumably a number (int or float) + return value + + @classmethod + def get_instaguid(cls, guid_placeholder: str, *, context: Optional[Dict] = None): + if context is None: + return guid_placeholder + else: + referent = context.get(guid_placeholder) + if not referent: + context[guid_placeholder] = referent = str(uuid.uuid4()) + return referent + + @classmethod + def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): + if (value is None or value == '') and not force: + return + [key, *more_path] = path + if not more_path: + datum[key] = value + else: + cls.set_path_value(datum[key], more_path, value) + + @classmethod + def find_type_hint(cls, parsed_header: Optional[ParsedHeader], schema: Any): + + def finder(subheader, subschema): + if not parsed_header: + return None + else: + [key1, *other_headers] = subheader + if isinstance(key1, str) and isinstance(subschema, dict): + if subschema.get('type') == 'object': + def1 = subschema.get('properties', {}).get(key1) + if not other_headers: + if def1 is not None: + t = def1.get('type') + if t == 'string': + enum = def1.get('enum') + if enum: + mapping = {e.lower(): e for e in enum} + return EnumHint(mapping) + elif t == 'boolean': + return BoolHint() + else: + pass # fall through to asking super() + else: + pass # fall through to asking super() + else: + return finder(subheader=other_headers, subschema=def1) + + return finder(subheader=parsed_header, subschema=schema) + + +ITEM_MANAGER_REGISTRY = TableSetManagerRegistry() + + +class SchemaAutoloadMixin(AbstractTableSetManager): + + SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. + CACHE_SCHEMAS = True # Controls whether we're doing caching at all + AUTOLOAD_SCHEMAS_DEFAULT = True + + def __init__(self, filename: str, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, + portal_vapp: Optional[AbstractVirtualApp] = None, **kwargs): + # This setup must be in place before the class initialization is done (via the super call). + self.autoload_schemas: bool = self.AUTOLOAD_SCHEMAS_DEFAULT if autoload_schemas is None else autoload_schemas + if self.autoload_schemas: # If autoload_schemas is False, we don't care about doing this defaulting. + if portal_env is None and portal_vapp is None: + portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) + PRINT(f"The portal_env was not explicitly supplied. 
Schemas will come from portal_env={portal_env!r}.") + self.portal_env: Optional[str] = portal_env + self.portal_vapp: Optional[AbstractVirtualApp] = portal_vapp + super().__init__(filename=filename, **kwargs) + + def fetch_relevant_schemas(self, schema_names: List[str]): + # The schema_names argument is not normally given, but it is there for easier testing + def fetch_schema(schema_name): + schema = self.fetch_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) + return schema_name, schema + if self.autoload_schemas and (self.portal_env or self.portal_vapp): + autoloaded = {tab_name: schema + for tab_name, schema in pmap(fetch_schema, schema_names)} + return autoloaded + else: + return {} + + @classmethod + def fetch_schema(cls, schema_name: str, *, portal_env: Optional[str] = None, + portal_vapp: Optional[AbstractVirtualApp] = None): + def just_fetch_it(): + return get_schema(schema_name, portal_env=portal_env, portal_vapp=portal_vapp) + if cls.CACHE_SCHEMAS: + schema: Optional[AnyJsonData] = cls.SCHEMA_CACHE.get(schema_name) + if schema is None: + cls.SCHEMA_CACHE[schema_name] = schema = just_fetch_it() + return schema + else: + return just_fetch_it() + + @classmethod + def clear_schema_cache(cls): + for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first + cls.SCHEMA_CACHE.pop(key, None) + + +class ItemManagerMixin(SchemaAutoloadMixin, AbstractItemManager, BasicTableSetManager): + """ + This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows + get handled like Items instead of just flat table rows. + """ + + def __init__(self, filename: str, schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs): + super().__init__(filename=filename, **kwargs) + self.patch_prototypes_by_tab_name: Dict[str, Dict] = {} + self.parsed_headers_by_tab_name: Dict[str, ParsedHeaders] = {} + self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = {} + self._schemas = schemas + self._instaguid_context_table: Dict[str, str] = {} + + @property + def schemas(self): + schemas = self._schemas + if schemas is None: + self._schemas = schemas = self.fetch_relevant_schemas(self.tab_names) + return schemas + + def sheet_patch_prototype(self, tab_name: str) -> Dict: + return self.patch_prototypes_by_tab_name[tab_name] + + def sheet_parsed_headers(self, tab_name: str) -> ParsedHeaders: + return self.parsed_headers_by_tab_name[tab_name] + + def sheet_type_hints(self, tab_name: str) -> OptionalTypeHints: + return self.type_hints_by_tab_name[tab_name] + + class SheetState: + + def __init__(self, parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints): + self.parsed_headers = parsed_headers + self.type_hints = type_hints + + def _compile_type_hints(self, tab_name: str): + parsed_headers = self.sheet_parsed_headers(tab_name) + schema = self.schemas.get(tab_name) + with deferred_problems() as note_problem: + for required_header in self._schema_required_headers(schema): + if required_header not in parsed_headers: + note_problem("Missing required header") + type_hints = [ItemTools.find_type_hint(parsed_header, schema) if schema else None + for parsed_header in parsed_headers] + self.type_hints_by_tab_name[tab_name] = type_hints + + @classmethod + def _schema_required_headers(cls, schema): + ignored(schema) + return [] # TODO: Make this compute a list of required headers (in parsed header form) + + def _compile_sheet_headers(self, tab_name: str): + headers = self.headers_by_tab_name[tab_name] + 
parsed_headers = ItemTools.parse_sheet_headers(headers) + self.parsed_headers_by_tab_name[tab_name] = parsed_headers + prototype = ItemTools.compute_patch_prototype(parsed_headers) + self.patch_prototypes_by_tab_name[tab_name] = prototype + + def _create_tab_processor_state(self, tab_name: str) -> SheetState: + super()._create_tab_processor_state(tab_name) + # This will create state that allows us to efficiently assign values in the right place on each row + # by setting up a prototype we can copy and then drop values into. + self._compile_sheet_headers(tab_name) + self._compile_type_hints(tab_name) + return self.SheetState(parsed_headers=self.sheet_parsed_headers(tab_name), + type_hints=self.sheet_type_hints(tab_name)) + + def _process_row(self, tab_name: str, state: SheetState, row_data: SheetRow) -> AnyJsonData: + parsed_headers = state.parsed_headers + type_hints = state.type_hints + patch_item = copy.deepcopy(self.sheet_patch_prototype(tab_name)) + for i, value in enumerate(row_data): + parsed_value = self.parse_cell_value(value) + type_hint = type_hints[i] + if type_hint: + parsed_value = type_hint.apply_hint(parsed_value) + ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value) + return patch_item + + def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: + return ItemTools.parse_item_value(value, context=self._instaguid_context_table) + + +class InsertsItemMixin(AbstractItemManager): # ItemManagerMixin isn't really appropriate here + """ + This class is used for inserts directories and other JSON-like data that will be literally used as an Item + without semantic pre-processing. In other words, these classes will not be pre-checked for semantic correctness + but instead assumed to have been checked by other means. + """ + + AUTOLOAD_SCHEMAS_DEFAULT = False # Has no effect, but someone might inspect the value. + + def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, + portal_vapp: Optional[AbstractVirtualApp] = None, schemas: Optional[Dict[str, AnyJsonData]] = None, + **kwargs): + ignored(portal_env, portal_vapp) # Would only be used if autoload_schemas was true, and we don't allow that. + if schemas not in [None, {}]: + raise ValueError(f"{self.__class__.__name__} does not allow schemas={schemas!r}.") + if autoload_schemas not in [None, False]: + raise ValueError(f"{self.__class__.__name__} does not allow autoload_schemas={autoload_schemas!r}.") + super().__init__(filename=filename, **kwargs) + + +@ITEM_MANAGER_REGISTRY.register() +class TabbedJsonInsertsItemManager(InsertsItemMixin, TabbedJsonInsertsManager): + pass + + +@ITEM_MANAGER_REGISTRY.register() +class SimpleJsonInsertsItemManager(InsertsItemMixin, SimpleJsonInsertsManager): + pass + + +@ITEM_MANAGER_REGISTRY.register() +class TabbedYamlInsertsItemManager(InsertsItemMixin, TabbedYamlInsertsManager): + pass + + +@ITEM_MANAGER_REGISTRY.register() +class SimpleYamlInsertsItemManager(InsertsItemMixin, SimpleYamlInsertsManager): + pass + + +@ITEM_MANAGER_REGISTRY.register() +class XlsxItemManager(ItemManagerMixin, XlsxManager): + """ + This layers item-style row processing functionality on an XLSX file. 
+ """ + pass + + +@ITEM_MANAGER_REGISTRY.register() +class SimpleJsonLinesInsertsItemManager(InsertsItemMixin, SimpleJsonLinesInsertsManager): + pass + + +@ITEM_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") +class InsertsDirectoryItemManager(InsertsItemMixin, InsertsDirectoryManager): + pass + + +@ITEM_MANAGER_REGISTRY.register() +class CsvItemManager(ItemManagerMixin, CsvManager): + """ + This layers item-style row processing functionality on a CSV file. + """ + pass + + +@ITEM_MANAGER_REGISTRY.register() +class TsvItemManager(ItemManagerMixin, TsvManager): + """ + This layers item-style row processing functionality on a TSV file. + """ + pass + + +class ItemManager(AbstractTableSetManager): + """ + This class will open a .xlsx or .csv file and load its content in our standard format. + (See more detailed description in AbstractTableManager.) + """ + + @classmethod + def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractItemManager: + reader_agent_class: Type[AbstractTableSetManager] = ITEM_MANAGER_REGISTRY.manager_for_filename(filename) + if not issubclass(reader_agent_class, AbstractItemManager): + raise ValueError(f"ItemManager unexpectedly found reader agent class {reader_agent_class}.") + reader_agent_class: Type[AbstractItemManager] + reader_agent = reader_agent_class(filename=filename, **kwargs) + return reader_agent + + @classmethod + def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, + schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None, + portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None, + **kwargs) -> TabbedSheetData: + """ + Given a filename and various options, loads the items associated with that filename. + + :param filename: The name of the file to load. + :param tab_name: For files that lack multiple tabs (such as .csv or .tsv), + the tab name to associate with the data. + :param escaping: Whether to perform escape processing on backslashes. + :param schemas: A set of schemas to use instead of trying to load them. + :param autoload_schemas: Whether to try autoloading schemas. + :param portal_env: A portal to consult to find schemas (usually if calling from the outside of a portal). + :param portal_vapp: A vapp to use (usually if calling from within a portal). 
+ """ + manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, + schemas=schemas, autoload_schemas=autoload_schemas, + portal_env=portal_env, portal_vapp=portal_vapp, + **kwargs) + return manager.load_content() + +load_items = ItemManager.load \ No newline at end of file diff --git a/dcicutils/common.py b/dcicutils/common.py index b4f487cf3..f1893af1c 100644 --- a/dcicutils/common.py +++ b/dcicutils/common.py @@ -1,5 +1,8 @@ +import csv import os +import re +from tempfile import TemporaryFile from typing import Dict, Union, Tuple, List, Any from typing_extensions import Literal @@ -69,6 +72,9 @@ PortalEnvName = str +Regexp = type(re.compile("sample")) + +CsvReader = type(csv.reader(TemporaryFile())) # ===== AWS Data ===== diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index a3c6e02d5..41a130ab0 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -1,6 +1,6 @@ import chardet -import contextlib -import copy +# import contextlib +# import copy import csv import glob import io @@ -8,19 +8,19 @@ import openpyxl import os import re -import uuid +# import uuid import yaml from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook from tempfile import TemporaryFile from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union -from .common import AnyJsonData -from .env_utils import public_env_name, EnvUtils -from .ff_utils import get_schema -from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize, there_are -from .misc_utils import ignored, PRINT, pad_to, JsonLinesReader, AbstractVirtualApp -from .task_utils import pmap +from .common import AnyJsonData, Regexp +# from .env_utils import public_env_name, EnvUtils +# from .ff_utils import get_schema +from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize # , there_are +from .misc_utils import ignored, pad_to, JsonLinesReader # , PRINT, AbstractVirtualApp +# from .task_utils import pmap Header = str @@ -32,7 +32,6 @@ CsvReader = type(csv.reader(TemporaryFile())) SheetData = List[dict] TabbedSheetData = Dict[str, SheetData] -Regexp = type(re.compile("sample")) class LoadFailure(Exception): @@ -59,21 +58,6 @@ class LoadTableError(LoadFailure): pass -@contextlib.contextmanager -def deferred_problems(): - problems = [] - - def note_problems(problem): - problems.append(problem) - - yield note_problems - - if problems: - for problem in problems: - PRINT(f"Problem: {problem}") - raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False)) - - def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): if kwargs: unwanted = [f"{argname}={value!r}" if detailed else argname @@ -141,218 +125,7 @@ def open_unicode_text_input_file_respecting_byte_order_mark(filename): return io.open(filename, 'r', encoding=use_encoding) -class TypeHint: - def apply_hint(self, value): - return value - - def __str__(self): - return f"<{self.__class__.__name__}>" - - def __repr__(self): - return self.__str__() - - -class BoolHint(TypeHint): - - def apply_hint(self, value): - if isinstance(value, str) and value: - if 'true'.startswith(value.lower()): - return True - elif 'false'.startswith(value.lower()): - return False - return super().apply_hint(value) - - -class EnumHint(TypeHint): - - def __str__(self): - return f"" - - def __init__(self, value_map): - self.value_map = value_map - - def apply_hint(self, value): - if isinstance(value, str): - if value in 
self.value_map: - result = self.value_map[value] - return result - else: - lvalue = value.lower() - found = [] - for lkey, key in self.value_map.items(): - if lkey.startswith(lvalue): - found.append(lkey) - if len(found) == 1: - [only_found] = found - result = self.value_map[only_found] - return result - return super().apply_hint(value) - - -OptionalTypeHints = List[Optional[TypeHint]] - - -class ItemTools: - """ - Implements operations on table-related data without pre-supposing the specific representation of the table. - It is assumed this can be used for data that was obtained from .json, .csv, .tsv, and .xlsx files because - it does not presuppose the source of the data nor where it will be written to. - - For the purpose of this class: - - * a 'header' is a string representing the top of a column. - * a 'parsed header' is a list of strings and/or ints, after splitting at uses of '#' or '.', so that - "a.b.c" is represented as ["a", "b", "c"], and "x.y#0" is represented as ["x", "y", 0], and representing - each numeric token as an int instead of a string. - - * a 'headers' object is just a list of strings, each of which is a 'header'. - - * a 'parsed headers' object is a non-empty list of lists, each of which is a 'parsed header'. - e..g., the headers ["a.b.c", "x.y#0"] is represented as parsed hearders [["a", "b", "c"], ["x", "y", 0]]. - - """ - - @classmethod - def parse_sheet_header(cls, header: Header) -> ParsedHeader: - result = [] - token = "" - for i in range(len(header)): - ch = header[i] - if ch == '.' or ch == '#': - if token: - result.append(int(token) if token.isdigit() else token) - token = "" - else: - token += ch - if token: - result.append(int(token) if token.isdigit() else token) - return result - - @classmethod - def parse_sheet_headers(cls, headers: Headers): - return [cls.parse_sheet_header(header) - for header in headers] - - @classmethod - def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): - prototype = {} - for parsed_header in parsed_headers: - parsed_header0 = parsed_header[0] - if isinstance(parsed_header0, int): - raise LoadTableError(f"A header cannot begin with a numeric ref: {parsed_header0}") - cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header) - return prototype - - @classmethod - def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader): - [key0, *more_keys] = keys - key1 = more_keys[0] if more_keys else None - if isinstance(key1, int): - placeholder = [] - elif isinstance(key1, str): - placeholder = {} - else: - placeholder = None - if isinstance(key0, int): - n = len(parent) - if key0 == n: - parent.append(placeholder) - elif key0 > n: - raise LoadTableError("Numeric items must occur sequentially.") - elif isinstance(key0, str): - if key0 not in parent: - parent[key0] = placeholder - if key1 is not None: - cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys) - return parent - - INSTAGUIDS_ENABLED = False # Experimental feature not enabled by default - - @classmethod - def parse_item_value(cls, value: SheetCellValue, context=None) -> AnyJsonData: - # TODO: Remodularize this for easier testing and more Schema-driven effect - # Doug asks that this be broken up into different mechanisms, more modular and separately testable. - # I pretty much agree with that. I'm just waiting for suggestions on what kinds of features are desired. 
- if isinstance(value, str): - lvalue = value.lower() - # TODO: We could consult a schema to make this less heuristic, but this may do for now - if lvalue == 'true': - return True - elif lvalue == 'false': - return False - elif lvalue == 'null' or lvalue == '': - return None - elif '|' in value: - if value == '|': # Use '|' for [] - return [] - else: - if value.endswith("|"): # Use 'foo|' for ['foo'] - value = value[:-1] - return [cls.parse_item_value(subvalue, context=context) for subvalue in value.split('|')] - elif cls.INSTAGUIDS_ENABLED and context is not None and value.startswith('#'): - # Note that this clause MUST follow '|' clause above so '#foo|#bar' isn't seen as instaguid - return cls.get_instaguid(value, context=context) - else: - # Doug points out that the schema might not agree, might want a string representation of a number. - # At this semantic layer, this might be a bad choice. - return prefer_number(value) - else: # presumably a number (int or float) - return value - - @classmethod - def get_instaguid(cls, guid_placeholder: str, *, context: Optional[Dict] = None): - if context is None: - return guid_placeholder - else: - referent = context.get(guid_placeholder) - if not referent: - context[guid_placeholder] = referent = str(uuid.uuid4()) - return referent - - @classmethod - def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): - if (value is None or value == '') and not force: - return - [key, *more_path] = path - if not more_path: - datum[key] = value - else: - cls.set_path_value(datum[key], more_path, value) - - @classmethod - def find_type_hint(cls, parsed_header: Optional[ParsedHeader], schema: Any): - - def finder(subheader, subschema): - if not parsed_header: - return None - else: - [key1, *other_headers] = subheader - if isinstance(key1, str) and isinstance(subschema, dict): - if subschema.get('type') == 'object': - def1 = subschema.get('properties', {}).get(key1) - if not other_headers: - if def1 is not None: - t = def1.get('type') - if t == 'string': - enum = def1.get('enum') - if enum: - mapping = {e.lower(): e for e in enum} - return EnumHint(mapping) - elif t == 'boolean': - return BoolHint() - else: - pass # fall through to asking super() - else: - pass # fall through to asking super() - else: - return finder(subheader=other_headers, subschema=def1) - - return finder(subheader=parsed_header, subschema=schema) - - @classmethod - def infer_tab_name(cls, filename): - return os.path.basename(filename).split('.')[0] # TODO: Consider whether this might want to be an abstract base class. Some change might be needed. 
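For review purposes, a minimal sketch of how the relocated ItemTools helpers compose; this is illustrative only and assumes just the parse_sheet_headers, compute_patch_prototype, parse_item_value, and set_path_value definitions shown in the bundle_utils.py hunks above (the headers and row values here are made up):

    from copy import deepcopy
    from dcicutils.bundle_utils import ItemTools

    headers = ["name", "address.city", "aliases#0", "aliases#1"]   # hypothetical sheet headers
    parsed = ItemTools.parse_sheet_headers(headers)
    # parsed == [['name'], ['address', 'city'], ['aliases', 0], ['aliases', 1]]
    prototype = ItemTools.compute_patch_prototype(parsed)
    # prototype == {'name': None, 'address': {'city': None}, 'aliases': [None, None]}

    row = ["alice", "Boston", "ali", "al"]                         # one hypothetical data row
    item = deepcopy(prototype)
    for path, cell in zip(parsed, row):
        ItemTools.set_path_value(item, path, ItemTools.parse_item_value(cell))
    # item == {'name': 'alice', 'address': {'city': 'Boston'}, 'aliases': ['ali', 'al']}

This is the same prototype-copy-then-set pattern that ItemManagerMixin._process_row applies to each row, with type hints applied in between when a schema is available.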
@@ -506,11 +279,6 @@ def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: return prefer_number(value) -class AbstractItemManager(AbstractTableSetManager): - - pass - - class TableSetManagerRegistry: def __init__(self): @@ -542,7 +310,7 @@ def manager_for_filename(self, filename: str) -> Type[AbstractTableSetManager]: if found: return found else: - special_case: Optional[Type[AbstractItemManager]] = self.manager_for_special_filename(filename) + special_case: Optional[Type[AbstractTableSetManager]] = self.manager_for_special_filename(filename) if special_case: return special_case raise LoadArgumentsError(f"Unknown file type: {filename}") @@ -555,7 +323,7 @@ def manager_for_special_filename(self, filename: str) -> Optional[Type[AbstractT TABLE_SET_MANAGER_REGISTRY = TableSetManagerRegistry() -ITEM_MANAGER_REGISTRY = TableSetManagerRegistry() + @TABLE_SET_MANAGER_REGISTRY.register() @@ -607,151 +375,14 @@ def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> A for i, row_datum in enumerate(row_data)} -class SchemaAutoloadMixin(AbstractTableSetManager): - - SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. - CACHE_SCHEMAS = True # Controls whether we're doing caching at all - AUTOLOAD_SCHEMAS_DEFAULT = True - - def __init__(self, filename: str, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, - portal_vapp: Optional[AbstractVirtualApp] = None, **kwargs): - # This setup must be in place before the class initialization is done (via the super call). - self.autoload_schemas: bool = self.AUTOLOAD_SCHEMAS_DEFAULT if autoload_schemas is None else autoload_schemas - if self.autoload_schemas: # If autoload_schemas is False, we don't care about doing this defaulting. - if portal_env is None and portal_vapp is None: - portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) - PRINT(f"The portal_env was not explicitly supplied. 
Schemas will come from portal_env={portal_env!r}.") - self.portal_env: Optional[str] = portal_env - self.portal_vapp: Optional[AbstractVirtualApp] = portal_vapp - super().__init__(filename=filename, **kwargs) - - def fetch_relevant_schemas(self, schema_names: List[str]): - # The schema_names argument is not normally given, but it is there for easier testing - def fetch_schema(schema_name): - schema = self.fetch_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) - return schema_name, schema - if self.autoload_schemas and (self.portal_env or self.portal_vapp): - autoloaded = {tab_name: schema - for tab_name, schema in pmap(fetch_schema, schema_names)} - return autoloaded - else: - return {} - - @classmethod - def fetch_schema(cls, schema_name: str, *, portal_env: Optional[str] = None, - portal_vapp: Optional[AbstractVirtualApp] = None): - def just_fetch_it(): - return get_schema(schema_name, portal_env=portal_env, portal_vapp=portal_vapp) - if cls.CACHE_SCHEMAS: - schema: Optional[AnyJsonData] = cls.SCHEMA_CACHE.get(schema_name) - if schema is None: - cls.SCHEMA_CACHE[schema_name] = schema = just_fetch_it() - return schema - else: - return just_fetch_it() - - @classmethod - def clear_schema_cache(cls): - for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first - cls.SCHEMA_CACHE.pop(key, None) - - -class ItemManagerMixin(SchemaAutoloadMixin, AbstractItemManager, BasicTableSetManager): - """ - This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows - get handled like Items instead of just flat table rows. - """ - - def __init__(self, filename: str, schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs): - super().__init__(filename=filename, **kwargs) - self.patch_prototypes_by_tab_name: Dict[str, Dict] = {} - self.parsed_headers_by_tab_name: Dict[str, ParsedHeaders] = {} - self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = {} - self._schemas = schemas - self._instaguid_context_table: Dict[str, str] = {} - - @property - def schemas(self): - schemas = self._schemas - if schemas is None: - self._schemas = schemas = self.fetch_relevant_schemas(self.tab_names) - return schemas - - def sheet_patch_prototype(self, tab_name: str) -> Dict: - return self.patch_prototypes_by_tab_name[tab_name] - - def sheet_parsed_headers(self, tab_name: str) -> ParsedHeaders: - return self.parsed_headers_by_tab_name[tab_name] - - def sheet_type_hints(self, tab_name: str) -> OptionalTypeHints: - return self.type_hints_by_tab_name[tab_name] - - class SheetState: - - def __init__(self, parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints): - self.parsed_headers = parsed_headers - self.type_hints = type_hints - - def _compile_type_hints(self, tab_name: str): - parsed_headers = self.sheet_parsed_headers(tab_name) - schema = self.schemas.get(tab_name) - with deferred_problems() as note_problem: - for required_header in self._schema_required_headers(schema): - if required_header not in parsed_headers: - note_problem("Missing required header") - type_hints = [ItemTools.find_type_hint(parsed_header, schema) if schema else None - for parsed_header in parsed_headers] - self.type_hints_by_tab_name[tab_name] = type_hints - - @classmethod - def _schema_required_headers(cls, schema): - ignored(schema) - return [] # TODO: Make this compute a list of required headers (in parsed header form) - - def _compile_sheet_headers(self, tab_name: str): - headers = self.headers_by_tab_name[tab_name] - 
parsed_headers = ItemTools.parse_sheet_headers(headers) - self.parsed_headers_by_tab_name[tab_name] = parsed_headers - prototype = ItemTools.compute_patch_prototype(parsed_headers) - self.patch_prototypes_by_tab_name[tab_name] = prototype - - def _create_tab_processor_state(self, tab_name: str) -> SheetState: - super()._create_tab_processor_state(tab_name) - # This will create state that allows us to efficiently assign values in the right place on each row - # by setting up a prototype we can copy and then drop values into. - self._compile_sheet_headers(tab_name) - self._compile_type_hints(tab_name) - return self.SheetState(parsed_headers=self.sheet_parsed_headers(tab_name), - type_hints=self.sheet_type_hints(tab_name)) - - def _process_row(self, tab_name: str, state: SheetState, row_data: SheetRow) -> AnyJsonData: - parsed_headers = state.parsed_headers - type_hints = state.type_hints - patch_item = copy.deepcopy(self.sheet_patch_prototype(tab_name)) - for i, value in enumerate(row_data): - parsed_value = self.parse_cell_value(value) - type_hint = type_hints[i] - if type_hint: - parsed_value = type_hint.apply_hint(parsed_value) - ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value) - return patch_item - - def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: - return ItemTools.parse_item_value(value, context=self._instaguid_context_table) - - -@ITEM_MANAGER_REGISTRY.register() -class XlsxItemManager(ItemManagerMixin, XlsxManager): - """ - This layers item-style row processing functionality on an XLSX file. - """ - pass +def infer_tab_name_from_filename(filename): + return os.path.basename(filename).split('.')[0] class SingleTableMixin(AbstractTableSetManager): def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs): - self._tab_name = tab_name or ItemTools.infer_tab_name(filename) + self._tab_name = tab_name or infer_tab_name_from_filename(filename) super().__init__(filename=filename, **kwargs) @property @@ -759,7 +390,7 @@ def tab_names(self) -> List[str]: return [self._tab_name] -class InsertsManager(BasicTableSetManager): # ItemManagerMixin isn't really appropriate here +class InsertsManager(BasicTableSetManager): ALLOWED_FILE_EXTENSIONS = [] @@ -767,12 +398,12 @@ def _parse_inserts_data(self, filename: str) -> AnyJsonData: raise NotImplementedError(f"._parse_inserts_dataa(...) 
is not implemented for {self.__class__.__name__}.") # noQA def _load_inserts_data(self, filename: str) -> TabbedSheetData: - data: AnyJsonData = self._parse_inserts_data(filename) - tabbed_inserts: AnyJsonData = self._wrap_inserts_data(filename, data) + raw_data: AnyJsonData = self._parse_inserts_data(filename) + tabbed_inserts: AnyJsonData = self._wrap_inserts_data(filename, raw_data) if (not isinstance(tabbed_inserts, dict) or not all(isinstance(tab_name, str) for tab_name in tabbed_inserts.keys()) - or not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) - for content in tabbed_inserts.values())): + or not all(isinstance(data, list) and all(isinstance(datum, dict) for datum in data) + for data in tabbed_inserts.values())): raise ValueError(f"Data in {filename} is not of type TabbedSheetData (Dict[str, List[dict]]).") tabbed_inserts: TabbedSheetData # we've just checked that return tabbed_inserts @@ -804,7 +435,7 @@ class SimpleInsertsMixin(SingleTableMixin): def _wrap_inserts_data(self, filename: str, data: AnyJsonData) -> TabbedSheetData: if (not isinstance(data, list) - or not all(isinstance(item, dict) for item in data)): + or not all(isinstance(datum, dict) for datum in data)): raise ValueError(f"Data in {filename} is not of type SheetData (List[dict]).") return {self._tab_name: data} @@ -849,44 +480,7 @@ class SimpleYamlInsertsManager(SimpleInsertsMixin, YamlInsertsMixin, InsertsMana ALLOWED_FILE_EXTENSIONS = [".yaml"] -class InsertsItemMixin(AbstractItemManager): # ItemManagerMixin isn't really appropriate here - """ - This class is used for inserts directories and other JSON-like data that will be literally used as an Item - without semantic pre-processing. In other words, these classes will not be pre-checked for semantic correctness - but instead assumed to have been checked by other means. - """ - - AUTOLOAD_SCHEMAS_DEFAULT = False # Has no effect, but someone might inspect the value. - def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, - portal_vapp: Optional[AbstractVirtualApp] = None, schemas: Optional[Dict[str, AnyJsonData]] = None, - **kwargs): - ignored(portal_env, portal_vapp) # Would only be used if autoload_schemas was true, and we don't allow that. 
- if schemas not in [None, {}]: - raise ValueError(f"{self.__class__.__name__} does not allow schemas={schemas!r}.") - if autoload_schemas not in [None, False]: - raise ValueError(f"{self.__class__.__name__} does not allow autoload_schemas={autoload_schemas!r}.") - super().__init__(filename=filename, **kwargs) - - -@ITEM_MANAGER_REGISTRY.register() -class TabbedJsonInsertsItemManager(InsertsItemMixin, TabbedJsonInsertsManager): - pass - - -@ITEM_MANAGER_REGISTRY.register() -class SimpleJsonInsertsItemManager(InsertsItemMixin, SimpleJsonInsertsManager): - pass - - -@ITEM_MANAGER_REGISTRY.register() -class TabbedYamlInsertsItemManager(InsertsItemMixin, TabbedYamlInsertsManager): - pass - - -@ITEM_MANAGER_REGISTRY.register() -class SimpleYamlInsertsItemManager(InsertsItemMixin, SimpleYamlInsertsManager): - pass @TABLE_SET_MANAGER_REGISTRY.register() @@ -898,9 +492,6 @@ def _parse_inserts_data(self, filename: str) -> AnyJsonData: return [line for line in JsonLinesReader(open_unicode_text_input_file_respecting_byte_order_mark(filename))] -@ITEM_MANAGER_REGISTRY.register() -class SimpleJsonLinesInsertsItemManager(InsertsItemMixin, SimpleJsonLinesInsertsManager): - pass @TABLE_SET_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") @@ -924,9 +515,6 @@ def _parse_inserts_data(self, filename: str) -> AnyJsonData: return data -@ITEM_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") -class InsertsDirectoryItemManager(InsertsItemMixin, InsertsDirectoryManager): - pass @TABLE_SET_MANAGER_REGISTRY.register() @@ -982,12 +570,6 @@ def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> A for i, cell_text in enumerate(row_data)} -@ITEM_MANAGER_REGISTRY.register() -class CsvItemManager(ItemManagerMixin, CsvManager): - """ - This layers item-style row processing functionality on a CSV file. - """ - pass @TABLE_SET_MANAGER_REGISTRY.register() @@ -1003,12 +585,6 @@ def _get_reader_agent_for_filename(cls, filename) -> CsvReader: return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') -@ITEM_MANAGER_REGISTRY.register() -class TsvItemManager(ItemManagerMixin, TsvManager): - """ - This layers item-style row processing functionality on a TSV file. - """ - pass class TableSetManager(AbstractTableSetManager): @@ -1020,8 +596,11 @@ class TableSetManager(AbstractTableSetManager): @classmethod def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractTableSetManager: reader_agent_class = TABLE_SET_MANAGER_REGISTRY.manager_for_filename(filename) - if issubclass(reader_agent_class, AbstractItemManager): - raise ValueError(f"TableSetManager unexpectedly found reader agent class {reader_agent_class}.") + # This is a bad forward reference in current refactor, but also may be testing for something we don't need + # to worry about anymore. -kmp 11-Sep-2023 + # + # if issubclass(reader_agent_class, AbstractItemManager): + # raise ValueError(f"TableSetManager unexpectedly found reader agent class {reader_agent_class}.") reader_agent = reader_agent_class(filename=filename, **kwargs) return reader_agent @@ -1035,44 +614,8 @@ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[ return manager.load_content() -class ItemManager(AbstractTableSetManager): - """ - This class will open a .xlsx or .csv file and load its content in our standard format. - (See more detailed description in AbstractTableManager.) 
- """ - - @classmethod - def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractItemManager: - reader_agent_class: Type[AbstractTableSetManager] = ITEM_MANAGER_REGISTRY.manager_for_filename(filename) - if not issubclass(reader_agent_class, AbstractItemManager): - raise ValueError(f"ItemManager unexpectedly found reader agent class {reader_agent_class}.") - reader_agent_class: Type[AbstractItemManager] - reader_agent = reader_agent_class(filename=filename, **kwargs) - return reader_agent - @classmethod - def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, - schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None, - portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None, - **kwargs) -> TabbedSheetData: - """ - Given a filename and various options, loads the items associated with that filename. - - :param filename: The name of the file to load. - :param tab_name: For files that lack multiple tabs (such as .csv or .tsv), - the tab name to associate with the data. - :param escaping: Whether to perform escape processing on backslashes. - :param schemas: A set of schemas to use instead of trying to load them. - :param autoload_schemas: Whether to try autoloading schemas. - :param portal_env: A portal to consult to find schemas (usually if calling from the outside of a portal). - :param portal_vapp: A vapp to use (usually if calling from within a portal). - """ - manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, - schemas=schemas, autoload_schemas=autoload_schemas, - portal_env=portal_env, portal_vapp=portal_vapp, - **kwargs) - return manager.load_content() load_table_set = TableSetManager.load -load_items = ItemManager.load + diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py new file mode 100644 index 000000000..48c8fb2af --- /dev/null +++ b/test/test_bundle_utils.py @@ -0,0 +1,702 @@ +import contextlib +import json +import os +import pytest + +# from collections import namedtuple +from dcicutils import bundle_utils as bundle_utils_module, ff_utils as ff_utils_module +from dcicutils.common import AnyJsonData +from dcicutils.env_utils import EnvUtils, public_env_name +from dcicutils.misc_utils import is_uuid, local_attrs, NamedObject, AbstractVirtualApp +from dcicutils.qa_utils import printed_output, mock_not_called, MockResponse +from dcicutils.bundle_utils import ( + # High-level interfaces + ItemManager, load_items, ITEM_MANAGER_REGISTRY, + # Low-level implementation + SchemaAutoloadMixin, + ItemTools, XlsxItemManager, + CsvItemManager, TsvItemManager, + # TypeHint, EnumHint, + BoolHint, +) +from dcicutils.sheet_utils import ( + # High-level interfaces + # TABLE_SET_MANAGER_REGISTRY, + # Low-level implementation + # BasicTableSetManager, + # XlsxManager, + CsvManager, # TsvManager, + # Error handling + LoadArgumentsError, LoadTableError, # LoadFailure, + # Utilities + infer_tab_name_from_filename, # prefer_number, unwanted_kwargs, expand_string_escape_sequences, +) +from typing import Dict, Optional +from unittest import mock +from .conftest_settings import TEST_DIR +from .helpers import using_fresh_ff_state_for_testing +from .test_sheet_utils import ( + SAMPLE_XLSX_FILE, SAMPLE_XLSX_FILE_ITEM_CONTENT, # SAMPLE_XLSX_FILE_RAW_CONTENT, + SAMPLE_CSV_FILE, SAMPLE_CSV_FILE_ITEM_CONTENT, # SAMPLE_CSV_FILE_RAW_CONTENT, + SAMPLE_TSV_FILE, SAMPLE_TSV_FILE_ITEM_CONTENT, # SAMPLE_TSV_FILE_RAW_CONTENT, + SAMPLE_JSON_TABS_FILE, 
SAMPLE_JSON_TABS_FILE_ITEM_CONTENT, + SAMPLE_YAML_TABS_FILE, +) + + +def test_item_tools_parse_sheet_header(): + assert ItemTools.parse_sheet_header('.a') == ['a'] + assert ItemTools.parse_sheet_header('a') == ['a'] + assert ItemTools.parse_sheet_header('#0') == [0] + assert ItemTools.parse_sheet_header('0') == [0] + assert ItemTools.parse_sheet_header('foo.bar') == ['foo', 'bar'] + assert ItemTools.parse_sheet_header('a.b#0') == ['a', 'b', 0] + assert ItemTools.parse_sheet_header('x.xx#17#8.z') == ['x', 'xx', 17, 8, 'z'] + + # We don't error-check this, but it shouldn't matter + assert ItemTools.parse_sheet_header('#abc') == ['abc'] + assert ItemTools.parse_sheet_header('.123') == [123] + assert ItemTools.parse_sheet_header('#abc.123#456.def') == ['abc', 123, 456, 'def'] + + +def test_item_tools_parse_sheet_headers(): + input = ['a.b', 'a.c', 'a.d#1', 'a.d#2'] + expected = [['a', 'b'], ['a', 'c'], ['a', 'd', 1], ['a', 'd', 2]] + assert ItemTools.parse_sheet_headers(input) == expected + + +@pytest.mark.parametrize('parsed_headers,expected_prototype', [ + (['a'], + {'a': None}), + (['a', 'b'], + {'a': None, 'b': None}), + (['a.b', 'a.c', 'a.d#0', 'a.d#1'], + {'a': {'b': None, 'c': None, 'd': [None, None]}}), + (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar'], + {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}]}}), + (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar', 'a.d#1.foo', 'a.d#1.bar'], + {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}, {'foo': None, 'bar': None}]}}), +]) +def test_item_tools_compute_patch_prototype(parsed_headers, expected_prototype): + parsed_headers = ItemTools.parse_sheet_headers(parsed_headers) + assert ItemTools.compute_patch_prototype(parsed_headers) == expected_prototype + + +@pytest.mark.parametrize('headers', [['0'], ['x', '0.y']]) +def test_item_tools_compute_patch_prototype_errors(headers): + + parsed_headers = ItemTools.parse_sheet_headers(headers) + with pytest.raises(LoadTableError) as exc: + ItemTools.compute_patch_prototype(parsed_headers) + assert str(exc.value) == "A header cannot begin with a numeric ref: 0" + + +def test_item_tools_parse_item_value_basic(): + + for x in [37, 19.3, True, False, None, 'simple text']: + assert ItemTools.parse_item_value(x) == x + + assert ItemTools.parse_item_value('3') == 3 + assert ItemTools.parse_item_value('+3') == 3 + assert ItemTools.parse_item_value('-3') == -3 + + assert ItemTools.parse_item_value('3.5') == 3.5 + assert ItemTools.parse_item_value('+3.5') == 3.5 + assert ItemTools.parse_item_value('-3.5') == -3.5 + + assert ItemTools.parse_item_value('3.5e1') == 35.0 + assert ItemTools.parse_item_value('+3.5e1') == 35.0 + assert ItemTools.parse_item_value('-3.5e1') == -35.0 + + assert ItemTools.parse_item_value('') is None + + assert ItemTools.parse_item_value('null') is None + assert ItemTools.parse_item_value('Null') is None + assert ItemTools.parse_item_value('NULL') is None + + assert ItemTools.parse_item_value('true') is True + assert ItemTools.parse_item_value('True') is True + assert ItemTools.parse_item_value('TRUE') is True + + assert ItemTools.parse_item_value('false') is False + assert ItemTools.parse_item_value('False') is False + assert ItemTools.parse_item_value('FALSE') is False + + assert ItemTools.parse_item_value('|') == [] # special case: lone '|' means empty + assert ItemTools.parse_item_value('alpha|') == ['alpha'] # special case: trailing '|' means singleton + assert ItemTools.parse_item_value('|alpha|') == [None, 'alpha'] + assert ItemTools.parse_item_value('|alpha') == [None, 
'alpha'] + assert ItemTools.parse_item_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] + assert ItemTools.parse_item_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] + + +@pytest.mark.parametrize('instaguids_enabled', [True, False]) +def test_item_tools_parse_item_value_guids(instaguids_enabled): + + with local_attrs(ItemTools, INSTAGUIDS_ENABLED=instaguids_enabled): + + sample_simple_field_input = "#foo" + + parsed = ItemTools.parse_item_value(sample_simple_field_input) + assert parsed == sample_simple_field_input + + context = {} + parsed = ItemTools.parse_item_value(sample_simple_field_input, context=context) + if instaguids_enabled: + assert is_uuid(parsed) + assert parsed == context[sample_simple_field_input] + else: + assert parsed == sample_simple_field_input + assert context == {} + + sample_compound_field_input = '#foo|#bar' + sample_compound_field_list = ['#foo', '#bar'] + + parsed = ItemTools.parse_item_value(sample_compound_field_input) + assert parsed == sample_compound_field_list + + context = {} + parsed = ItemTools.parse_item_value(sample_compound_field_input, context=context) + assert isinstance(parsed, list) + if instaguids_enabled: + assert all(is_uuid(x) for x in parsed) + assert '#foo' in context and '#bar' in context + else: + assert parsed == sample_compound_field_list + assert context == {} + + +def test_item_tools_set_path_value(): + + x = {'foo': 1, 'bar': 2} + ItemTools.set_path_value(x, ['foo'], 3) + assert x == {'foo': 3, 'bar': 2} + + x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} + ItemTools.set_path_value(x, ['foo', 1], 17) + assert x == {'foo': [11, 17, 33], 'bar': {'x': 'xx', 'y': 'yy'}} + + x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} + ItemTools.set_path_value(x, ['bar', 'x'], 'something') + assert x == {'foo': [11, 22, 33], 'bar': {'x': 'something', 'y': 'yy'}} + + +def test_item_tools_find_type_hint(): + + assert ItemTools.find_type_hint(None, 'anything') is None + + assert ItemTools.find_type_hint(['foo', 'bar'], None) is None + assert ItemTools.find_type_hint(['foo', 'bar'], "something") is None + assert ItemTools.find_type_hint(['foo', 'bar'], {}) is None + + actual = ItemTools.find_type_hint(['foo', 'bar'], {"type": "object"}) + assert actual is None + + schema = { + "type": "object", + "properties": { + "foo": { + "type": "boolean" + } + } + } + actual = ItemTools.find_type_hint(['foo', 'bar'], schema) + assert actual is None + + actual = ItemTools.find_type_hint(['foo'], schema) + assert isinstance(actual, BoolHint) + + schema = { + "type": "object", + "properties": { + "foo": { + "type": "object", + "properties": { + "bar": { + "type": "boolean" + } + } + } + } + } + actual = ItemTools.find_type_hint(['foo', 'bar'], schema) + assert isinstance(actual, BoolHint) + + actual = ItemTools.find_type_hint(['foo'], schema) + assert actual is None + + +def test_item_manager_registry_manager_for_filename(): + + assert ITEM_MANAGER_REGISTRY.manager_for_filename("xyz/foo.csv") == CsvItemManager + + with pytest.raises(Exception) as exc: + ITEM_MANAGER_REGISTRY.manager_for_filename("xyz/foo.something.missing") + assert str(exc.value) == "Unknown file type: xyz/foo.something.missing" + + +def test_xlsx_item_manager_load_content(): + + it = XlsxItemManager(SAMPLE_XLSX_FILE, autoload_schemas=False) + assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT + + +def test_xlsx_item_manager_load(): + + assert XlsxItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == 
SAMPLE_XLSX_FILE_ITEM_CONTENT + + +def test_xlsx_item_manager_load_csv(): + + with pytest.raises(LoadArgumentsError) as exc: + XlsxItemManager.load(SAMPLE_CSV_FILE) + assert str(exc.value).startswith('The TableSetManager subclass XlsxItemManager' + ' expects only .xlsx filenames:') + + + + +def test_csv_item_manager_load_content(): + + it = CsvItemManager(SAMPLE_CSV_FILE, autoload_schemas=False) + assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT + + +def test_csv_item_manager_load(): + + assert CsvItemManager.load(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT + + +def test_csv_item_manager_load_csv(): + + with pytest.raises(LoadArgumentsError) as exc: + CsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) + assert str(exc.value).startswith('The TableSetManager subclass CsvItemManager' + ' expects only .csv filenames:') + +def test_tsv_item_manager_load_content(): + + it = TsvItemManager(SAMPLE_TSV_FILE, autoload_schemas=False) + assert it.load_content() == SAMPLE_TSV_FILE_ITEM_CONTENT + + +def test_tsv_item_manager_load(): + + assert TsvItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT + + +def test_tsv_item_manager_load_csv(): + + with pytest.raises(LoadArgumentsError) as exc: + TsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) + assert str(exc.value).startswith('The TableSetManager subclass TsvItemManager' + ' expects only .tsv or .tsv.txt filenames:') + + +def test_item_manager_load(): + + assert ItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + assert ItemManager.load(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT + + assert ItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT + + loaded = ItemManager.load(SAMPLE_JSON_TABS_FILE, autoload_schemas=False) + print("loaded=", json.dumps(loaded, indent=2)) + expected = SAMPLE_JSON_TABS_FILE_ITEM_CONTENT + print("expected=", json.dumps(expected, indent=2)) + assert loaded == expected + + with pytest.raises(LoadArgumentsError) as exc: + ItemManager.load("something.else") + assert str(exc.value) == "Unknown file type: something.else" + + +def test_load_items(): + + assert load_items(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + assert load_items(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT + + with pytest.raises(LoadArgumentsError) as exc: + load_items("something.else") + assert str(exc.value) == "Unknown file type: something.else" + + +SAMPLE_CSV_FILE2 = os.path.join(TEST_DIR, 'data_files/sample_items2.csv') + +SAMPLE_CSV_FILE2_SHEET_NAME = infer_tab_name_from_filename(SAMPLE_CSV_FILE2) + +SAMPLE_CSV_FILE2_SCHEMAS = { + "Person": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "sex": {"type": "string", "enum": ["Male", "Female"]}, + "member": {"type": "boolean"} + } + } +} + +SAMPLE_CSV_FILE2_CONTENT = { + SAMPLE_CSV_FILE2_SHEET_NAME: [ + {"name": "john", "sex": "M", "member": "false"}, + {"name": "juan", "sex": "male", "member": "true"}, + {"name": "igor", "sex": "unknown", "member": None}, + {"name": "mary", "sex": "Female", "member": "t"} + ] +} + +SAMPLE_CSV_FILE2_ITEM_CONTENT = { + SAMPLE_CSV_FILE2_SHEET_NAME: [ + {"name": "john", "sex": "M", "member": False}, + {"name": "juan", "sex": "male", "member": True}, + {"name": "igor", "sex": "unknown", "member": None}, + {"name": "mary", "sex": "Female", "member": "t"} + ] +} + 
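A minimal sketch, for orientation only, of how the hinted Person content below is expected to follow from the raw rows above, assuming the BoolHint, EnumHint, and ItemTools.find_type_hint behavior added in bundle_utils.py earlier in this patch:

    from dcicutils.bundle_utils import ItemTools, BoolHint, EnumHint

    schema = SAMPLE_CSV_FILE2_SCHEMAS["Person"]

    sex_hint = ItemTools.find_type_hint(["sex"], schema)        # EnumHint over {'male': 'Male', 'female': 'Female'}
    member_hint = ItemTools.find_type_hint(["member"], schema)  # BoolHint

    assert isinstance(sex_hint, EnumHint) and isinstance(member_hint, BoolHint)

    assert sex_hint.apply_hint("M") == "Male"           # unique prefix of 'male' resolves to the enum value
    assert sex_hint.apply_hint("unknown") == "unknown"  # no enum match, so the value passes through unchanged
    assert member_hint.apply_hint("t") is True          # 'true'.startswith('t')
    assert member_hint.apply_hint("false") is False

So "M" and "t" become "Male" and True in the hinted content, while "unknown" is left as-is.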
+SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED = { + "Person": [ + {"name": "john", "sex": "Male", "member": False}, + {"name": "juan", "sex": "Male", "member": True}, + {"name": "igor", "sex": "unknown", "member": None}, + {"name": "mary", "sex": "Female", "member": True} + ] +} + + +SAMPLE_JSON_FILE2 = os.path.join(TEST_DIR, 'data_files/sample_items2.json') + +SAMPLE_JSON_FILE2_SHEET_NAME = infer_tab_name_from_filename(SAMPLE_JSON_FILE2) + + +SAMPLE_CSV_FILE3_SCHEMAS = { + "Person": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "sex": {"type": "string", "enum": ["Male", "Female"]}, + "children": {"type": "array", "items": {"type": "string"}}, + "parents": {"type": "array", "items": {"type": "string"}}, + "mother": {"type": "string"}, + "father": {"type": "string"}, + } + } +} + +SAMPLE_CSV_FILE3_PERSON_CONTENT_HINTED = { + "Person": [ + { + "name": "John", + "uuid": "#john", + "sex": "Male", + "father": "#igor", + "mother": "#mary", + "parents": None, + "children": None, + }, + { + "name": "Juan", + "uuid": "#juan", + "sex": "Male", + "father": None, + "mother": None, + "parents": ["#igor", "#mary"], + "children": None, + }, + { + "name": "Igor", + "uuid": "#igor", + "sex": "Male", + "father": None, + "mother": None, + "parents": None, + "children": ["#john"], + }, + { + "name": "Mary", + "uuid": "#mary", + "sex": "Female", + "father": None, + "mother": None, + "parents": None, + "children": ["#john"], + }, + ] +} + +SAMPLE_CSV_FILE3 = os.path.join(TEST_DIR, 'data_files/sample_items3.csv') + + +def matches_template(json1: AnyJsonData, json2: AnyJsonData, *, previous_matches: Dict[str, str] = None) -> bool: + if previous_matches is None: + previous_matches = {} + if isinstance(json1, dict) and isinstance(json2, dict): + keys1 = set(json1.keys()) + keys2 = set(json2.keys()) + if keys1 != keys2: + print(f"Keys don't match: {keys1} vs {keys2}") + return False + return all(matches_template(json1[key], json2[key], previous_matches=previous_matches) for key in keys1) + elif isinstance(json1, list) and isinstance(json2, list): + n1 = len(json1) + n2 = len(json2) + if n1 != n2: + print(f"Length doesn't match: {n1} vs {n2}") + return False + return all(matches_template(json1[i], json2[i], previous_matches=previous_matches) for i in range(n1)) + elif isinstance(json1, str) and isinstance(json2, str) and is_uuid(json1) and json2.startswith("#"): + previously_matched = previous_matches.get(json2) + if previously_matched: + result = json1 == previously_matched + if not result: + print(f"Instaguid mismatch: {json1} vs {json2}") + return result + else: + # Remember the match + previous_matches[json2] = json1 + return True + else: # any other atomic items can be just directly compared + result = json1 == json2 + if not result: + print(f"Unequal: {json1} vs {json2}") + return result + + +def test_load_items_with_schema(): + + print("Case 1") + expected = SAMPLE_CSV_FILE2_CONTENT + actual = CsvManager.load(SAMPLE_CSV_FILE2) + assert actual == expected + + print("Case 2") + expected = SAMPLE_CSV_FILE2_ITEM_CONTENT + actual = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS) + assert actual == expected + + print("Case 3") + expected = SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED + actual = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') + assert actual == expected + + +def test_sample_items_csv_vs_json(): + + csv_content = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') + + json_content = 
load_items(SAMPLE_JSON_FILE2, tab_name="Person") + + assert csv_content == json_content + + +def test_sample_items_json_vs_yaml(): + + tabs_data_from_json = load_items(SAMPLE_JSON_TABS_FILE) + tabs_data_from_yaml = load_items(SAMPLE_YAML_TABS_FILE) + assert tabs_data_from_json == tabs_data_from_yaml + + +@pytest.mark.parametrize('instaguids_enabled', [True, False]) +def test_load_items_with_schema_and_instaguids(instaguids_enabled): + + with local_attrs(ItemTools, INSTAGUIDS_ENABLED=instaguids_enabled): + + expected = SAMPLE_CSV_FILE3_PERSON_CONTENT_HINTED + print("expected=", json.dumps(expected, indent=2)) + actual = load_items(SAMPLE_CSV_FILE3, schemas=SAMPLE_CSV_FILE3_SCHEMAS, tab_name='Person') + print("actual=", json.dumps(actual, indent=2)) + if instaguids_enabled: + assert matches_template(actual, expected) + else: + assert actual == expected # no substitution performed + + +class SchemaAutoloaderForTesting(SchemaAutoloadMixin): + + def __init__(self, **kwargs): + super().__init__(filename='ignored.file.name', **kwargs) + + +@contextlib.contextmanager +def schema_autoloader_for_testing(**kwargs) -> SchemaAutoloadMixin: + autoloader: Optional[SchemaAutoloadMixin] = None + success = False + try: + autoloader: SchemaAutoloadMixin = SchemaAutoloaderForTesting(**kwargs) + assert autoloader.SCHEMA_CACHE == {}, "The schema cache is not clean." + yield autoloader + success = True + finally: + if autoloader is not None: + autoloader.clear_schema_cache() + assert autoloader.SCHEMA_CACHE == SchemaAutoloadMixin.SCHEMA_CACHE == {} + if not success: + raise + + +@using_fresh_ff_state_for_testing() +@pytest.mark.integrated +@pytest.mark.parametrize('portal_env', [None, 'data']) +def test_schema_autoload_mixin_caching(portal_env): + + with schema_autoloader_for_testing(portal_env=portal_env) as autoloader: + + assert autoloader.portal_env == 'data' # it should have defaulted even if we didn't supply it + + assert autoloader.SCHEMA_CACHE == SchemaAutoloadMixin.SCHEMA_CACHE == {} + + sample_schema_name = 'foo' + sample_schema = {'mock_schema_for': 'foo'} + + with mock.patch.object(bundle_utils_module, "get_schema") as mock_get_schema: + mock_get_schema.return_value = sample_schema + assert autoloader.fetch_schema(sample_schema_name, portal_env=autoloader.portal_env) == sample_schema + + schema_cache_with_sample_schema = {sample_schema_name: sample_schema} + assert SchemaAutoloadMixin.SCHEMA_CACHE == schema_cache_with_sample_schema + assert autoloader.SCHEMA_CACHE == schema_cache_with_sample_schema + + +@using_fresh_ff_state_for_testing() +@pytest.mark.integrated +@pytest.mark.parametrize('portal_env', [None, 'data']) +def test_schema_autoload_mixin_fetch_schema(portal_env): + + with schema_autoloader_for_testing(portal_env=portal_env) as autoloader: + + assert autoloader.portal_env == 'data' + + user_schema = autoloader.fetch_schema('user', portal_env=autoloader.portal_env) + + assert user_schema['$id'] == '/profiles/user.json' + assert user_schema['title'] == 'User' + assert 'properties' in user_schema + + +@using_fresh_ff_state_for_testing() +@pytest.mark.integrated +@pytest.mark.parametrize('autoload_schemas', [True, False]) +@pytest.mark.parametrize('cache_schemas', [True, False]) +@pytest.mark.parametrize('portal_env', [None, 'data']) +def test_schema_autoload_mixin_fetch_relevant_schemas(autoload_schemas, cache_schemas, portal_env): + + with printed_output() as printed: + with local_attrs(SchemaAutoloadMixin, CACHE_SCHEMAS=cache_schemas): + with 
schema_autoloader_for_testing(portal_env=portal_env, autoload_schemas=autoload_schemas) as autoloader: + + assert autoloader.portal_env == ('data' if autoload_schemas or portal_env else None) + + if autoload_schemas: + + schemas = autoloader.fetch_relevant_schemas(['User', 'Lab']) + assert isinstance(schemas, dict) + assert len(schemas) == 2 + assert set(schemas.keys()) == {'User', 'Lab'} + + else: + + assert autoloader.fetch_relevant_schemas(['User', 'Lab']) == {} + + if portal_env == 'data' or not autoload_schemas: + assert printed.lines == [] + else: + assert printed.lines == [ + "The portal_env was not explicitly supplied. Schemas will come from portal_env='data'." + ] + + +SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_for_real_schemas.csv') + + +@using_fresh_ff_state_for_testing() +@pytest.mark.integrated +def test_workbook_with_schemas(): + + print() # start o a fresh line + + SchemaAutoloadMixin.clear_schema_cache() + + actual_data = CsvManager(filename=SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, tab_name='ExperimentSeq').load_content() + expected_data = { + "ExperimentSeq": [ + { + "accession": "foo", + "fragment_size_selection_method": "spri" + }, + { + "accession": "bar", + "fragment_size_selection_method": "blue" + } + ] + } + assert actual_data == expected_data + + actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, + tab_name='ExperimentSeq', autoload_schemas=True) + expected_items = { + "ExperimentSeq": [ + { + "accession": "foo", + "fragment_size_selection_method": "SPRI beads" + }, + { + "accession": "bar", + "fragment_size_selection_method": "BluePippin" + } + ] + } + assert actual_items == expected_items + + +@using_fresh_ff_state_for_testing() +@pytest.mark.integrated +def test_workbook_with_schemas_and_portal_vapp(): + + print() # start on a fresh line + + SchemaAutoloadMixin.clear_schema_cache() + + portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) + + experiment_seq_schema = ff_utils_module.get_schema('ExperimentSeq', portal_env=portal_env) + + expected_items = { + "ExperimentSeq": [ + { + "accession": "foo", + "fragment_size_selection_method": "SPRI beads" + }, + { + "accession": "bar", + "fragment_size_selection_method": "BluePippin" + } + ] + } + + class MockVapp(NamedObject, AbstractVirtualApp): + + def __init__(self, name): + super().__init__(name=name) + self.call_count = 0 + + def get(self, path_url): + assert path_url.startswith('/profiles/ExperimentSeq.json?') + self.call_count += 1 + response = MockResponse(200, json=experiment_seq_schema) + return response + + portal_vapp = MockVapp(name=f'MockVapp[{portal_env}]') + + old_count = portal_vapp.call_count + + with mock.patch.object(ff_utils_module, "get_authentication_with_server", + mock_not_called("get_authentication_with_server")): + with mock.patch.object(ff_utils_module, "get_metadata", + mock_not_called("get_metadata")): + actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, + tab_name='ExperimentSeq', autoload_schemas=True, portal_vapp=portal_vapp) + + assert portal_vapp.call_count == old_count + 1 + assert actual_items == expected_items diff --git a/test/test_common.py b/test/test_common.py index 35e096256..5921c8096 100644 --- a/test/test_common.py +++ b/test/test_common.py @@ -1,4 +1,10 @@ -from dcicutils.common import EnvName, OrchestratedApp, APP_CGAP, APP_FOURFRONT, APP_SMAHT, ORCHESTRATED_APPS +import csv +import io +import re + +from dcicutils.common import ( + EnvName, OrchestratedApp, APP_CGAP, APP_FOURFRONT, APP_SMAHT, ORCHESTRATED_APPS, 
Regexp, CsvReader +) def test_app_constants(): @@ -8,3 +14,19 @@ def test_app_constants(): # For thexe next two, which are really type hints, just test that they exist. assert EnvName assert OrchestratedApp + + +def test_type_hint_regexp(): + + regexp_string = "x.?y*" + assert not isinstance(regexp_string, Regexp) + assert isinstance(re.compile(regexp_string), Regexp) + + +def test_type_hint_csv_reader(): + + csv_filename = "something.csv" + open_csv_file = io.StringIO("some,csv,data") + assert not isinstance(csv_filename, CsvReader) + assert not isinstance(open_csv_file, CsvReader) + assert isinstance(csv.reader(open_csv_file), CsvReader) diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index b9472a94c..2a580afce 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -1,32 +1,41 @@ -import contextlib +# import contextlib import json import os import pytest from collections import namedtuple -from dcicutils import sheet_utils as sheet_utils_module, ff_utils as ff_utils_module -from dcicutils.common import AnyJsonData -from dcicutils.env_utils import EnvUtils, public_env_name -from dcicutils.misc_utils import is_uuid, local_attrs, NamedObject, AbstractVirtualApp -from dcicutils.qa_utils import printed_output, mock_not_called, MockResponse +# from dcicutils import bundle_utils as bundle_utils_module, ff_utils as ff_utils_module +# from dcicutils.common import AnyJsonData +# from dcicutils.env_utils import EnvUtils, public_env_name +# from dcicutils.misc_utils import is_uuid, local_attrs, NamedObject, AbstractVirtualApp +# from dcicutils.qa_utils import printed_output, mock_not_called, MockResponse +# from dcicutils.bundle_utils import ( +# # High-level interfaces +# ItemManager, load_items, ITEM_MANAGER_REGISTRY, +# # Low-level implementation +# SchemaAutoloadMixin, +# ItemTools, +# XlsxItemManager, +# CsvItemManager, TsvItemManager, +# # TypeHint, EnumHint, +# BoolHint, +# ) from dcicutils.sheet_utils import ( # High-level interfaces - ItemManager, load_items, TABLE_SET_MANAGER_REGISTRY, ITEM_MANAGER_REGISTRY, + TABLE_SET_MANAGER_REGISTRY, # Low-level implementation - BasicTableSetManager, SchemaAutoloadMixin, - ItemTools, XlsxManager, XlsxItemManager, - CsvManager, CsvItemManager, TsvManager, TsvItemManager, - # TypeHint, EnumHint, - BoolHint, + BasicTableSetManager, + XlsxManager, + CsvManager, TsvManager, # Error handling LoadFailure, LoadArgumentsError, LoadTableError, # Utilities - prefer_number, unwanted_kwargs, expand_string_escape_sequences, + prefer_number, unwanted_kwargs, expand_string_escape_sequences, infer_tab_name_from_filename, ) -from typing import Dict, Optional -from unittest import mock +# from typing import Dict, Optional +# from unittest import mock from .conftest_settings import TEST_DIR -from .helpers import using_fresh_ff_state_for_testing +# from .helpers import using_fresh_ff_state_for_testing TEST_SHEET_1 = 'Sheet1' @@ -118,193 +127,11 @@ def test_back_table_set_create_state(): assert BasicTableSetManager._create_tab_processor_state('some-tab') is None -def test_item_tools_parse_sheet_header(): - assert ItemTools.parse_sheet_header('.a') == ['a'] - assert ItemTools.parse_sheet_header('a') == ['a'] - assert ItemTools.parse_sheet_header('#0') == [0] - assert ItemTools.parse_sheet_header('0') == [0] - assert ItemTools.parse_sheet_header('foo.bar') == ['foo', 'bar'] - assert ItemTools.parse_sheet_header('a.b#0') == ['a', 'b', 0] - assert ItemTools.parse_sheet_header('x.xx#17#8.z') == ['x', 'xx', 17, 8, 'z'] +def 
test_infer_tab_name_from_filename(): - # We don't error-check this, but it shouldn't matter - assert ItemTools.parse_sheet_header('#abc') == ['abc'] - assert ItemTools.parse_sheet_header('.123') == [123] - assert ItemTools.parse_sheet_header('#abc.123#456.def') == ['abc', 123, 456, 'def'] - - -def test_item_tools_parse_sheet_headers(): - input = ['a.b', 'a.c', 'a.d#1', 'a.d#2'] - expected = [['a', 'b'], ['a', 'c'], ['a', 'd', 1], ['a', 'd', 2]] - assert ItemTools.parse_sheet_headers(input) == expected - - -def test_item_tools_infer_tab_name(): - - assert ItemTools.infer_tab_name('some/dir/some') == 'some' - assert ItemTools.infer_tab_name('some/dir/some.file') == 'some' - assert ItemTools.infer_tab_name('some/dir/some.file.name') == 'some' - - -@pytest.mark.parametrize('parsed_headers,expected_prototype', [ - (['a'], - {'a': None}), - (['a', 'b'], - {'a': None, 'b': None}), - (['a.b', 'a.c', 'a.d#0', 'a.d#1'], - {'a': {'b': None, 'c': None, 'd': [None, None]}}), - (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar'], - {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}]}}), - (['a.b', 'a.c', 'a.d#0.foo', 'a.d#0.bar', 'a.d#1.foo', 'a.d#1.bar'], - {'a': {'b': None, 'c': None, 'd': [{'foo': None, 'bar': None}, {'foo': None, 'bar': None}]}}), -]) -def test_item_tools_compute_patch_prototype(parsed_headers, expected_prototype): - parsed_headers = ItemTools.parse_sheet_headers(parsed_headers) - assert ItemTools.compute_patch_prototype(parsed_headers) == expected_prototype - - -@pytest.mark.parametrize('headers', [['0'], ['x', '0.y']]) -def test_item_tools_compute_patch_prototype_errors(headers): - - parsed_headers = ItemTools.parse_sheet_headers(headers) - with pytest.raises(LoadTableError) as exc: - ItemTools.compute_patch_prototype(parsed_headers) - assert str(exc.value) == "A header cannot begin with a numeric ref: 0" - - -def test_item_tools_parse_item_value_basic(): - - for x in [37, 19.3, True, False, None, 'simple text']: - assert ItemTools.parse_item_value(x) == x - - assert ItemTools.parse_item_value('3') == 3 - assert ItemTools.parse_item_value('+3') == 3 - assert ItemTools.parse_item_value('-3') == -3 - - assert ItemTools.parse_item_value('3.5') == 3.5 - assert ItemTools.parse_item_value('+3.5') == 3.5 - assert ItemTools.parse_item_value('-3.5') == -3.5 - - assert ItemTools.parse_item_value('3.5e1') == 35.0 - assert ItemTools.parse_item_value('+3.5e1') == 35.0 - assert ItemTools.parse_item_value('-3.5e1') == -35.0 - - assert ItemTools.parse_item_value('') is None - - assert ItemTools.parse_item_value('null') is None - assert ItemTools.parse_item_value('Null') is None - assert ItemTools.parse_item_value('NULL') is None - - assert ItemTools.parse_item_value('true') is True - assert ItemTools.parse_item_value('True') is True - assert ItemTools.parse_item_value('TRUE') is True - - assert ItemTools.parse_item_value('false') is False - assert ItemTools.parse_item_value('False') is False - assert ItemTools.parse_item_value('FALSE') is False - - assert ItemTools.parse_item_value('|') == [] # special case: lone '|' means empty - assert ItemTools.parse_item_value('alpha|') == ['alpha'] # special case: trailing '|' means singleton - assert ItemTools.parse_item_value('|alpha|') == [None, 'alpha'] - assert ItemTools.parse_item_value('|alpha') == [None, 'alpha'] - assert ItemTools.parse_item_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] - assert ItemTools.parse_item_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] - - 
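# Note: the assertions above double as a behavioral spec for how scalar cell values are
# coerced: booleans, null, numbers, and '|'-separated lists.  A minimal standalone
# sketch of that coercion (an approximation for illustration, not the dcicutils
# implementation) might look like this:

def parse_cell(value):
    if not isinstance(value, str):
        return value  # ints, floats, bools, and None pass through untouched
    if '|' in value:
        parts = value.split('|')
        if parts[-1] == '':
            parts.pop()  # a trailing '|' adds no element
        if parts == ['']:
            return []  # a lone '|' means an empty list
        return [parse_cell(part) for part in parts]
    lowered = value.lower()
    if lowered in ('', 'null'):
        return None
    if lowered == 'true':
        return True
    if lowered == 'false':
        return False
    try:
        return int(value)
    except ValueError:
        pass
    try:
        return float(value)
    except ValueError:
        return value

assert parse_cell('3.5e1') == 35.0
assert parse_cell('alpha|') == ['alpha']
assert parse_cell('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5]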
-@pytest.mark.parametrize('instaguids_enabled', [True, False]) -def test_item_tools_parse_item_value_guids(instaguids_enabled): - - with local_attrs(ItemTools, INSTAGUIDS_ENABLED=instaguids_enabled): - - sample_simple_field_input = "#foo" - - parsed = ItemTools.parse_item_value(sample_simple_field_input) - assert parsed == sample_simple_field_input - - context = {} - parsed = ItemTools.parse_item_value(sample_simple_field_input, context=context) - if instaguids_enabled: - assert is_uuid(parsed) - assert parsed == context[sample_simple_field_input] - else: - assert parsed == sample_simple_field_input - assert context == {} - - sample_compound_field_input = '#foo|#bar' - sample_compound_field_list = ['#foo', '#bar'] - - parsed = ItemTools.parse_item_value(sample_compound_field_input) - assert parsed == sample_compound_field_list - - context = {} - parsed = ItemTools.parse_item_value(sample_compound_field_input, context=context) - assert isinstance(parsed, list) - if instaguids_enabled: - assert all(is_uuid(x) for x in parsed) - assert '#foo' in context and '#bar' in context - else: - assert parsed == sample_compound_field_list - assert context == {} - - -def test_item_tools_set_path_value(): - - x = {'foo': 1, 'bar': 2} - ItemTools.set_path_value(x, ['foo'], 3) - assert x == {'foo': 3, 'bar': 2} - - x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} - ItemTools.set_path_value(x, ['foo', 1], 17) - assert x == {'foo': [11, 17, 33], 'bar': {'x': 'xx', 'y': 'yy'}} - - x = {'foo': [11, 22, 33], 'bar': {'x': 'xx', 'y': 'yy'}} - ItemTools.set_path_value(x, ['bar', 'x'], 'something') - assert x == {'foo': [11, 22, 33], 'bar': {'x': 'something', 'y': 'yy'}} - - -def test_item_tools_find_type_hint(): - - assert ItemTools.find_type_hint(None, 'anything') is None - - assert ItemTools.find_type_hint(['foo', 'bar'], None) is None - assert ItemTools.find_type_hint(['foo', 'bar'], "something") is None - assert ItemTools.find_type_hint(['foo', 'bar'], {}) is None - - actual = ItemTools.find_type_hint(['foo', 'bar'], {"type": "object"}) - assert actual is None - - schema = { - "type": "object", - "properties": { - "foo": { - "type": "boolean" - } - } - } - actual = ItemTools.find_type_hint(['foo', 'bar'], schema) - assert actual is None - - actual = ItemTools.find_type_hint(['foo'], schema) - assert isinstance(actual, BoolHint) - - schema = { - "type": "object", - "properties": { - "foo": { - "type": "object", - "properties": { - "bar": { - "type": "boolean" - } - } - } - } - } - actual = ItemTools.find_type_hint(['foo', 'bar'], schema) - assert isinstance(actual, BoolHint) - - actual = ItemTools.find_type_hint(['foo'], schema) - assert actual is None + assert infer_tab_name_from_filename('some/dir/some') == 'some' + assert infer_tab_name_from_filename('some/dir/some.file') == 'some' + assert infer_tab_name_from_filename('some/dir/some.file.name') == 'some' def test_table_set_manager_registry_manager_for_filename(): @@ -315,12 +142,6 @@ def test_table_set_manager_registry_manager_for_filename(): TABLE_SET_MANAGER_REGISTRY.manager_for_filename("xyz/foo.something.missing") assert str(exc.value) == "Unknown file type: xyz/foo.something.missing" - assert ITEM_MANAGER_REGISTRY.manager_for_filename("xyz/foo.csv") == CsvItemManager - - with pytest.raises(Exception) as exc: - ITEM_MANAGER_REGISTRY.manager_for_filename("xyz/foo.something.missing") - assert str(exc.value) == "Unknown file type: xyz/foo.something.missing" - SAMPLE_XLSX_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.xlsx') @@ -378,7 +199,7 
@@ def test_table_set_manager_registry_manager_for_filename(): SAMPLE_CSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.csv') -SAMPLE_CSV_FILE_SHEET_NAME = ItemTools.infer_tab_name(SAMPLE_CSV_FILE) +SAMPLE_CSV_FILE_SHEET_NAME = infer_tab_name_from_filename(SAMPLE_CSV_FILE) SAMPLE_CSV_FILE_RAW_CONTENT = {SAMPLE_CSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} @@ -386,7 +207,7 @@ def test_table_set_manager_registry_manager_for_filename(): SAMPLE_TSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.tsv') -SAMPLE_TSV_FILE_SHEET_NAME = ItemTools.infer_tab_name(SAMPLE_TSV_FILE) +SAMPLE_TSV_FILE_SHEET_NAME = infer_tab_name_from_filename(SAMPLE_TSV_FILE) SAMPLE_TSV_FILE_RAW_CONTENT = {SAMPLE_TSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} @@ -420,25 +241,6 @@ def test_xlsx_manager_load_csv(): ' expects only .xlsx filenames:') -def test_xlsx_item_manager_load_content(): - - it = XlsxItemManager(SAMPLE_XLSX_FILE, autoload_schemas=False) - assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT - - -def test_xlsx_item_manager_load(): - - assert XlsxItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT - - -def test_xlsx_item_manager_load_csv(): - - with pytest.raises(LoadArgumentsError) as exc: - XlsxItemManager.load(SAMPLE_CSV_FILE) - assert str(exc.value).startswith('The TableSetManager subclass XlsxItemManager' - ' expects only .xlsx filenames:') - - def test_csv_manager_load_content(): wt = CsvManager(SAMPLE_CSV_FILE) @@ -458,24 +260,6 @@ def test_csv_manager_load_csv(): ' expects only .csv filenames:') -def test_csv_item_manager_load_content(): - - it = CsvItemManager(SAMPLE_CSV_FILE, autoload_schemas=False) - assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT - - -def test_csv_item_manager_load(): - - assert CsvItemManager.load(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT - - -def test_csv_item_manager_load_csv(): - - with pytest.raises(LoadArgumentsError) as exc: - CsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) - assert str(exc.value).startswith('The TableSetManager subclass CsvItemManager' - ' expects only .csv filenames:') - def test_csv_escaping(): @@ -507,429 +291,3 @@ def test_tsv_manager_load_csv(): ' expects only .tsv or .tsv.txt filenames:') -def test_tsv_item_manager_load_content(): - - it = TsvItemManager(SAMPLE_TSV_FILE, autoload_schemas=False) - assert it.load_content() == SAMPLE_TSV_FILE_ITEM_CONTENT - - -def test_tsv_item_manager_load(): - - assert TsvItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT - - -def test_tsv_item_manager_load_csv(): - - with pytest.raises(LoadArgumentsError) as exc: - TsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) - assert str(exc.value).startswith('The TableSetManager subclass TsvItemManager' - ' expects only .tsv or .tsv.txt filenames:') - - -def test_item_manager_load(): - - assert ItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT - - assert ItemManager.load(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT - - assert ItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT - - loaded = ItemManager.load(SAMPLE_JSON_TABS_FILE, autoload_schemas=False) - print("loaded=", json.dumps(loaded, indent=2)) - expected = SAMPLE_JSON_TABS_FILE_ITEM_CONTENT - print("expected=", json.dumps(expected, indent=2)) - assert loaded == expected - - with 
pytest.raises(LoadArgumentsError) as exc: - ItemManager.load("something.else") - assert str(exc.value) == "Unknown file type: something.else" - - -def test_load_items(): - - assert load_items(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT - - assert load_items(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT - - with pytest.raises(LoadArgumentsError) as exc: - load_items("something.else") - assert str(exc.value) == "Unknown file type: something.else" - - -SAMPLE_CSV_FILE2 = os.path.join(TEST_DIR, 'data_files/sample_items2.csv') - -SAMPLE_CSV_FILE2_SHEET_NAME = ItemTools.infer_tab_name(SAMPLE_CSV_FILE2) - -SAMPLE_CSV_FILE2_SCHEMAS = { - "Person": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "sex": {"type": "string", "enum": ["Male", "Female"]}, - "member": {"type": "boolean"} - } - } -} - -SAMPLE_CSV_FILE2_CONTENT = { - SAMPLE_CSV_FILE2_SHEET_NAME: [ - {"name": "john", "sex": "M", "member": "false"}, - {"name": "juan", "sex": "male", "member": "true"}, - {"name": "igor", "sex": "unknown", "member": None}, - {"name": "mary", "sex": "Female", "member": "t"} - ] -} - -SAMPLE_CSV_FILE2_ITEM_CONTENT = { - SAMPLE_CSV_FILE2_SHEET_NAME: [ - {"name": "john", "sex": "M", "member": False}, - {"name": "juan", "sex": "male", "member": True}, - {"name": "igor", "sex": "unknown", "member": None}, - {"name": "mary", "sex": "Female", "member": "t"} - ] -} - -SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED = { - "Person": [ - {"name": "john", "sex": "Male", "member": False}, - {"name": "juan", "sex": "Male", "member": True}, - {"name": "igor", "sex": "unknown", "member": None}, - {"name": "mary", "sex": "Female", "member": True} - ] -} - - -SAMPLE_JSON_FILE2 = os.path.join(TEST_DIR, 'data_files/sample_items2.json') - -SAMPLE_JSON_FILE2_SHEET_NAME = ItemTools.infer_tab_name(SAMPLE_JSON_FILE2) - - -SAMPLE_CSV_FILE3_SCHEMAS = { - "Person": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "sex": {"type": "string", "enum": ["Male", "Female"]}, - "children": {"type": "array", "items": {"type": "string"}}, - "parents": {"type": "array", "items": {"type": "string"}}, - "mother": {"type": "string"}, - "father": {"type": "string"}, - } - } -} - -SAMPLE_CSV_FILE3_PERSON_CONTENT_HINTED = { - "Person": [ - { - "name": "John", - "uuid": "#john", - "sex": "Male", - "father": "#igor", - "mother": "#mary", - "parents": None, - "children": None, - }, - { - "name": "Juan", - "uuid": "#juan", - "sex": "Male", - "father": None, - "mother": None, - "parents": ["#igor", "#mary"], - "children": None, - }, - { - "name": "Igor", - "uuid": "#igor", - "sex": "Male", - "father": None, - "mother": None, - "parents": None, - "children": ["#john"], - }, - { - "name": "Mary", - "uuid": "#mary", - "sex": "Female", - "father": None, - "mother": None, - "parents": None, - "children": ["#john"], - }, - ] -} - -SAMPLE_CSV_FILE3 = os.path.join(TEST_DIR, 'data_files/sample_items3.csv') - - -def matches_template(json1: AnyJsonData, json2: AnyJsonData, *, previous_matches: Dict[str, str] = None) -> bool: - if previous_matches is None: - previous_matches = {} - if isinstance(json1, dict) and isinstance(json2, dict): - keys1 = set(json1.keys()) - keys2 = set(json2.keys()) - if keys1 != keys2: - print(f"Keys don't match: {keys1} vs {keys2}") - return False - return all(matches_template(json1[key], json2[key], previous_matches=previous_matches) for key in keys1) - elif isinstance(json1, list) and isinstance(json2, list): - n1 = len(json1) - n2 = len(json2) - if 
n1 != n2: - print(f"Length doesn't match: {n1} vs {n2}") - return False - return all(matches_template(json1[i], json2[i], previous_matches=previous_matches) for i in range(n1)) - elif isinstance(json1, str) and isinstance(json2, str) and is_uuid(json1) and json2.startswith("#"): - previously_matched = previous_matches.get(json2) - if previously_matched: - result = json1 == previously_matched - if not result: - print(f"Instaguid mismatch: {json1} vs {json2}") - return result - else: - # Remember the match - previous_matches[json2] = json1 - return True - else: # any other atomic items can be just directly compared - result = json1 == json2 - if not result: - print(f"Unequal: {json1} vs {json2}") - return result - - -def test_load_items_with_schema(): - - print("Case 1") - expected = SAMPLE_CSV_FILE2_CONTENT - actual = CsvManager.load(SAMPLE_CSV_FILE2) - assert actual == expected - - print("Case 2") - expected = SAMPLE_CSV_FILE2_ITEM_CONTENT - actual = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS) - assert actual == expected - - print("Case 3") - expected = SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED - actual = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') - assert actual == expected - - -def test_sample_items_csv_vs_json(): - - csv_content = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') - - json_content = load_items(SAMPLE_JSON_FILE2, tab_name="Person") - - assert csv_content == json_content - - -def test_sample_items_json_vs_yaml(): - - tabs_data_from_json = load_items(SAMPLE_JSON_TABS_FILE) - tabs_data_from_yaml = load_items(SAMPLE_YAML_TABS_FILE) - assert tabs_data_from_json == tabs_data_from_yaml - - -@pytest.mark.parametrize('instaguids_enabled', [True, False]) -def test_load_items_with_schema_and_instaguids(instaguids_enabled): - - with local_attrs(ItemTools, INSTAGUIDS_ENABLED=instaguids_enabled): - - expected = SAMPLE_CSV_FILE3_PERSON_CONTENT_HINTED - print("expected=", json.dumps(expected, indent=2)) - actual = load_items(SAMPLE_CSV_FILE3, schemas=SAMPLE_CSV_FILE3_SCHEMAS, tab_name='Person') - print("actual=", json.dumps(actual, indent=2)) - if instaguids_enabled: - assert matches_template(actual, expected) - else: - assert actual == expected # no substitution performed - - -class SchemaAutoloaderForTesting(SchemaAutoloadMixin): - - def __init__(self, **kwargs): - super().__init__(filename='ignored.file.name', **kwargs) - - -@contextlib.contextmanager -def schema_autoloader_for_testing(**kwargs) -> SchemaAutoloadMixin: - autoloader: Optional[SchemaAutoloadMixin] = None - success = False - try: - autoloader: SchemaAutoloadMixin = SchemaAutoloaderForTesting(**kwargs) - assert autoloader.SCHEMA_CACHE == {}, "The schema cache is not clean." 
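# Note: the SCHEMA_CACHE assertions in this test harness rely on the cache being a
# class-level dict, so the mixin, its subclasses, and their instances all see (and
# clear) the very same object.  A tiny self-contained sketch of that sharing, using
# hypothetical names rather than the real dcicutils classes:

class _CacheMixin:
    CACHE = {}  # one dict object, reached through the class attribute


class _Loader(_CacheMixin):
    pass


_Loader().CACHE['user'] = {'title': 'User'}  # mutates the shared class-level dict
assert _CacheMixin.CACHE == {'user': {'title': 'User'}}
_CacheMixin.CACHE.clear()  # clearing it anywhere clears it for every subclass and instance
assert _Loader().CACHE == {}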
- yield autoloader - success = True - finally: - if autoloader is not None: - autoloader.clear_schema_cache() - assert autoloader.SCHEMA_CACHE == SchemaAutoloadMixin.SCHEMA_CACHE == {} - if not success: - raise - - -@using_fresh_ff_state_for_testing() -@pytest.mark.integrated -@pytest.mark.parametrize('portal_env', [None, 'data']) -def test_schema_autoload_mixin_caching(portal_env): - - with schema_autoloader_for_testing(portal_env=portal_env) as autoloader: - - assert autoloader.portal_env == 'data' # it should have defaulted even if we didn't supply it - - assert autoloader.SCHEMA_CACHE == SchemaAutoloadMixin.SCHEMA_CACHE == {} - - sample_schema_name = 'foo' - sample_schema = {'mock_schema_for': 'foo'} - - with mock.patch.object(sheet_utils_module, "get_schema") as mock_get_schema: - mock_get_schema.return_value = sample_schema - assert autoloader.fetch_schema(sample_schema_name, portal_env=autoloader.portal_env) == sample_schema - - schema_cache_with_sample_schema = {sample_schema_name: sample_schema} - assert SchemaAutoloadMixin.SCHEMA_CACHE == schema_cache_with_sample_schema - assert autoloader.SCHEMA_CACHE == schema_cache_with_sample_schema - - -@using_fresh_ff_state_for_testing() -@pytest.mark.integrated -@pytest.mark.parametrize('portal_env', [None, 'data']) -def test_schema_autoload_mixin_fetch_schema(portal_env): - - with schema_autoloader_for_testing(portal_env=portal_env) as autoloader: - - assert autoloader.portal_env == 'data' - - user_schema = autoloader.fetch_schema('user', portal_env=autoloader.portal_env) - - assert user_schema['$id'] == '/profiles/user.json' - assert user_schema['title'] == 'User' - assert 'properties' in user_schema - - -@using_fresh_ff_state_for_testing() -@pytest.mark.integrated -@pytest.mark.parametrize('autoload_schemas', [True, False]) -@pytest.mark.parametrize('cache_schemas', [True, False]) -@pytest.mark.parametrize('portal_env', [None, 'data']) -def test_schema_autoload_mixin_fetch_relevant_schemas(autoload_schemas, cache_schemas, portal_env): - - with printed_output() as printed: - with local_attrs(SchemaAutoloadMixin, CACHE_SCHEMAS=cache_schemas): - with schema_autoloader_for_testing(portal_env=portal_env, autoload_schemas=autoload_schemas) as autoloader: - - assert autoloader.portal_env == ('data' if autoload_schemas or portal_env else None) - - if autoload_schemas: - - schemas = autoloader.fetch_relevant_schemas(['User', 'Lab']) - assert isinstance(schemas, dict) - assert len(schemas) == 2 - assert set(schemas.keys()) == {'User', 'Lab'} - - else: - - assert autoloader.fetch_relevant_schemas(['User', 'Lab']) == {} - - if portal_env == 'data' or not autoload_schemas: - assert printed.lines == [] - else: - assert printed.lines == [ - "The portal_env was not explicitly supplied. Schemas will come from portal_env='data'." 
- ] - - -SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_for_real_schemas.csv') - - -@using_fresh_ff_state_for_testing() -@pytest.mark.integrated -def test_workbook_with_schemas(): - - print() # start o a fresh line - - SchemaAutoloadMixin.clear_schema_cache() - - actual_data = CsvManager(filename=SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, tab_name='ExperimentSeq').load_content() - expected_data = { - "ExperimentSeq": [ - { - "accession": "foo", - "fragment_size_selection_method": "spri" - }, - { - "accession": "bar", - "fragment_size_selection_method": "blue" - } - ] - } - assert actual_data == expected_data - - actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, - tab_name='ExperimentSeq', autoload_schemas=True) - expected_items = { - "ExperimentSeq": [ - { - "accession": "foo", - "fragment_size_selection_method": "SPRI beads" - }, - { - "accession": "bar", - "fragment_size_selection_method": "BluePippin" - } - ] - } - assert actual_items == expected_items - - -@using_fresh_ff_state_for_testing() -@pytest.mark.integrated -def test_workbook_with_schemas_and_portal_vapp(): - - print() # start on a fresh line - - SchemaAutoloadMixin.clear_schema_cache() - - portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) - - experiment_seq_schema = ff_utils_module.get_schema('ExperimentSeq', portal_env=portal_env) - - expected_items = { - "ExperimentSeq": [ - { - "accession": "foo", - "fragment_size_selection_method": "SPRI beads" - }, - { - "accession": "bar", - "fragment_size_selection_method": "BluePippin" - } - ] - } - - class MockVapp(NamedObject, AbstractVirtualApp): - - def __init__(self, name): - super().__init__(name=name) - self.call_count = 0 - - def get(self, path_url): - assert path_url.startswith('profiles/ExperimentSeq.json?') - self.call_count += 1 - response = MockResponse(200, json=experiment_seq_schema) - return response - - portal_vapp = MockVapp(name=f'MockVapp[{portal_env}]') - - old_count = portal_vapp.call_count - - with mock.patch.object(ff_utils_module, "get_authentication_with_server", - mock_not_called("get_authentication_with_server")): - with mock.patch.object(ff_utils_module, "get_metadata", - mock_not_called("get_metadata")): - actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, - tab_name='ExperimentSeq', autoload_schemas=True, portal_vapp=portal_vapp) - - assert portal_vapp.call_count == old_count + 1 - assert actual_items == expected_items From a7aac440c8b601f0137b55aeb060391d20045469 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 12 Sep 2023 05:44:16 -0400 Subject: [PATCH 059/101] Mostly PEP8 --- dcicutils/bundle_utils.py | 5 +-- dcicutils/sheet_utils.py | 72 ++++++++++++++++++++++++++++----------- test/test_bundle_utils.py | 20 +---------- test/test_sheet_utils.py | 22 ------------ 4 files changed, 57 insertions(+), 62 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index d6d35d00d..2462dedb6 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -10,7 +10,7 @@ from .lang_utils import there_are from .misc_utils import AbstractVirtualApp, ignored, PRINT from .sheet_utils import ( - Header, Headers, ParsedHeader, ParsedHeaders, SheetCellValue, SheetRow, TabbedSheetData, # SheetData, + Header, Headers, ParsedHeader, ParsedHeaders, SheetCellValue, SheetRow, TabbedSheetData, # SheetData, prefer_number, LoadTableError, TableSetManagerRegistry, AbstractTableSetManager, BasicTableSetManager, @@ -500,4 +500,5 @@ def load(cls, filename: str, tab_name: Optional[str] = 
None, escaping: Optional[ **kwargs) return manager.load_content() -load_items = ItemManager.load \ No newline at end of file + +load_items = ItemManager.load diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 41a130ab0..aab9a080f 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -1,3 +1,5 @@ +import contextlib + import chardet # import contextlib # import copy @@ -8,18 +10,19 @@ import openpyxl import os import re +import subprocess # import uuid import yaml from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook -from tempfile import TemporaryFile +from tempfile import TemporaryFile, TemporaryDirectory from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union from .common import AnyJsonData, Regexp # from .env_utils import public_env_name, EnvUtils # from .ff_utils import get_schema from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize # , there_are -from .misc_utils import ignored, pad_to, JsonLinesReader # , PRINT, AbstractVirtualApp +from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix # , PRINT, AbstractVirtualApp # from .task_utils import pmap @@ -125,9 +128,6 @@ def open_unicode_text_input_file_respecting_byte_order_mark(filename): return io.open(filename, 'r', encoding=use_encoding) - - - # TODO: Consider whether this might want to be an abstract base class. Some change might be needed. # # Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class. @@ -325,7 +325,6 @@ def manager_for_special_filename(self, filename: str) -> Optional[Type[AbstractT TABLE_SET_MANAGER_REGISTRY = TableSetManagerRegistry() - @TABLE_SET_MANAGER_REGISTRY.register() class XlsxManager(SemanticTableSetManager): """ @@ -480,9 +479,6 @@ class SimpleYamlInsertsManager(SimpleInsertsMixin, YamlInsertsMixin, InsertsMana ALLOWED_FILE_EXTENSIONS = [".yaml"] - - - @TABLE_SET_MANAGER_REGISTRY.register() class SimpleJsonLinesInsertsManager(SimpleInsertsMixin, InsertsManager): @@ -492,8 +488,6 @@ def _parse_inserts_data(self, filename: str) -> AnyJsonData: return [line for line in JsonLinesReader(open_unicode_text_input_file_respecting_byte_order_mark(filename))] - - @TABLE_SET_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") class InsertsDirectoryManager(InsertsManager): @@ -515,8 +509,6 @@ def _parse_inserts_data(self, filename: str) -> AnyJsonData: return data - - @TABLE_SET_MANAGER_REGISTRY.register() class CsvManager(SingleTableMixin, SemanticTableSetManager): """ @@ -570,8 +562,6 @@ def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> A for i, cell_text in enumerate(row_data)} - - @TABLE_SET_MANAGER_REGISTRY.register() class TsvManager(CsvManager): """ @@ -585,6 +575,47 @@ def _get_reader_agent_for_filename(cls, filename) -> CsvReader: return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') +def do_shell_command(command, cwd=None): + # This might need to be more elaborate, but hopefully it will do for now. -kmp 11-Sep-2023 + subprocess.check_output(command, cwd=cwd) + + +@contextlib.contextmanager +def maybe_unpack(filename): + """ + If necessary, unpack a file that is zipped and/or tarred, yielding the name of the file (unpacked or not). 
+ """ + if not os.path.exists(filename): + raise ValueError(f"The file {filename!r} does not exist.") + unpackables = ['.tar.gz', '.tar', '.tgz', '.gz', '.zip'] + ext = None + for unpackable in unpackables: + if filename.endswith(unpackable): + ext = unpackable + break + if not ext: + yield filename + return + target_base_part = remove_suffix(ext, os.path.basename(filename), required=True) + target_ext = '.tar.gz' if ext == '.tgz' else ext + with TemporaryDirectory() as temp_dir: + temp_base = os.path.join(temp_dir, target_base_part) + temp_filename = temp_base + target_ext + do_shell_command(['cp', filename, temp_filename]) + if temp_filename.endswith('.gz'): + do_shell_command(['gunzip', temp_filename], cwd=temp_dir) + temp_filename = remove_suffix('.gz', temp_filename) + elif temp_filename.endswith(".zip"): + do_shell_command(['unzip', temp_filename], cwd=temp_dir) + temp_filename = remove_suffix('.zip', temp_filename) + if temp_filename.endswith(".tar"): + do_shell_command(['tar', '-xf', temp_filename], cwd=temp_dir) + tar_file = temp_filename + temp_filename = remove_suffix(".tar", temp_filename, required=True) + if not os.path.isdir(temp_filename): + raise Exception(f"{tar_file} didn't unpack to a dir: {temp_filename}") + # print(f"Unpacked {filename} to {temp_filename}") + yield temp_filename class TableSetManager(AbstractTableSetManager): @@ -593,8 +624,11 @@ class TableSetManager(AbstractTableSetManager): (See more detailed description in AbstractTableManager.) """ + COMPRESSION_EXTENSIONS = ['.gz', '.tgz', '.tar.gz'] + @classmethod def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractTableSetManager: + reader_agent_class = TABLE_SET_MANAGER_REGISTRY.manager_for_filename(filename) # This is a bad forward reference in current refactor, but also may be testing for something we don't need # to worry about anymore. 
-kmp 11-Sep-2023 @@ -610,12 +644,12 @@ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[ """ Given a filename and various options """ - manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, **kwargs) - return manager.load_content() - + with maybe_unpack(filename) as filename: + manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, + **kwargs) + return manager.load_content() load_table_set = TableSetManager.load - diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index 48c8fb2af..66bb589bd 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -167,7 +167,6 @@ def test_item_tools_parse_item_value_guids(instaguids_enabled): def test_item_tools_set_path_value(): - x = {'foo': 1, 'bar': 2} ItemTools.set_path_value(x, ['foo'], 3) assert x == {'foo': 3, 'bar': 2} @@ -182,7 +181,6 @@ def test_item_tools_set_path_value(): def test_item_tools_find_type_hint(): - assert ItemTools.find_type_hint(None, 'anything') is None assert ItemTools.find_type_hint(['foo', 'bar'], None) is None @@ -227,7 +225,6 @@ def test_item_tools_find_type_hint(): def test_item_manager_registry_manager_for_filename(): - assert ITEM_MANAGER_REGISTRY.manager_for_filename("xyz/foo.csv") == CsvItemManager with pytest.raises(Exception) as exc: @@ -236,57 +233,47 @@ def test_item_manager_registry_manager_for_filename(): def test_xlsx_item_manager_load_content(): - it = XlsxItemManager(SAMPLE_XLSX_FILE, autoload_schemas=False) assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT def test_xlsx_item_manager_load(): - assert XlsxItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT def test_xlsx_item_manager_load_csv(): - with pytest.raises(LoadArgumentsError) as exc: XlsxItemManager.load(SAMPLE_CSV_FILE) assert str(exc.value).startswith('The TableSetManager subclass XlsxItemManager' ' expects only .xlsx filenames:') - - def test_csv_item_manager_load_content(): - it = CsvItemManager(SAMPLE_CSV_FILE, autoload_schemas=False) assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT def test_csv_item_manager_load(): - assert CsvItemManager.load(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT def test_csv_item_manager_load_csv(): - with pytest.raises(LoadArgumentsError) as exc: CsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) assert str(exc.value).startswith('The TableSetManager subclass CsvItemManager' ' expects only .csv filenames:') -def test_tsv_item_manager_load_content(): +def test_tsv_item_manager_load_content(): it = TsvItemManager(SAMPLE_TSV_FILE, autoload_schemas=False) assert it.load_content() == SAMPLE_TSV_FILE_ITEM_CONTENT def test_tsv_item_manager_load(): - assert TsvItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT def test_tsv_item_manager_load_csv(): - with pytest.raises(LoadArgumentsError) as exc: TsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) assert str(exc.value).startswith('The TableSetManager subclass TsvItemManager' @@ -294,11 +281,8 @@ def test_tsv_item_manager_load_csv(): def test_item_manager_load(): - assert ItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT - assert ItemManager.load(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT - assert ItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT loaded = 
ItemManager.load(SAMPLE_JSON_TABS_FILE, autoload_schemas=False) @@ -313,9 +297,7 @@ def test_item_manager_load(): def test_load_items(): - assert load_items(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT - assert load_items(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT with pytest.raises(LoadArgumentsError) as exc: diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 2a580afce..a20f6bb8c 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -42,7 +42,6 @@ def test_load_failure(): - sample_message = "This is a test." load_failure_object = LoadFailure(sample_message) @@ -51,7 +50,6 @@ def test_load_failure(): def test_load_argument_error(): - sample_message = "This is a test." load_failure_object = LoadArgumentsError(sample_message) @@ -60,7 +58,6 @@ def test_load_argument_error(): def test_load_table_error(): - sample_message = "This is a test." load_failure_object = LoadTableError(sample_message) @@ -69,7 +66,6 @@ def test_load_table_error(): def test_prefer_number(): - assert prefer_number('') is None assert prefer_number('123') == 123 assert prefer_number('3.14') == 3.14 @@ -83,7 +79,6 @@ def test_prefer_number(): def test_expand_string_escape_sequences(): - assert expand_string_escape_sequences("foo") == "foo" assert expand_string_escape_sequences("foo\\tbar") == "foo\tbar" assert expand_string_escape_sequences("\\r\\t\\n\\\\") == "\r\t\n\\" @@ -116,26 +111,22 @@ def test_unwanted_kwargs_without_error(): message="Function foo doesn't use keyword arguments a=1 and b=2."), # noQA PyCharm can't see defaults ]) def test_unwanted_kwargs_with_error(context, context_plural, detailed, kwargs, message): - with pytest.raises(LoadArgumentsError) as exc: unwanted_kwargs(context=context, kwargs=kwargs, context_plural=context_plural, detailed=detailed) assert str(exc.value) == message def test_back_table_set_create_state(): - assert BasicTableSetManager._create_tab_processor_state('some-tab') is None def test_infer_tab_name_from_filename(): - assert infer_tab_name_from_filename('some/dir/some') == 'some' assert infer_tab_name_from_filename('some/dir/some.file') == 'some' assert infer_tab_name_from_filename('some/dir/some.file.name') == 'some' def test_table_set_manager_registry_manager_for_filename(): - assert TABLE_SET_MANAGER_REGISTRY.manager_for_filename("xyz/foo.csv") == CsvManager with pytest.raises(Exception) as exc: @@ -223,18 +214,15 @@ def test_table_set_manager_registry_manager_for_filename(): def test_xlsx_manager_load_content(): - wt = XlsxManager(SAMPLE_XLSX_FILE) assert wt.load_content() == SAMPLE_XLSX_FILE_RAW_CONTENT def test_xlsx_manager_load(): - assert XlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT def test_xlsx_manager_load_csv(): - with pytest.raises(LoadArgumentsError) as exc: XlsxManager.load(SAMPLE_CSV_FILE) assert str(exc.value).startswith('The TableSetManager subclass XlsxManager' @@ -242,27 +230,22 @@ def test_xlsx_manager_load_csv(): def test_csv_manager_load_content(): - wt = CsvManager(SAMPLE_CSV_FILE) assert wt.load_content() == SAMPLE_CSV_FILE_RAW_CONTENT def test_csv_manager_load(): - assert CsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT def test_csv_manager_load_csv(): - with pytest.raises(LoadArgumentsError) as exc: CsvManager.load(SAMPLE_XLSX_FILE) assert str(exc.value).startswith('The TableSetManager subclass CsvManager' ' expects only .csv filenames:') - def test_csv_escaping(): - actual = CsvManager.load("test/data_files/escaping.csv", 
escaping=False) expected = json.load(open("test/data_files/escaping-false.json")) assert actual == expected @@ -273,21 +256,16 @@ def test_csv_escaping(): def test_tsv_manager_load_content(): - wt = TsvManager(SAMPLE_TSV_FILE) assert wt.load_content() == SAMPLE_TSV_FILE_RAW_CONTENT def test_tsv_manager_load(): - assert TsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_RAW_CONTENT def test_tsv_manager_load_csv(): - with pytest.raises(LoadArgumentsError) as exc: TsvManager.load(SAMPLE_XLSX_FILE) assert str(exc.value).startswith('The TableSetManager subclass TsvManager' ' expects only .tsv or .tsv.txt filenames:') - - From 54c51aaa5bd95a559b0c9e44379e54f638dc3c8b Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 12 Sep 2023 06:00:55 -0400 Subject: [PATCH 060/101] Add support for zipped files. --- dcicutils/sheet_utils.py | 71 +++++++++++++++++++++++++++++++++++----- pyproject.toml | 2 +- test/test_sheet_utils.py | 2 +- 3 files changed, 64 insertions(+), 11 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index a3c6e02d5..5a311f7c0 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -8,18 +8,19 @@ import openpyxl import os import re +import subprocess import uuid import yaml from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook -from tempfile import TemporaryFile +from tempfile import TemporaryFile, TemporaryDirectory from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union from .common import AnyJsonData from .env_utils import public_env_name, EnvUtils from .ff_utils import get_schema from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize, there_are -from .misc_utils import ignored, PRINT, pad_to, JsonLinesReader, AbstractVirtualApp +from .misc_utils import ignored, PRINT, pad_to, JsonLinesReader, AbstractVirtualApp, remove_suffix from .task_utils import pmap @@ -1011,6 +1012,53 @@ class TsvItemManager(ItemManagerMixin, TsvManager): pass +def _do_shell_command(command, cwd=None): + # This might need to be more elaborate, but hopefully it will do for now. -kmp 11-Sep-2023 + subprocess.check_output(command, cwd=cwd) + + +@contextlib.contextmanager +def maybe_unpack(filename): # Maybe move to another module + """ + If necessary, unpack a file that is zipped and/or tarred, yielding the name of the file (unpacked or not). + """ + unpackables = ['.tar.gz', '.tar', '.tgz', '.gz', '.zip'] + ext = None + for unpackable in unpackables: + if filename.endswith(unpackable): + ext = unpackable + break + if not ext: + yield filename + return + if not os.path.exists(filename): + # We don't bother to raise this error if we're not planning to do any unpacking. + # The caller can decide if/when such errors are needed in that case. + # But if we are going to have to move bits around, they'll need to actually be there. 
+ # -kmp 12-Sep-2023 + raise ValueError(f"The file {filename!r} does not exist.") + target_base_part = remove_suffix(ext, os.path.basename(filename), required=True) + target_ext = '.tar.gz' if ext == '.tgz' else ext + with TemporaryDirectory() as temp_dir: + temp_base = os.path.join(temp_dir, target_base_part) + temp_filename = temp_base + target_ext + _do_shell_command(['cp', filename, temp_filename]) + if temp_filename.endswith('.gz'): + _do_shell_command(['gunzip', temp_filename], cwd=temp_dir) + temp_filename = remove_suffix('.gz', temp_filename) + elif temp_filename.endswith(".zip"): + _do_shell_command(['unzip', temp_filename], cwd=temp_dir) + temp_filename = remove_suffix('.zip', temp_filename) + if temp_filename.endswith(".tar"): + _do_shell_command(['tar', '-xf', temp_filename], cwd=temp_dir) + tar_file = temp_filename + temp_filename = remove_suffix(".tar", temp_filename, required=True) + if not os.path.isdir(temp_filename): + raise Exception(f"{tar_file} didn't unpack to a dir: {temp_filename}") + # print(f"Unpacked {filename} to {temp_filename}") + yield temp_filename + + class TableSetManager(AbstractTableSetManager): """ This class will open a .xlsx or .csv file and load its content in our standard format. @@ -1031,8 +1079,10 @@ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[ """ Given a filename and various options """ - manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, **kwargs) - return manager.load_content() + with maybe_unpack(filename) as filename: + manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, + **kwargs) + return manager.load_content() class ItemManager(AbstractTableSetManager): @@ -1067,11 +1117,14 @@ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[ :param portal_env: A portal to consult to find schemas (usually if calling from the outside of a portal). :param portal_vapp: A vapp to use (usually if calling from within a portal). 
""" - manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, - schemas=schemas, autoload_schemas=autoload_schemas, - portal_env=portal_env, portal_vapp=portal_vapp, - **kwargs) - return manager.load_content() + + with maybe_unpack(filename) as filename: + + manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, + schemas=schemas, autoload_schemas=autoload_schemas, + portal_env=portal_env, portal_vapp=portal_vapp, + **kwargs) + return manager.load_content() load_table_set = TableSetManager.load diff --git a/pyproject.toml b/pyproject.toml index d210a3e41..846624504 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.11.0.1b8" # to become "7.12.0" +version = "7.11.0.1b9" # to become "7.12.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index b9472a94c..ed312bf21 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -915,7 +915,7 @@ def __init__(self, name): self.call_count = 0 def get(self, path_url): - assert path_url.startswith('profiles/ExperimentSeq.json?') + assert path_url.startswith('/profiles/ExperimentSeq.json?') self.call_count += 1 response = MockResponse(200, json=experiment_seq_schema) return response From a19e8e32a3db3f7290f5fad5ffc8aeaea0b827f6 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 12 Sep 2023 06:36:58 -0400 Subject: [PATCH 061/101] Add bundle_utils to autodoc. --- docs/source/dcicutils.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst index 8481da6a7..19569fa0c 100644 --- a/docs/source/dcicutils.rst +++ b/docs/source/dcicutils.rst @@ -23,6 +23,13 @@ beanstalk_utils :members: +bundle_utils +^^^^^^^^^^^^ + +.. 
automodule:: dcicutils.bundle_utils + :members: + + codebuild_utils ^^^^^^^^^^^^^^^ From 36de089d37e36d9a31a96d29468e9917fa075c96 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 23 Sep 2023 09:08:46 -0400 Subject: [PATCH 062/101] Added sheet_utils and glacier_utils changes from 7.11.0.1b9 --- dcicutils/glacier_utils.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/dcicutils/glacier_utils.py b/dcicutils/glacier_utils.py index 7609ab316..bbcf77893 100644 --- a/dcicutils/glacier_utils.py +++ b/dcicutils/glacier_utils.py @@ -58,10 +58,6 @@ def __init__(self, env_name: str): self.env_key = self.key_manager.get_keydict_for_env(env_name) self.health_page = get_health_page(key=self.env_key, ff_env=env_name) - @property - def kms_key_id(self) -> str: - return self.health_page.get("s3_encrypt_key_id", "") - @classmethod def is_glacier_storage_class(cls, storage_class: S3StorageClass): return storage_class in S3_GLACIER_CLASSES @@ -299,9 +295,6 @@ def _do_multipart_upload(self, bucket: str, key: str, total_size: int, part_size } if tags: cmu['Tagging'] = tags - if self.kms_key_id: - cmu['ServerSideEncryption'] = 'aws:kms' - cmu['SSEKMSKeyId'] = self.kms_key_id mpu = self.s3.create_multipart_upload(**cmu) mpu_upload_id = mpu['UploadId'] except Exception as e: @@ -388,21 +381,16 @@ def copy_object_back_to_original_location(self, bucket: str, key: str, storage_c else: # Force copy the object into standard in a single operation copy_source = {'Bucket': bucket, 'Key': key} - copy_args = { + copy_target = { 'Bucket': bucket, 'Key': key, 'StorageClass': storage_class, } if version_id: copy_source['VersionId'] = version_id - copy_args['CopySourceVersionId'] = version_id + copy_target['CopySourceVersionId'] = version_id if tags: - copy_args['Tagging'] = tags - if self.kms_key_id: - copy_args['ServerSideEncryption'] = 'aws:kms' - copy_args['SSEKMSKeyId'] = self.kms_key_id - response = self.s3.copy_object( - **copy_args, CopySource=copy_source - ) + copy_target['Tagging'] = tags + response = self.s3.copy_object(CopySource=copy_source, **copy_target) PRINT(f'Response from boto3 copy:\n{response}') PRINT(f'Object {bucket}/{key} copied back to its original location in S3') return response From 13d40f3f0da9b6dd41304de4a2bb100cf9d492d9 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 23 Sep 2023 09:11:41 -0400 Subject: [PATCH 063/101] Added sheet_utils and glacier_utils changes from 7.11.0.1b9 --- poetry.lock | 38 +++++++++++++++++++------------------- pyproject.toml | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/poetry.lock b/poetry.lock index 81cd1d452..4a74465e6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -45,17 +45,17 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.28.52" +version = "1.28.53" description = "The AWS SDK for Python" optional = false python-versions = ">= 3.7" files = [ - {file = "boto3-1.28.52-py3-none-any.whl", hash = "sha256:1d36db102517d62c6968b3b0636303241f56859d12dd071def4882fc6e030b20"}, - {file = "boto3-1.28.52.tar.gz", hash = "sha256:a34fc153cb2f6fb2f79a764286c967392e8aae9412381d943bddc576c4f7631a"}, + {file = "boto3-1.28.53-py3-none-any.whl", hash = "sha256:dc2da9aff7de359774030a243a09b74568664117e2afb77c6e4b90572ae3a6c3"}, + {file = "boto3-1.28.53.tar.gz", hash = "sha256:b95b0cc39f08402029c3a2bb141e1775cfa46576ebe9f9916f79bde90e27f53f"}, ] [package.dependencies] -botocore = ">=1.31.52,<1.32.0" +botocore = ">=1.31.53,<1.32.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.6.0,<0.7.0" @@ -64,13 
+64,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "boto3-stubs" -version = "1.28.52" -description = "Type annotations for boto3 1.28.52 generated with mypy-boto3-builder 7.19.0" +version = "1.28.53" +description = "Type annotations for boto3 1.28.53 generated with mypy-boto3-builder 7.19.0" optional = false python-versions = ">=3.7" files = [ - {file = "boto3-stubs-1.28.52.tar.gz", hash = "sha256:12d7e5865aeec52e1f73b935b1c6a42e61325538fc2cb83a87a83e41e9485241"}, - {file = "boto3_stubs-1.28.52-py3-none-any.whl", hash = "sha256:3ea81a225e062f3bcb205467891086ea031519697ad54622e61251b52609b8d6"}, + {file = "boto3-stubs-1.28.53.tar.gz", hash = "sha256:453fb59aae740be06ac7baedfcdcaeb6644ac6a4f1382cb6a6c529ba1a94d9a2"}, + {file = "boto3_stubs-1.28.53-py3-none-any.whl", hash = "sha256:ea9341e0864df79385a72e1d5f2f24a79ad5d4a8fe33154dc82bf53f14752b04"}, ] [package.dependencies] @@ -114,7 +114,7 @@ backup-gateway = ["mypy-boto3-backup-gateway (>=1.28.0,<1.29.0)"] backupstorage = ["mypy-boto3-backupstorage (>=1.28.0,<1.29.0)"] batch = ["mypy-boto3-batch (>=1.28.0,<1.29.0)"] billingconductor = ["mypy-boto3-billingconductor (>=1.28.0,<1.29.0)"] -boto3 = ["boto3 (==1.28.52)", "botocore (==1.31.52)"] +boto3 = ["boto3 (==1.28.53)", "botocore (==1.31.53)"] braket = ["mypy-boto3-braket (>=1.28.0,<1.29.0)"] budgets = ["mypy-boto3-budgets (>=1.28.0,<1.29.0)"] ce = ["mypy-boto3-ce (>=1.28.0,<1.29.0)"] @@ -440,13 +440,13 @@ xray = ["mypy-boto3-xray (>=1.28.0,<1.29.0)"] [[package]] name = "botocore" -version = "1.31.52" +version = "1.31.53" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">= 3.7" files = [ - {file = "botocore-1.31.52-py3-none-any.whl", hash = "sha256:46b0a75a38521aa6a75fddccb1542e002930e609d4e13516f40fef170d32e515"}, - {file = "botocore-1.31.52.tar.gz", hash = "sha256:6d09881c5a8be34b497872ca3936f8757d886a6f42f2a8703411928189cfedc0"}, + {file = "botocore-1.31.53-py3-none-any.whl", hash = "sha256:aa647f94039d21de97c969df21ce8c5186b68234eb5c53148f0d8bbd708e375d"}, + {file = "botocore-1.31.53.tar.gz", hash = "sha256:905580ea724d74f11652bab63fcec6bf0d32f1cf8b2963f7388efc0ea406b69b"}, ] [package.dependencies] @@ -459,13 +459,13 @@ crt = ["awscrt (==0.16.26)"] [[package]] name = "botocore-stubs" -version = "1.31.52" +version = "1.31.53" description = "Type annotations and code completion for botocore" optional = false python-versions = ">=3.7,<4.0" files = [ - {file = "botocore_stubs-1.31.52-py3-none-any.whl", hash = "sha256:11431ac0faa35cad6deed6e87002f312f8ea6358d8106a241b60bcead2f84279"}, - {file = "botocore_stubs-1.31.52.tar.gz", hash = "sha256:2fa9b78c7a335a94d918079773dc3198786de741586187d847a8710b9c337009"}, + {file = "botocore_stubs-1.31.53-py3-none-any.whl", hash = "sha256:40cab7fdb56d4a33329a1a548f428fd89a2a470e0b6262931ed32b2752e7fb89"}, + {file = "botocore_stubs-1.31.53.tar.gz", hash = "sha256:020ede076d740da52fcc9ead8b0641fb6c5573f1d7c17d61b9d9d144b39905f8"}, ] [package.dependencies] @@ -865,20 +865,20 @@ smmap = ">=3.0.1,<6" [[package]] name = "gitpython" -version = "3.1.36" +version = "3.1.37" description = "GitPython is a Python library used to interact with Git repositories" optional = false python-versions = ">=3.7" files = [ - {file = "GitPython-3.1.36-py3-none-any.whl", hash = "sha256:8d22b5cfefd17c79914226982bb7851d6ade47545b1735a9d010a2a4c26d8388"}, - {file = "GitPython-3.1.36.tar.gz", hash = "sha256:4bb0c2a6995e85064140d31a33289aa5dce80133a23d36fcd372d716c54d3ebf"}, + {file = "GitPython-3.1.37-py3-none-any.whl", hash 
= "sha256:5f4c4187de49616d710a77e98ddf17b4782060a1788df441846bddefbb89ab33"}, + {file = "GitPython-3.1.37.tar.gz", hash = "sha256:f9b9ddc0761c125d5780eab2d64be4873fc6817c2899cbcb34b02344bdc7bc54"}, ] [package.dependencies] gitdb = ">=4.0.1,<5" [package.extras] -test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mypy", "pre-commit", "pytest", "pytest-cov", "pytest-sugar", "virtualenv"] +test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mypy", "pre-commit", "pytest", "pytest-cov", "pytest-sugar"] [[package]] name = "idna" diff --git a/pyproject.toml b/pyproject.toml index ac8a71e1e..8cb3e449f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.12.0.1b5" # TODO: To become 8.0.0 +version = "7.12.0.2b1" # TODO: To become 8.0.0 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 23a87a7e203f8785c830992e34173ea855ec83be Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 23 Sep 2023 09:31:15 -0400 Subject: [PATCH 064/101] Update dcicutils to Python 3.11 WITH sheet_utils --- dcicutils/sheet_utils.py | 1131 ++++++++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 1132 insertions(+), 1 deletion(-) create mode 100644 dcicutils/sheet_utils.py diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py new file mode 100644 index 000000000..5a311f7c0 --- /dev/null +++ b/dcicutils/sheet_utils.py @@ -0,0 +1,1131 @@ +import chardet +import contextlib +import copy +import csv +import glob +import io +import json +import openpyxl +import os +import re +import subprocess +import uuid +import yaml + +from openpyxl.worksheet.worksheet import Worksheet +from openpyxl.workbook.workbook import Workbook +from tempfile import TemporaryFile, TemporaryDirectory +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union +from .common import AnyJsonData +from .env_utils import public_env_name, EnvUtils +from .ff_utils import get_schema +from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize, there_are +from .misc_utils import ignored, PRINT, pad_to, JsonLinesReader, AbstractVirtualApp, remove_suffix +from .task_utils import pmap + + +Header = str +Headers = List[str] +ParsedHeader = List[Union[str, int]] +ParsedHeaders = List[ParsedHeader] +SheetCellValue = Union[int, float, str] +SheetRow = List[SheetCellValue] +CsvReader = type(csv.reader(TemporaryFile())) +SheetData = List[dict] +TabbedSheetData = Dict[str, SheetData] +Regexp = type(re.compile("sample")) + + +class LoadFailure(Exception): + """ + In general, we'd prefer to load up the spreadsheet with clumsy data that can then be validated in detail, + but some errors are so confusing or so problematic that we need to just fail the load right away. + """ + pass + + +class LoadArgumentsError(LoadFailure): + """ + Errors of this class represent situations where we can't get started because + there's a problem with the given arguments. + """ + pass + + +class LoadTableError(LoadFailure): + """ + Errors of this class represent situations where we can't get started because + there's a problem with some table's syntax, for example headers that don't make sense. 
+ """ + pass + + +@contextlib.contextmanager +def deferred_problems(): + problems = [] + + def note_problems(problem): + problems.append(problem) + + yield note_problems + + if problems: + for problem in problems: + PRINT(f"Problem: {problem}") + raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False)) + + +def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): + if kwargs: + unwanted = [f"{argname}={value!r}" if detailed else argname + for argname, value in kwargs.items() + if value is not None] + if unwanted: + does_not = "don't" if context_plural else "doesn't" + raise LoadArgumentsError(f"{context} {does_not} use" + f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.") + + +def prefer_number(value: SheetCellValue): + if isinstance(value, str): # the given value might be an int or float, in which case just fall through + if not value: + return None + value = value + ch0 = value[0] + if ch0 == '+' or ch0 == '-' or ch0.isdigit(): + try: + return int(value) + except Exception: + pass + try: + return float(value) + except Exception: + pass + # If we couldn't parse it as an int or float, fall through to returning the original value + pass + return value + + +def expand_string_escape_sequences(text: str) -> str: + s = io.StringIO() + escaping = False + for ch in text: + if escaping: + if ch == 'r': + s.write('\r') + elif ch == 't': + s.write('\t') + elif ch == 'n': + s.write('\n') + elif ch == '\\': + s.write('\\') + else: + # Rather than err, just leave other sequences as-is. + s.write(f"\\{ch}") + escaping = False + elif ch == '\\': + escaping = True + else: + s.write(ch) + return s.getvalue() + + +def open_unicode_text_input_file_respecting_byte_order_mark(filename): + """ + Opens a file for text input, respecting a byte-order mark (BOM). + """ + with io.open(filename, 'rb') as fp: + leading_bytes = fp.read(4 * 8) # 4 bytes is all we need + bom_info = chardet.detect(leading_bytes, should_rename_legacy=True) + detected_encoding = bom_info and bom_info.get('encoding') # tread lightly + use_encoding = 'utf-8' if detected_encoding == 'ascii' else detected_encoding + return io.open(filename, 'r', encoding=use_encoding) + + +class TypeHint: + def apply_hint(self, value): + return value + + def __str__(self): + return f"<{self.__class__.__name__}>" + + def __repr__(self): + return self.__str__() + + +class BoolHint(TypeHint): + + def apply_hint(self, value): + if isinstance(value, str) and value: + if 'true'.startswith(value.lower()): + return True + elif 'false'.startswith(value.lower()): + return False + return super().apply_hint(value) + + +class EnumHint(TypeHint): + + def __str__(self): + return f"" + + def __init__(self, value_map): + self.value_map = value_map + + def apply_hint(self, value): + if isinstance(value, str): + if value in self.value_map: + result = self.value_map[value] + return result + else: + lvalue = value.lower() + found = [] + for lkey, key in self.value_map.items(): + if lkey.startswith(lvalue): + found.append(lkey) + if len(found) == 1: + [only_found] = found + result = self.value_map[only_found] + return result + return super().apply_hint(value) + + +OptionalTypeHints = List[Optional[TypeHint]] + + +class ItemTools: + """ + Implements operations on table-related data without pre-supposing the specific representation of the table. 
+ It is assumed this can be used for data that was obtained from .json, .csv, .tsv, and .xlsx files because + it does not presuppose the source of the data nor where it will be written to. + + For the purpose of this class: + + * a 'header' is a string representing the top of a column. + + * a 'parsed header' is a list of strings and/or ints, after splitting at uses of '#' or '.', so that + "a.b.c" is represented as ["a", "b", "c"], and "x.y#0" is represented as ["x", "y", 0], and representing + each numeric token as an int instead of a string. + + * a 'headers' object is just a list of strings, each of which is a 'header'. + + * a 'parsed headers' object is a non-empty list of lists, each of which is a 'parsed header'. + e..g., the headers ["a.b.c", "x.y#0"] is represented as parsed hearders [["a", "b", "c"], ["x", "y", 0]]. + + """ + + @classmethod + def parse_sheet_header(cls, header: Header) -> ParsedHeader: + result = [] + token = "" + for i in range(len(header)): + ch = header[i] + if ch == '.' or ch == '#': + if token: + result.append(int(token) if token.isdigit() else token) + token = "" + else: + token += ch + if token: + result.append(int(token) if token.isdigit() else token) + return result + + @classmethod + def parse_sheet_headers(cls, headers: Headers): + return [cls.parse_sheet_header(header) + for header in headers] + + @classmethod + def compute_patch_prototype(cls, parsed_headers: ParsedHeaders): + prototype = {} + for parsed_header in parsed_headers: + parsed_header0 = parsed_header[0] + if isinstance(parsed_header0, int): + raise LoadTableError(f"A header cannot begin with a numeric ref: {parsed_header0}") + cls.assure_patch_prototype_shape(parent=prototype, keys=parsed_header) + return prototype + + @classmethod + def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: ParsedHeader): + [key0, *more_keys] = keys + key1 = more_keys[0] if more_keys else None + if isinstance(key1, int): + placeholder = [] + elif isinstance(key1, str): + placeholder = {} + else: + placeholder = None + if isinstance(key0, int): + n = len(parent) + if key0 == n: + parent.append(placeholder) + elif key0 > n: + raise LoadTableError("Numeric items must occur sequentially.") + elif isinstance(key0, str): + if key0 not in parent: + parent[key0] = placeholder + if key1 is not None: + cls.assure_patch_prototype_shape(parent=parent[key0], keys=more_keys) + return parent + + INSTAGUIDS_ENABLED = False # Experimental feature not enabled by default + + @classmethod + def parse_item_value(cls, value: SheetCellValue, context=None) -> AnyJsonData: + # TODO: Remodularize this for easier testing and more Schema-driven effect + # Doug asks that this be broken up into different mechanisms, more modular and separately testable. + # I pretty much agree with that. I'm just waiting for suggestions on what kinds of features are desired. 
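+        # Roughly, the conversions applied below are:
+        #   'true' / 'false' (any case) -> True / False; 'null' or '' -> None;
+        #   'a|b|c' -> ['a', 'b', 'c'], with '|' alone meaning [] and a trailing '|' wrapping a single item;
+        #   '#name' -> an instaguid, but only when INSTAGUIDS_ENABLED and a context is supplied;
+        #   any other string goes through prefer_number, which may yield an int, a float, or the string itself.
+        # Non-string values (already numbers) are returned unchanged.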
+ if isinstance(value, str): + lvalue = value.lower() + # TODO: We could consult a schema to make this less heuristic, but this may do for now + if lvalue == 'true': + return True + elif lvalue == 'false': + return False + elif lvalue == 'null' or lvalue == '': + return None + elif '|' in value: + if value == '|': # Use '|' for [] + return [] + else: + if value.endswith("|"): # Use 'foo|' for ['foo'] + value = value[:-1] + return [cls.parse_item_value(subvalue, context=context) for subvalue in value.split('|')] + elif cls.INSTAGUIDS_ENABLED and context is not None and value.startswith('#'): + # Note that this clause MUST follow '|' clause above so '#foo|#bar' isn't seen as instaguid + return cls.get_instaguid(value, context=context) + else: + # Doug points out that the schema might not agree, might want a string representation of a number. + # At this semantic layer, this might be a bad choice. + return prefer_number(value) + else: # presumably a number (int or float) + return value + + @classmethod + def get_instaguid(cls, guid_placeholder: str, *, context: Optional[Dict] = None): + if context is None: + return guid_placeholder + else: + referent = context.get(guid_placeholder) + if not referent: + context[guid_placeholder] = referent = str(uuid.uuid4()) + return referent + + @classmethod + def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): + if (value is None or value == '') and not force: + return + [key, *more_path] = path + if not more_path: + datum[key] = value + else: + cls.set_path_value(datum[key], more_path, value) + + @classmethod + def find_type_hint(cls, parsed_header: Optional[ParsedHeader], schema: Any): + + def finder(subheader, subschema): + if not parsed_header: + return None + else: + [key1, *other_headers] = subheader + if isinstance(key1, str) and isinstance(subschema, dict): + if subschema.get('type') == 'object': + def1 = subschema.get('properties', {}).get(key1) + if not other_headers: + if def1 is not None: + t = def1.get('type') + if t == 'string': + enum = def1.get('enum') + if enum: + mapping = {e.lower(): e for e in enum} + return EnumHint(mapping) + elif t == 'boolean': + return BoolHint() + else: + pass # fall through to asking super() + else: + pass # fall through to asking super() + else: + return finder(subheader=other_headers, subschema=def1) + + return finder(subheader=parsed_header, subschema=schema) + + @classmethod + def infer_tab_name(cls, filename): + return os.path.basename(filename).split('.')[0] + + +# TODO: Consider whether this might want to be an abstract base class. Some change might be needed. +# +# Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class. +# I am less certain but open to discussion. Among other things, as implemented now, +# the __init__ method here needs to run and the documentation says that ABC's won't appear +# in the method resolution order. -kmp 17-Aug-2023 +# See also discussion at https://github.com/4dn-dcic/utils/pull/276#discussion_r1297775535 +class AbstractTableSetManager: + """ + The TableSetManager is the spanning class of anything that wants to be able to load a table set, + regardless of what it wants to load it from. 
To do this, it must support a load method + that takes a filename and returns the file content in the form: + { + "Sheet1": [ + {...representation of row1 as some kind of dict...}, + {...representation of row2 as some kind of dict...} + ], + "Sheet2": [...], + ..., + } + It also needs some implementation of the .tab_names property. + Note that at this level of abstraction, we take no position on what form of representation is used + for the rows, as long as it is JSON data of some kind. It might be + {"col1": "val1", "col2": "val2", ...} + or it might be something more structured like + {"something": "val1", {"something_else": ["val2"]}} + Additionally, the values stored might be altered as well. In particular, the most likely alteration + is to turn "123" to 123 or "" to None, though the specifics of whether and how such transformations + happen is not constrained by this class. + """ + + ALLOWED_FILE_EXTENSIONS: List[str] = [] + + def __init__(self, filename: str, **kwargs): + self.filename: str = filename + unwanted_kwargs(context=self.__class__.__name__, kwargs=kwargs) + + # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.) + @classmethod + def load(cls, filename: str, **kwargs) -> TabbedSheetData: + """ + Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data. + For more information, see documentation of AbstractTableSetManager. + """ + raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.") # noQA + + @property + def tab_names(self) -> List[str]: + raise NotImplementedError(f".tab_names is not implemented for {self.__class__.__name__}..") # noQA + + def load_content(self) -> Any: + raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") # noQA + + +class BasicTableSetManager(AbstractTableSetManager): + """ + A BasicTableManager provides some structure that most kinds of parsers will need. + In particular, everything will likely need some way of storing headers and some way of storing content + of each sheet. Even a csv file, which doesn't have multiple tabs can be seen as the degenerate case + of this where there's only one set of headers and only one block of content. + """ + + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) + self.headers_by_tab_name: Dict[str, Headers] = {} + self.content_by_tab_name: Dict[str, SheetData] = {} + self.reader_agent: Any = self._get_reader_agent() + + def tab_headers(self, tab_name: str) -> Headers: + return self.headers_by_tab_name[tab_name] + + def tab_content(self, tab_name: str) -> List[AnyJsonData]: + return self.content_by_tab_name[tab_name] + + @classmethod + def _create_tab_processor_state(cls, tab_name: str) -> Any: + """ + This method provides for the possibility that some parsers will want auxiliary state, + (such as parsed headers or a line count or a table of temporary names for objects to cross-link + or some other such feature) that it carries with it as it moves from line to line parsing things. + Subclasses might therefore want to make this do something more interesting. 
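+        The base implementation here returns None; subclasses such as CsvManager (which returns the headers)
+        and ItemManagerMixin (which returns parsed headers plus type hints) override it with richer per-tab state.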
+ """ + ignored(tab_name) # subclasses might need this, but we don't + return None + + def _get_reader_agent(self) -> Any: + """This function is responsible for opening the workbook and returning a workbook object.""" + raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") # noQA + + +class SemanticTableSetManager(BasicTableSetManager): + """ + This is the base class for all workbook-like data sources, i.e., that may need to apply semantic processing. + Those may be: + * Excel workbook readers (.xlsx) + * Comma-separated file readers (.csv) + * Tab-separarated file readers (.tsv in most of the world, but Microsoft stupidly calls this .txt, outright + refusing to write a .tsv file, so many people seem to compromise and call this .tsv.txt) + There are two levels to each of these: a class that is not semantically interpreted, + and a class that is semantically interpreted as an "item". + + This is NOT a parent class of these kinds of files, which we always take literally as if semantic processing + were already done (in part so that they can be used to test the results of other formats): + * Json files + * Yaml files + * Inserts directories + * JsonLines files + """ + + @classmethod + def load(cls, filename: str, **kwargs) -> AnyJsonData: + if cls.ALLOWED_FILE_EXTENSIONS: + if not any(filename.lower().endswith(suffix) for suffix in cls.ALLOWED_FILE_EXTENSIONS): + raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only" + f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}") + + table_set_manager: SemanticTableSetManager = cls(filename=filename, **kwargs) + return table_set_manager.load_content() + + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) + + def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]: + """ + Given a tab_name and a state (returned by _sheet_loader_state), return a generator for a set of row values. + """ + raise NotImplementedError(f"._rows_for_tab_name(...) is not implemented for {self.__class__.__name__}.") # noQA + + def _process_row(self, tab_name: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData: + """ + This needs to take a state and whatever represents a row and + must return a list of objects representing column values. + What constitutes a processed up to the class, but other than that the result must be a JSON dictionary. + """ + raise NotImplementedError(f"._process_row(...) 
is not implemented for {self.__class__.__name__}.") # noQA + + def load_content(self) -> AnyJsonData: + for tab_name in self.tab_names: + sheet_content = [] + state = self._create_tab_processor_state(tab_name) + for row_data in self._raw_row_generator_for_tab_name(tab_name): + processed_row_data: AnyJsonData = self._process_row(tab_name, state, row_data) + sheet_content.append(processed_row_data) + self.content_by_tab_name[tab_name] = sheet_content + return self.content_by_tab_name + + @classmethod + def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: + return prefer_number(value) + + +class AbstractItemManager(AbstractTableSetManager): + + pass + + +class TableSetManagerRegistry: + + def __init__(self): + self.manager_table: Dict[str, Type[AbstractTableSetManager]] = {} + self.regexp_mappings: List[Tuple[Regexp, Type[AbstractTableSetManager]]] = [] + + def register(self, regexp: Optional[str] = None): + def _wrapped_register(class_to_register: Type[AbstractTableSetManager]): + if regexp: + self.regexp_mappings.append((re.compile(regexp), class_to_register)) + for ext in class_to_register.ALLOWED_FILE_EXTENSIONS: + existing = self.manager_table.get(ext) + if existing: + raise Exception(f"Tried to define {class_to_register} to extension {ext}," + f" but {existing} already claimed that.") + self.manager_table[ext] = class_to_register + return class_to_register + return _wrapped_register + + register1 = register + + def manager_for_filename(self, filename: str) -> Type[AbstractTableSetManager]: + base: str = os.path.basename(filename) + suffix_parts = base.split('.')[1:] + if suffix_parts: + for i in range(0, len(suffix_parts)): + suffix = f".{'.'.join(suffix_parts[i:])}" + found: Optional[Type[AbstractTableSetManager]] = self.manager_table.get(suffix) + if found: + return found + else: + special_case: Optional[Type[AbstractItemManager]] = self.manager_for_special_filename(filename) + if special_case: + return special_case + raise LoadArgumentsError(f"Unknown file type: {filename}") + + def manager_for_special_filename(self, filename: str) -> Optional[Type[AbstractTableSetManager]]: + for pattern, manager_class in self.regexp_mappings: + if pattern.match(filename): + return manager_class + return None + + +TABLE_SET_MANAGER_REGISTRY = TableSetManagerRegistry() +ITEM_MANAGER_REGISTRY = TableSetManagerRegistry() + + +@TABLE_SET_MANAGER_REGISTRY.register() +class XlsxManager(SemanticTableSetManager): + """ + This implements the mechanism to get a series of rows out of the sheets in an XLSX file. 
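+
+    A minimal usage sketch (the filename is only illustrative):
+
+        XlsxManager.load('workbook.xlsx')  # -> {'Sheet1': [{'col1': value1, ...}, ...], ...}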
+ """ + + ALLOWED_FILE_EXTENSIONS = ['.xlsx'] + + @classmethod + def _all_rows(cls, sheet: Worksheet): + row_max = sheet.max_row + for row in range(2, row_max + 1): + yield row + + @classmethod + def _all_cols(cls, sheet: Worksheet): + col_max = sheet.max_column + for col in range(1, col_max + 1): + yield col + + @property + def tab_names(self) -> List[str]: + return self.reader_agent.sheetnames + + def _get_reader_agent(self) -> Workbook: + return openpyxl.load_workbook(self.filename) + + def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]: + sheet = self.reader_agent[tab_name] + return (self._get_raw_row_content_tuple(sheet, row) + for row in self._all_rows(sheet)) + + def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: + return [sheet.cell(row=row, column=col).value + for col in self._all_cols(sheet)] + + def _create_tab_processor_state(self, tab_name: str) -> Headers: + sheet = self.reader_agent[tab_name] + headers: Headers = [str(sheet.cell(row=1, column=col).value) + for col in self._all_cols(sheet)] + self.headers_by_tab_name[sheet.title] = headers + return headers + + def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(tab_name) + return {headers[i]: self.parse_cell_value(row_datum) + for i, row_datum in enumerate(row_data)} + + +class SchemaAutoloadMixin(AbstractTableSetManager): + + SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. + CACHE_SCHEMAS = True # Controls whether we're doing caching at all + AUTOLOAD_SCHEMAS_DEFAULT = True + + def __init__(self, filename: str, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, + portal_vapp: Optional[AbstractVirtualApp] = None, **kwargs): + # This setup must be in place before the class initialization is done (via the super call). + self.autoload_schemas: bool = self.AUTOLOAD_SCHEMAS_DEFAULT if autoload_schemas is None else autoload_schemas + if self.autoload_schemas: # If autoload_schemas is False, we don't care about doing this defaulting. + if portal_env is None and portal_vapp is None: + portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) + PRINT(f"The portal_env was not explicitly supplied. 
Schemas will come from portal_env={portal_env!r}.") + self.portal_env: Optional[str] = portal_env + self.portal_vapp: Optional[AbstractVirtualApp] = portal_vapp + super().__init__(filename=filename, **kwargs) + + def fetch_relevant_schemas(self, schema_names: List[str]): + # The schema_names argument is not normally given, but it is there for easier testing + def fetch_schema(schema_name): + schema = self.fetch_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) + return schema_name, schema + if self.autoload_schemas and (self.portal_env or self.portal_vapp): + autoloaded = {tab_name: schema + for tab_name, schema in pmap(fetch_schema, schema_names)} + return autoloaded + else: + return {} + + @classmethod + def fetch_schema(cls, schema_name: str, *, portal_env: Optional[str] = None, + portal_vapp: Optional[AbstractVirtualApp] = None): + def just_fetch_it(): + return get_schema(schema_name, portal_env=portal_env, portal_vapp=portal_vapp) + if cls.CACHE_SCHEMAS: + schema: Optional[AnyJsonData] = cls.SCHEMA_CACHE.get(schema_name) + if schema is None: + cls.SCHEMA_CACHE[schema_name] = schema = just_fetch_it() + return schema + else: + return just_fetch_it() + + @classmethod + def clear_schema_cache(cls): + for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first + cls.SCHEMA_CACHE.pop(key, None) + + +class ItemManagerMixin(SchemaAutoloadMixin, AbstractItemManager, BasicTableSetManager): + """ + This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows + get handled like Items instead of just flat table rows. + """ + + def __init__(self, filename: str, schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs): + super().__init__(filename=filename, **kwargs) + self.patch_prototypes_by_tab_name: Dict[str, Dict] = {} + self.parsed_headers_by_tab_name: Dict[str, ParsedHeaders] = {} + self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = {} + self._schemas = schemas + self._instaguid_context_table: Dict[str, str] = {} + + @property + def schemas(self): + schemas = self._schemas + if schemas is None: + self._schemas = schemas = self.fetch_relevant_schemas(self.tab_names) + return schemas + + def sheet_patch_prototype(self, tab_name: str) -> Dict: + return self.patch_prototypes_by_tab_name[tab_name] + + def sheet_parsed_headers(self, tab_name: str) -> ParsedHeaders: + return self.parsed_headers_by_tab_name[tab_name] + + def sheet_type_hints(self, tab_name: str) -> OptionalTypeHints: + return self.type_hints_by_tab_name[tab_name] + + class SheetState: + + def __init__(self, parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints): + self.parsed_headers = parsed_headers + self.type_hints = type_hints + + def _compile_type_hints(self, tab_name: str): + parsed_headers = self.sheet_parsed_headers(tab_name) + schema = self.schemas.get(tab_name) + with deferred_problems() as note_problem: + for required_header in self._schema_required_headers(schema): + if required_header not in parsed_headers: + note_problem("Missing required header") + type_hints = [ItemTools.find_type_hint(parsed_header, schema) if schema else None + for parsed_header in parsed_headers] + self.type_hints_by_tab_name[tab_name] = type_hints + + @classmethod + def _schema_required_headers(cls, schema): + ignored(schema) + return [] # TODO: Make this compute a list of required headers (in parsed header form) + + def _compile_sheet_headers(self, tab_name: str): + headers = self.headers_by_tab_name[tab_name] + 
parsed_headers = ItemTools.parse_sheet_headers(headers) + self.parsed_headers_by_tab_name[tab_name] = parsed_headers + prototype = ItemTools.compute_patch_prototype(parsed_headers) + self.patch_prototypes_by_tab_name[tab_name] = prototype + + def _create_tab_processor_state(self, tab_name: str) -> SheetState: + super()._create_tab_processor_state(tab_name) + # This will create state that allows us to efficiently assign values in the right place on each row + # by setting up a prototype we can copy and then drop values into. + self._compile_sheet_headers(tab_name) + self._compile_type_hints(tab_name) + return self.SheetState(parsed_headers=self.sheet_parsed_headers(tab_name), + type_hints=self.sheet_type_hints(tab_name)) + + def _process_row(self, tab_name: str, state: SheetState, row_data: SheetRow) -> AnyJsonData: + parsed_headers = state.parsed_headers + type_hints = state.type_hints + patch_item = copy.deepcopy(self.sheet_patch_prototype(tab_name)) + for i, value in enumerate(row_data): + parsed_value = self.parse_cell_value(value) + type_hint = type_hints[i] + if type_hint: + parsed_value = type_hint.apply_hint(parsed_value) + ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value) + return patch_item + + def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: + return ItemTools.parse_item_value(value, context=self._instaguid_context_table) + + +@ITEM_MANAGER_REGISTRY.register() +class XlsxItemManager(ItemManagerMixin, XlsxManager): + """ + This layers item-style row processing functionality on an XLSX file. + """ + pass + + +class SingleTableMixin(AbstractTableSetManager): + + def __init__(self, filename: str, tab_name: Optional[str] = None, **kwargs): + self._tab_name = tab_name or ItemTools.infer_tab_name(filename) + super().__init__(filename=filename, **kwargs) + + @property + def tab_names(self) -> List[str]: + return [self._tab_name] + + +class InsertsManager(BasicTableSetManager): # ItemManagerMixin isn't really appropriate here + + ALLOWED_FILE_EXTENSIONS = [] + + def _parse_inserts_data(self, filename: str) -> AnyJsonData: + raise NotImplementedError(f"._parse_inserts_dataa(...) 
is not implemented for {self.__class__.__name__}.") # noQA + + def _load_inserts_data(self, filename: str) -> TabbedSheetData: + data: AnyJsonData = self._parse_inserts_data(filename) + tabbed_inserts: AnyJsonData = self._wrap_inserts_data(filename, data) + if (not isinstance(tabbed_inserts, dict) + or not all(isinstance(tab_name, str) for tab_name in tabbed_inserts.keys()) + or not all(isinstance(content, list) and all(isinstance(item, dict) for item in content) + for content in tabbed_inserts.values())): + raise ValueError(f"Data in {filename} is not of type TabbedSheetData (Dict[str, List[dict]]).") + tabbed_inserts: TabbedSheetData # we've just checked that + return tabbed_inserts + + @classmethod + def _wrap_inserts_data(cls, filename: str, data: AnyJsonData) -> AnyJsonData: + ignored(filename) + return data + + @property + def tab_names(self) -> List[str]: + return list(self.content_by_tab_name.keys()) + + def _get_reader_agent(self) -> Any: + return self + + def load_content(self) -> Dict[str, AnyJsonData]: + data = self._load_inserts_data(self.filename) + for tab_name, tab_content in data.items(): + self.content_by_tab_name[tab_name] = tab_content + if not tab_content: + self.headers_by_tab_name[tab_name] = [] + else: + self.headers_by_tab_name[tab_name] = list(tab_content[0].keys()) + return self.content_by_tab_name + + +class SimpleInsertsMixin(SingleTableMixin): + + def _wrap_inserts_data(self, filename: str, data: AnyJsonData) -> TabbedSheetData: + if (not isinstance(data, list) + or not all(isinstance(item, dict) for item in data)): + raise ValueError(f"Data in {filename} is not of type SheetData (List[dict]).") + return {self._tab_name: data} + + +class JsonInsertsMixin: + + @classmethod + def _parse_inserts_data(cls, filename: str) -> AnyJsonData: + return json.load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + + +@TABLE_SET_MANAGER_REGISTRY.register() +class TabbedJsonInsertsManager(JsonInsertsMixin, InsertsManager): + + ALLOWED_FILE_EXTENSIONS = [".tabs.json"] # If you want them all in one family, use this extension + + +@TABLE_SET_MANAGER_REGISTRY.register() +class SimpleJsonInsertsManager(SimpleInsertsMixin, JsonInsertsMixin, InsertsManager): + + ALLOWED_FILE_EXTENSIONS = [".json"] + + +class YamlInsertsMixin: + + def _parse_inserts_data(self, filename) -> AnyJsonData: + return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + + +@TABLE_SET_MANAGER_REGISTRY.register() +class TabbedYamlInsertsManager(YamlInsertsMixin, InsertsManager): + + ALLOWED_FILE_EXTENSIONS = [".tabs.yaml"] + + def _parse_inserts_data(self, filename) -> AnyJsonData: + return yaml.safe_load(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + + +@TABLE_SET_MANAGER_REGISTRY.register() +class SimpleYamlInsertsManager(SimpleInsertsMixin, YamlInsertsMixin, InsertsManager): + + ALLOWED_FILE_EXTENSIONS = [".yaml"] + + +class InsertsItemMixin(AbstractItemManager): # ItemManagerMixin isn't really appropriate here + """ + This class is used for inserts directories and other JSON-like data that will be literally used as an Item + without semantic pre-processing. In other words, these classes will not be pre-checked for semantic correctness + but instead assumed to have been checked by other means. + """ + + AUTOLOAD_SCHEMAS_DEFAULT = False # Has no effect, but someone might inspect the value. 
+ + def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, + portal_vapp: Optional[AbstractVirtualApp] = None, schemas: Optional[Dict[str, AnyJsonData]] = None, + **kwargs): + ignored(portal_env, portal_vapp) # Would only be used if autoload_schemas was true, and we don't allow that. + if schemas not in [None, {}]: + raise ValueError(f"{self.__class__.__name__} does not allow schemas={schemas!r}.") + if autoload_schemas not in [None, False]: + raise ValueError(f"{self.__class__.__name__} does not allow autoload_schemas={autoload_schemas!r}.") + super().__init__(filename=filename, **kwargs) + + +@ITEM_MANAGER_REGISTRY.register() +class TabbedJsonInsertsItemManager(InsertsItemMixin, TabbedJsonInsertsManager): + pass + + +@ITEM_MANAGER_REGISTRY.register() +class SimpleJsonInsertsItemManager(InsertsItemMixin, SimpleJsonInsertsManager): + pass + + +@ITEM_MANAGER_REGISTRY.register() +class TabbedYamlInsertsItemManager(InsertsItemMixin, TabbedYamlInsertsManager): + pass + + +@ITEM_MANAGER_REGISTRY.register() +class SimpleYamlInsertsItemManager(InsertsItemMixin, SimpleYamlInsertsManager): + pass + + +@TABLE_SET_MANAGER_REGISTRY.register() +class SimpleJsonLinesInsertsManager(SimpleInsertsMixin, InsertsManager): + + ALLOWED_FILE_EXTENSIONS = [".jsonl"] + + def _parse_inserts_data(self, filename: str) -> AnyJsonData: + return [line for line in JsonLinesReader(open_unicode_text_input_file_respecting_byte_order_mark(filename))] + + +@ITEM_MANAGER_REGISTRY.register() +class SimpleJsonLinesInsertsItemManager(InsertsItemMixin, SimpleJsonLinesInsertsManager): + pass + + +@TABLE_SET_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") +class InsertsDirectoryManager(InsertsManager): + + ALLOWED_FILE_EXTENSIONS = [] + + def _parse_inserts_data(self, filename: str) -> AnyJsonData: + if not os.path.isdir(filename): + raise LoadArgumentsError(f"{filename} is not the name of an inserts directory.") + tab_files = glob.glob(os.path.join(filename, "*.json")) + data = {} + for tab_file in tab_files: + tab_content = json.load(open_unicode_text_input_file_respecting_byte_order_mark(tab_file)) + # Here we don't use os.path.splitext because we want to split on the first dot. + # e.g., for foo.bar.baz, return just foo + # this allows names like ExperimentSet.tab.json that might need to use multi-dot suffixes + # for things unrelated to the tab name. + tab_name = os.path.basename(tab_file).split('.')[0] + data[tab_name] = tab_content + return data + + +@ITEM_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") +class InsertsDirectoryItemManager(InsertsItemMixin, InsertsDirectoryManager): + pass + + +@TABLE_SET_MANAGER_REGISTRY.register() +class CsvManager(SingleTableMixin, SemanticTableSetManager): + """ + This implements the mechanism to get a series of rows out of the sheet in a csv file, + returning a result that still looks like there could have been multiple tabs. 
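+
+    A minimal usage sketch (the filename is only illustrative; the single tab name is inferred from it):
+
+        CsvManager.load('items.csv')  # -> {'items': [{'col1': value1, ...}, ...]}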
+ """ + + ALLOWED_FILE_EXTENSIONS = ['.csv'] + + def __init__(self, filename: str, escaping: Optional[bool] = None, **kwargs): + super().__init__(filename=filename, **kwargs) + self.escaping: bool = escaping or False + + def _get_reader_agent(self) -> CsvReader: + return self._get_reader_agent_for_filename(self.filename) + + @classmethod + def _get_reader_agent_for_filename(cls, filename) -> CsvReader: + return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename)) + + PAD_TRAILING_TABS = True + + def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]: + headers = self.tab_headers(tab_name) + n_headers = len(headers) + for row_data in self.reader_agent: + if self.PAD_TRAILING_TABS: + row_data = pad_to(n_headers, row_data, padding='') + yield row_data + + def _create_tab_processor_state(self, tab_name: str) -> Headers: + headers: Optional[Headers] = self.headers_by_tab_name.get(tab_name) + if headers is None: + self.headers_by_tab_name[tab_name] = headers = self.reader_agent.__next__() + return headers + + @classmethod + def _escape_cell_text(cls, cell_text): + if '\\' in cell_text: + return expand_string_escape_sequences(cell_text) + else: + return cell_text + + def _process_row(self, tab_name: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(tab_name) + if self.escaping: + return {headers[i]: self.parse_cell_value(self._escape_cell_text(cell_text)) + for i, cell_text in enumerate(row_data)} + else: + return {headers[i]: self.parse_cell_value(cell_text) + for i, cell_text in enumerate(row_data)} + + +@ITEM_MANAGER_REGISTRY.register() +class CsvItemManager(ItemManagerMixin, CsvManager): + """ + This layers item-style row processing functionality on a CSV file. + """ + pass + + +@TABLE_SET_MANAGER_REGISTRY.register() +class TsvManager(CsvManager): + """ + TSV files are just CSV files with tabs instead of commas as separators. + (We do not presently handle any escaping of strange characters. May need to add handling for backslash escaping.) + """ + ALLOWED_FILE_EXTENSIONS = ['.tsv', '.tsv.txt'] + + @classmethod + def _get_reader_agent_for_filename(cls, filename) -> CsvReader: + return csv.reader(open_unicode_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') + + +@ITEM_MANAGER_REGISTRY.register() +class TsvItemManager(ItemManagerMixin, TsvManager): + """ + This layers item-style row processing functionality on a TSV file. + """ + pass + + +def _do_shell_command(command, cwd=None): + # This might need to be more elaborate, but hopefully it will do for now. -kmp 11-Sep-2023 + subprocess.check_output(command, cwd=cwd) + + +@contextlib.contextmanager +def maybe_unpack(filename): # Maybe move to another module + """ + If necessary, unpack a file that is zipped and/or tarred, yielding the name of the file (unpacked or not). + """ + unpackables = ['.tar.gz', '.tar', '.tgz', '.gz', '.zip'] + ext = None + for unpackable in unpackables: + if filename.endswith(unpackable): + ext = unpackable + break + if not ext: + yield filename + return + if not os.path.exists(filename): + # We don't bother to raise this error if we're not planning to do any unpacking. + # The caller can decide if/when such errors are needed in that case. + # But if we are going to have to move bits around, they'll need to actually be there. 
+ # -kmp 12-Sep-2023 + raise ValueError(f"The file {filename!r} does not exist.") + target_base_part = remove_suffix(ext, os.path.basename(filename), required=True) + target_ext = '.tar.gz' if ext == '.tgz' else ext + with TemporaryDirectory() as temp_dir: + temp_base = os.path.join(temp_dir, target_base_part) + temp_filename = temp_base + target_ext + _do_shell_command(['cp', filename, temp_filename]) + if temp_filename.endswith('.gz'): + _do_shell_command(['gunzip', temp_filename], cwd=temp_dir) + temp_filename = remove_suffix('.gz', temp_filename) + elif temp_filename.endswith(".zip"): + _do_shell_command(['unzip', temp_filename], cwd=temp_dir) + temp_filename = remove_suffix('.zip', temp_filename) + if temp_filename.endswith(".tar"): + _do_shell_command(['tar', '-xf', temp_filename], cwd=temp_dir) + tar_file = temp_filename + temp_filename = remove_suffix(".tar", temp_filename, required=True) + if not os.path.isdir(temp_filename): + raise Exception(f"{tar_file} didn't unpack to a dir: {temp_filename}") + # print(f"Unpacked {filename} to {temp_filename}") + yield temp_filename + + +class TableSetManager(AbstractTableSetManager): + """ + This class will open a .xlsx or .csv file and load its content in our standard format. + (See more detailed description in AbstractTableManager.) + """ + + @classmethod + def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractTableSetManager: + reader_agent_class = TABLE_SET_MANAGER_REGISTRY.manager_for_filename(filename) + if issubclass(reader_agent_class, AbstractItemManager): + raise ValueError(f"TableSetManager unexpectedly found reader agent class {reader_agent_class}.") + reader_agent = reader_agent_class(filename=filename, **kwargs) + return reader_agent + + @classmethod + def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, + **kwargs) -> TabbedSheetData: + """ + Given a filename and various options + """ + with maybe_unpack(filename) as filename: + manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, + **kwargs) + return manager.load_content() + + +class ItemManager(AbstractTableSetManager): + """ + This class will open a .xlsx or .csv file and load its content in our standard format. + (See more detailed description in AbstractTableManager.) + """ + + @classmethod + def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractItemManager: + reader_agent_class: Type[AbstractTableSetManager] = ITEM_MANAGER_REGISTRY.manager_for_filename(filename) + if not issubclass(reader_agent_class, AbstractItemManager): + raise ValueError(f"ItemManager unexpectedly found reader agent class {reader_agent_class}.") + reader_agent_class: Type[AbstractItemManager] + reader_agent = reader_agent_class(filename=filename, **kwargs) + return reader_agent + + @classmethod + def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, + schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None, + portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None, + **kwargs) -> TabbedSheetData: + """ + Given a filename and various options, loads the items associated with that filename. + + :param filename: The name of the file to load. + :param tab_name: For files that lack multiple tabs (such as .csv or .tsv), + the tab name to associate with the data. + :param escaping: Whether to perform escape processing on backslashes. 
+ :param schemas: A set of schemas to use instead of trying to load them. + :param autoload_schemas: Whether to try autoloading schemas. + :param portal_env: A portal to consult to find schemas (usually if calling from the outside of a portal). + :param portal_vapp: A vapp to use (usually if calling from within a portal). + """ + + with maybe_unpack(filename) as filename: + + manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, + schemas=schemas, autoload_schemas=autoload_schemas, + portal_env=portal_env, portal_vapp=portal_vapp, + **kwargs) + return manager.load_content() + + +load_table_set = TableSetManager.load +load_items = ItemManager.load diff --git a/pyproject.toml b/pyproject.toml index 8cb3e449f..1cef34510 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.12.0.2b1" # TODO: To become 8.0.0 +version = "7.12.0.2b2" # TODO: To become 8.0.0 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From ecea2924ab7d939e9ad0128099c958092f523527 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Sat, 23 Sep 2023 09:38:05 -0400 Subject: [PATCH 065/101] Update dcicutils to Python 3.11 WITH sheet_utils --- dcicutils/misc_utils.py | 96 ++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 2 +- 2 files changed, 96 insertions(+), 2 deletions(-) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index cc18f4b19..8ebd991a4 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -9,6 +9,7 @@ import inspect import math import io +import json import os import logging import pytz @@ -191,7 +192,11 @@ class _VirtualAppHelper(webtest.TestApp): pass -class VirtualApp: +class AbstractVirtualApp: + pass + + +class VirtualApp(AbstractVirtualApp): """ Wrapper class for TestApp, to allow custom control over submitting Encoded requests, simulating a number of conditions, including permissions. @@ -1352,6 +1357,25 @@ def capitalize1(s): return s[:1].upper() + s[1:] +""" +Python's UUID ignores all dashes, whereas Postgres is more strict +http://www.postgresql.org/docs/9.2/static/datatype-uuid.html +See also http://www.postgresql.org/docs/9.2/static/datatype-uuid.html +And, anyway, this pattern is what our portals have been doing +for quite a while, so it's the most stable choice for us now. +""" + +uuid_re = re.compile(r'(?i)[{]?(?:[0-9a-f]{4}-?){8}[}]?') + + +def is_uuid(instance): + """ + Predicate returns true for any group of 32 hex characters with optional hyphens every four characters. + We insist on lowercase to make matching faster. See other notes on this design choice above. + """ + return bool(uuid_re.match(instance)) + + def string_list(s): """ Turns a comma-separated list into an actual list, trimming whitespace and ignoring nulls. @@ -2313,3 +2337,73 @@ def parse_in_radix(text: str, *, radix: int): except Exception: pass raise ValueError(f"Unable to parse: {text!r}") + + +def pad_to(target_size: int, data: list, *, padding=None): + """ + This will pad to a given target size, a list of a potentially different actual size, using given padding. 
+ e.g., pad_to(3, [1, 2]) will return [1, 2, None] + """ + actual_size = len(data) + if actual_size < target_size: + data = data + [padding] * (target_size - actual_size) + return data + + +class JsonLinesReader: + + def __init__(self, fp, padded=False, padding=None): + """ + Given an fp (the conventional name for a "file pointer", the thing a call to io.open returns, + this creates an object that can be used to iterate across the lines in the JSON lines file + that the fp is reading from. + + There are two possible formats that this will return. + + For files that contain a series of dictionaries, such as: + {"something": 1, "else": "a"} + {"something": 2, "else": "b"} + ...etc + this will just return thos those dictionaries one-by-one when iterated over. + + The same set of dictionaries will also be yielded by a file containing: + ["something", "else"] + [1, "a"] + [2, "b"] + ...etc + this will just return thos those dictionaries one-by-one when iterated over. + + NOTES: + + * In the second case, shorter lists on subsequent lines return only partial dictionaries. + * In the second case, longer lists on subsequent lines will quietly drop any extra elements. + """ + + self.fp = fp + self.padded: bool = padded + self.padding = padding + self.headers = None # Might change after we see first line + + def __iter__(self): + first_line = True + n_headers = 0 + for raw_line in self.fp: + line = json.loads(raw_line) + if first_line: + first_line = False + if isinstance(line, list): + self.headers = line + n_headers = len(line) + continue + # If length of line is more than we expect, ignore it. Let user put comments beyond our table + # But if length of line is less than we expect, extend the line with None + if self.headers: + if not isinstance(line, list): + raise Exception("If the first line is a list, all lines must be.") + if self.padded and len(line) < n_headers: + line = pad_to(n_headers, line, padding=self.padding) + yield dict(zip(self.headers, line)) + elif isinstance(line, dict): + yield line + else: + raise Exception(f"If the first line is not a list, all lines must be dictionaries: {line!r}") diff --git a/pyproject.toml b/pyproject.toml index 1cef34510..cd4c5b2bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.12.0.2b2" # TODO: To become 8.0.0 +version = "7.12.0.2b3" # TODO: To become 8.0.0 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From cdc46770e498d9660812bb5b90360079c4be36f4 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 26 Sep 2023 08:20:05 -0400 Subject: [PATCH 066/101] Updated boto versions. 
--- poetry.lock | 32 ++++++++++++++++---------------- pyproject.toml | 10 +++++----- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/poetry.lock b/poetry.lock index 4a74465e6..d0564a273 100644 --- a/poetry.lock +++ b/poetry.lock @@ -45,17 +45,17 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.28.53" +version = "1.28.54" description = "The AWS SDK for Python" optional = false python-versions = ">= 3.7" files = [ - {file = "boto3-1.28.53-py3-none-any.whl", hash = "sha256:dc2da9aff7de359774030a243a09b74568664117e2afb77c6e4b90572ae3a6c3"}, - {file = "boto3-1.28.53.tar.gz", hash = "sha256:b95b0cc39f08402029c3a2bb141e1775cfa46576ebe9f9916f79bde90e27f53f"}, + {file = "boto3-1.28.54-py3-none-any.whl", hash = "sha256:3cb2aee317a1b8686e3b23674e4099b8ff7451bd8acc61b9719acff86fa024d1"}, + {file = "boto3-1.28.54.tar.gz", hash = "sha256:22e37d8c4f2d97b5e5c6ccc1d9edc7760717990b0ba8b8ea17a58cc87e57c5c9"}, ] [package.dependencies] -botocore = ">=1.31.53,<1.32.0" +botocore = ">=1.31.54,<1.32.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.6.0,<0.7.0" @@ -64,13 +64,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "boto3-stubs" -version = "1.28.53" -description = "Type annotations for boto3 1.28.53 generated with mypy-boto3-builder 7.19.0" +version = "1.28.54" +description = "Type annotations for boto3 1.28.54 generated with mypy-boto3-builder 7.19.0" optional = false python-versions = ">=3.7" files = [ - {file = "boto3-stubs-1.28.53.tar.gz", hash = "sha256:453fb59aae740be06ac7baedfcdcaeb6644ac6a4f1382cb6a6c529ba1a94d9a2"}, - {file = "boto3_stubs-1.28.53-py3-none-any.whl", hash = "sha256:ea9341e0864df79385a72e1d5f2f24a79ad5d4a8fe33154dc82bf53f14752b04"}, + {file = "boto3-stubs-1.28.54.tar.gz", hash = "sha256:cb6f6a6cca4826c2b5d61b0a3fcf9cb84d7924c120b7e1c670f08e21cc010425"}, + {file = "boto3_stubs-1.28.54-py3-none-any.whl", hash = "sha256:a58a6ebd0767d7aa1f4ab6f3eec93388b20afff5c5b25c8fe8f866d85eda6f67"}, ] [package.dependencies] @@ -114,7 +114,7 @@ backup-gateway = ["mypy-boto3-backup-gateway (>=1.28.0,<1.29.0)"] backupstorage = ["mypy-boto3-backupstorage (>=1.28.0,<1.29.0)"] batch = ["mypy-boto3-batch (>=1.28.0,<1.29.0)"] billingconductor = ["mypy-boto3-billingconductor (>=1.28.0,<1.29.0)"] -boto3 = ["boto3 (==1.28.53)", "botocore (==1.31.53)"] +boto3 = ["boto3 (==1.28.54)", "botocore (==1.31.54)"] braket = ["mypy-boto3-braket (>=1.28.0,<1.29.0)"] budgets = ["mypy-boto3-budgets (>=1.28.0,<1.29.0)"] ce = ["mypy-boto3-ce (>=1.28.0,<1.29.0)"] @@ -440,13 +440,13 @@ xray = ["mypy-boto3-xray (>=1.28.0,<1.29.0)"] [[package]] name = "botocore" -version = "1.31.53" +version = "1.31.54" description = "Low-level, data-driven core of boto 3." 
optional = false python-versions = ">= 3.7" files = [ - {file = "botocore-1.31.53-py3-none-any.whl", hash = "sha256:aa647f94039d21de97c969df21ce8c5186b68234eb5c53148f0d8bbd708e375d"}, - {file = "botocore-1.31.53.tar.gz", hash = "sha256:905580ea724d74f11652bab63fcec6bf0d32f1cf8b2963f7388efc0ea406b69b"}, + {file = "botocore-1.31.54-py3-none-any.whl", hash = "sha256:71fdb337ddcdb6bf378e1211cba9ce754c35f12b1524c7d0c0c147b2310356c7"}, + {file = "botocore-1.31.54.tar.gz", hash = "sha256:c98e78a9490c4166b205f87912b46770e156bfe7d53bae54ccbd49c68a336ec6"}, ] [package.dependencies] @@ -459,13 +459,13 @@ crt = ["awscrt (==0.16.26)"] [[package]] name = "botocore-stubs" -version = "1.31.53" +version = "1.31.54" description = "Type annotations and code completion for botocore" optional = false python-versions = ">=3.7,<4.0" files = [ - {file = "botocore_stubs-1.31.53-py3-none-any.whl", hash = "sha256:40cab7fdb56d4a33329a1a548f428fd89a2a470e0b6262931ed32b2752e7fb89"}, - {file = "botocore_stubs-1.31.53.tar.gz", hash = "sha256:020ede076d740da52fcc9ead8b0641fb6c5573f1d7c17d61b9d9d144b39905f8"}, + {file = "botocore_stubs-1.31.54-py3-none-any.whl", hash = "sha256:6e3e015ba4d2172c6a9bd5ff5131c39c995991559a567dce6c8534884f83ded1"}, + {file = "botocore_stubs-1.31.54.tar.gz", hash = "sha256:96223fdf87a68037dd1ed366ffe28b5b744aca495a365f4535e2179c9a88455c"}, ] [package.dependencies] @@ -1606,4 +1606,4 @@ tests = ["PasteDeploy", "WSGIProxy2", "coverage", "mock", "nose (<1.3.0)", "pyqu [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.12" -content-hash = "fb29c53ae0f29eab1185dee384d1cd3b7dd039ca2cef7a5a38915de0e939a6a1" +content-hash = "6cdcf4e9fab04bbcaa83c3054b61356b4830b55c67e2a362dd1af635ae75a688" diff --git a/pyproject.toml b/pyproject.toml index cd4c5b2bd..df505329a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.12.0.2b3" # TODO: To become 8.0.0 +version = "7.12.0.2b4" # TODO: To become 8.0.0 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" @@ -36,8 +36,8 @@ classifiers = [ [tool.poetry.dependencies] python = ">=3.8,<3.12" -boto3 = "^1.28.51" -botocore = "^1.31.51" +boto3 = "^1.28.54" +botocore = "^1.31.54" # The DCIC portals (cgap-portal and fourfront) are very particular about which ElasticSearch version. # This value is intentionally pinned and must not be changed casually. elasticsearch = "7.13.4" @@ -61,8 +61,8 @@ tqdm = "^4.65.0" [tool.poetry.dev-dependencies] -botocore-stubs = "^1.31.51" -boto3-stubs = "^1.28.51" +boto3-stubs = "^1.28.54" +botocore-stubs = "^1.31.54" coverage = ">=7.2.3" # Loaded manually in GA workflow for coverage because a dependency on 2to3 # in its docopts dependency makes a problem for laoding it here in poetry. -kmp 7-Apr-2023 From e4f846dd4ca80b41b00060edb0d4fbeddfc9e3ac Mon Sep 17 00:00:00 2001 From: David Michaels Date: Tue, 26 Sep 2023 08:23:50 -0400 Subject: [PATCH 067/101] Updated boto versions. 
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index df505329a..96809ea5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.12.0.2b4" # TODO: To become 8.0.0 +version = "7.12.0.2b6" # TODO: To become 8.0.0 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 00f15414f5f25d01b445fc7882f526694ac19a5b Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 28 Sep 2023 07:06:08 -0400 Subject: [PATCH 068/101] Added Python 3.10, 3.11 to pyproject Python list --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 96809ea5b..cbb7db262 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.12.0.2b6" # TODO: To become 8.0.0 +version = "7.12.0.2b7" # TODO: To become 8.0.0 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" @@ -31,6 +31,8 @@ classifiers = [ 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11' ] From 3a73263e4d38a97b69c02adfc7a0e18be2f078d5 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 28 Sep 2023 12:47:51 -0400 Subject: [PATCH 069/101] Fix to qa_utils for application/vnd.software602.filler.form+xml mime type return from mimetypes.guess_type on Ubuntu 22.04. --- dcicutils/qa_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dcicutils/qa_utils.py b/dcicutils/qa_utils.py index 8b440c413..0c9b019ed 100644 --- a/dcicutils/qa_utils.py +++ b/dcicutils/qa_utils.py @@ -2308,6 +2308,10 @@ def get_object(self, Bucket, Key, **kwargs): # noqa - Uppercase argument names "application/json": [".json"], "text/plain": [".txt", ".text"], "binary/octet-stream": [".fo"], + # The below is because on Ubuntu 22.04 (docker), as opposed to 20.04, the mimetypes.guess_type + # function returns this strange value for an argument ending in ".fo"; this manifested as a test + # failure (put_object assert below) in test_s3_utils.test_unzip_s3_to_s3_unit. dmichaels/2023-09-28. + "application/vnd.software602.filler.form+xml": [".fo"], } def put_object(self, *, Bucket, Key, Body, ContentType=None, **kwargs): # noqa - Uppercase argument names are chosen by AWS From cd4dd353c0beff3e8d1f7275b8a0b007fee0dad1 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 28 Sep 2023 12:48:54 -0400 Subject: [PATCH 070/101] Fix to qa_utils for application/vnd.software602.filler.form+xml mime type return from mimetypes.guess_type on Ubuntu 22.04. 
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cbb7db262..a0c152d08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.12.0.2b7" # TODO: To become 8.0.0 +version = "7.12.0.2b8" # TODO: To become 8.0.0 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 860a43341922989d9ac302816b2f1e4a8601b7b6 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 28 Sep 2023 12:51:47 -0400 Subject: [PATCH 071/101] Fix to qa_utils for application/vnd.software602.filler.form+xml mime type return from mimetypes.guess_type on Ubuntu 22.04. --- .github/workflows/main.yml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d9d6dd741..5ec18cdda 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,7 +20,7 @@ jobs: name: Test dcicutils with Python ${{ matrix.python_version }} # The type of runner that the job will run on - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: python_version: [3.8, 3.9, 3.11] diff --git a/pyproject.toml b/pyproject.toml index a0c152d08..93cae5b8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.12.0.2b8" # TODO: To become 8.0.0 +version = "7.12.0.2b9" # TODO: To become 8.0.0 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 87fbcdb88a344bfadbf2b45a32c370f174db9b97 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 28 Sep 2023 12:52:11 -0400 Subject: [PATCH 072/101] Fix to qa_utils for application/vnd.software602.filler.form+xml mime type return from mimetypes.guess_type on Ubuntu 22.04. 
--- poetry.lock | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/poetry.lock b/poetry.lock index d0564a273..ea0071db0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -45,32 +45,32 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.28.54" +version = "1.28.56" description = "The AWS SDK for Python" optional = false python-versions = ">= 3.7" files = [ - {file = "boto3-1.28.54-py3-none-any.whl", hash = "sha256:3cb2aee317a1b8686e3b23674e4099b8ff7451bd8acc61b9719acff86fa024d1"}, - {file = "boto3-1.28.54.tar.gz", hash = "sha256:22e37d8c4f2d97b5e5c6ccc1d9edc7760717990b0ba8b8ea17a58cc87e57c5c9"}, + {file = "boto3-1.28.56-py3-none-any.whl", hash = "sha256:f5fcb27cdbd08ca38d699f2d2e32d96d1d9fab3368c15c6bc326256612d2cfd7"}, + {file = "boto3-1.28.56.tar.gz", hash = "sha256:b927a7ed335d543c33c15fa63f1076f3fa8422959771c2187da74bc4395ab6e3"}, ] [package.dependencies] -botocore = ">=1.31.54,<1.32.0" +botocore = ">=1.31.56,<1.32.0" jmespath = ">=0.7.1,<2.0.0" -s3transfer = ">=0.6.0,<0.7.0" +s3transfer = ">=0.7.0,<0.8.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "boto3-stubs" -version = "1.28.54" -description = "Type annotations for boto3 1.28.54 generated with mypy-boto3-builder 7.19.0" +version = "1.28.56" +description = "Type annotations for boto3 1.28.56 generated with mypy-boto3-builder 7.19.0" optional = false python-versions = ">=3.7" files = [ - {file = "boto3-stubs-1.28.54.tar.gz", hash = "sha256:cb6f6a6cca4826c2b5d61b0a3fcf9cb84d7924c120b7e1c670f08e21cc010425"}, - {file = "boto3_stubs-1.28.54-py3-none-any.whl", hash = "sha256:a58a6ebd0767d7aa1f4ab6f3eec93388b20afff5c5b25c8fe8f866d85eda6f67"}, + {file = "boto3-stubs-1.28.56.tar.gz", hash = "sha256:2b77d4b9e944ffa4308822dc0327269f47cc4527a97903a3c5d14b9abed4eaad"}, + {file = "boto3_stubs-1.28.56-py3-none-any.whl", hash = "sha256:e9c1f32463cf8e6ba56588f7c0f5cae52190b87eb19744ad1067a717be1c80b3"}, ] [package.dependencies] @@ -114,7 +114,7 @@ backup-gateway = ["mypy-boto3-backup-gateway (>=1.28.0,<1.29.0)"] backupstorage = ["mypy-boto3-backupstorage (>=1.28.0,<1.29.0)"] batch = ["mypy-boto3-batch (>=1.28.0,<1.29.0)"] billingconductor = ["mypy-boto3-billingconductor (>=1.28.0,<1.29.0)"] -boto3 = ["boto3 (==1.28.54)", "botocore (==1.31.54)"] +boto3 = ["boto3 (==1.28.56)", "botocore (==1.31.56)"] braket = ["mypy-boto3-braket (>=1.28.0,<1.29.0)"] budgets = ["mypy-boto3-budgets (>=1.28.0,<1.29.0)"] ce = ["mypy-boto3-ce (>=1.28.0,<1.29.0)"] @@ -440,13 +440,13 @@ xray = ["mypy-boto3-xray (>=1.28.0,<1.29.0)"] [[package]] name = "botocore" -version = "1.31.54" +version = "1.31.56" description = "Low-level, data-driven core of boto 3." 
optional = false python-versions = ">= 3.7" files = [ - {file = "botocore-1.31.54-py3-none-any.whl", hash = "sha256:71fdb337ddcdb6bf378e1211cba9ce754c35f12b1524c7d0c0c147b2310356c7"}, - {file = "botocore-1.31.54.tar.gz", hash = "sha256:c98e78a9490c4166b205f87912b46770e156bfe7d53bae54ccbd49c68a336ec6"}, + {file = "botocore-1.31.56-py3-none-any.whl", hash = "sha256:66c686e4eda7051ffcc9357d9075390c8ab2f95a2977669039618ee186fb533b"}, + {file = "botocore-1.31.56.tar.gz", hash = "sha256:70252cd8abc2fe9b791328e187620f5a3911545e2520486b01ecfad31f41b9cb"}, ] [package.dependencies] @@ -459,13 +459,13 @@ crt = ["awscrt (==0.16.26)"] [[package]] name = "botocore-stubs" -version = "1.31.54" +version = "1.31.56" description = "Type annotations and code completion for botocore" optional = false python-versions = ">=3.7,<4.0" files = [ - {file = "botocore_stubs-1.31.54-py3-none-any.whl", hash = "sha256:6e3e015ba4d2172c6a9bd5ff5131c39c995991559a567dce6c8534884f83ded1"}, - {file = "botocore_stubs-1.31.54.tar.gz", hash = "sha256:96223fdf87a68037dd1ed366ffe28b5b744aca495a365f4535e2179c9a88455c"}, + {file = "botocore_stubs-1.31.56-py3-none-any.whl", hash = "sha256:e8f7273f57c09ef47f8bdc89765b151f43a46b3c29c52fb58e9982115ed14d84"}, + {file = "botocore_stubs-1.31.56.tar.gz", hash = "sha256:e1510bd361acf755ecace42ee984281adfea6f96695ed22a08a4ceeabfe66f5f"}, ] [package.dependencies] @@ -1366,13 +1366,13 @@ idna2008 = ["idna"] [[package]] name = "s3transfer" -version = "0.6.2" +version = "0.7.0" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">= 3.7" files = [ - {file = "s3transfer-0.6.2-py3-none-any.whl", hash = "sha256:b014be3a8a2aab98cfe1abc7229cc5a9a0cf05eb9c1f2b86b230fd8df3f78084"}, - {file = "s3transfer-0.6.2.tar.gz", hash = "sha256:cab66d3380cca3e70939ef2255d01cd8aece6a4907a9528740f668c4b0611861"}, + {file = "s3transfer-0.7.0-py3-none-any.whl", hash = "sha256:10d6923c6359175f264811ef4bf6161a3156ce8e350e705396a7557d6293c33a"}, + {file = "s3transfer-0.7.0.tar.gz", hash = "sha256:fd3889a66f5fe17299fe75b82eae6cf722554edca744ca5d5fe308b104883d2e"}, ] [package.dependencies] @@ -1489,13 +1489,13 @@ files = [ [[package]] name = "types-s3transfer" -version = "0.6.2" +version = "0.7.0" description = "Type annotations and code completion for s3transfer" optional = false python-versions = ">=3.7,<4.0" files = [ - {file = "types_s3transfer-0.6.2-py3-none-any.whl", hash = "sha256:1068877b6e59be5226fa3006ae64371ac9d5bc590dfdbd9c66fd0a075d3254ac"}, - {file = "types_s3transfer-0.6.2.tar.gz", hash = "sha256:4ba9b483796fdcd026aa162ee03bdcedd2bf7d08e9387c820dcdd158b0102057"}, + {file = "types_s3transfer-0.7.0-py3-none-any.whl", hash = "sha256:ae9ed9273465d9f43da8b96307383da410c6b59c3b2464c88d20b578768e97c6"}, + {file = "types_s3transfer-0.7.0.tar.gz", hash = "sha256:aca0f2486d0a3a5037cd5b8f3e20a4522a29579a8dd183281ff0aa1c4e2c8aa7"}, ] [[package]] From 7aeecb258925e1e0b6ae8297e32732200fd452e3 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Fri, 29 Sep 2023 13:45:35 -0400 Subject: [PATCH 073/101] poetry update --- poetry.lock | 163 +++++++++++++++++++++++-------------------------- pyproject.toml | 8 +-- 2 files changed, 80 insertions(+), 91 deletions(-) diff --git a/poetry.lock b/poetry.lock index 471323aa3..ba5a1bb74 100644 --- a/poetry.lock +++ b/poetry.lock @@ -45,17 +45,17 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.28.56" +version = "1.28.57" description = "The AWS SDK for Python" optional = false python-versions = ">= 3.7" files = [ - {file = 
"boto3-1.28.56-py3-none-any.whl", hash = "sha256:f5fcb27cdbd08ca38d699f2d2e32d96d1d9fab3368c15c6bc326256612d2cfd7"}, - {file = "boto3-1.28.56.tar.gz", hash = "sha256:b927a7ed335d543c33c15fa63f1076f3fa8422959771c2187da74bc4395ab6e3"}, + {file = "boto3-1.28.57-py3-none-any.whl", hash = "sha256:5ddf24cf52c7fb6aaa332eaa08ae8c2afc8f2d1e8860680728533dd573904e32"}, + {file = "boto3-1.28.57.tar.gz", hash = "sha256:e2d2824ba6459b330d097e94039a9c4f96ae3f4bcdc731d620589ad79dcd16d3"}, ] [package.dependencies] -botocore = ">=1.31.56,<1.32.0" +botocore = ">=1.31.57,<1.32.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.7.0,<0.8.0" @@ -64,13 +64,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "boto3-stubs" -version = "1.28.56" -description = "Type annotations for boto3 1.28.56 generated with mypy-boto3-builder 7.19.0" +version = "1.28.57" +description = "Type annotations for boto3 1.28.57 generated with mypy-boto3-builder 7.19.0" optional = false python-versions = ">=3.7" files = [ - {file = "boto3-stubs-1.28.56.tar.gz", hash = "sha256:2b77d4b9e944ffa4308822dc0327269f47cc4527a97903a3c5d14b9abed4eaad"}, - {file = "boto3_stubs-1.28.56-py3-none-any.whl", hash = "sha256:e9c1f32463cf8e6ba56588f7c0f5cae52190b87eb19744ad1067a717be1c80b3"}, + {file = "boto3-stubs-1.28.57.tar.gz", hash = "sha256:61cd792792c2a16d70801d187ebed705c39962a70e90b5dc0f33d04001fb39ae"}, + {file = "boto3_stubs-1.28.57-py3-none-any.whl", hash = "sha256:b389a693e33d75ed19d38b4a2ba99d25f4f2045a3136f816d7fec4d27a71ddb4"}, ] [package.dependencies] @@ -84,7 +84,7 @@ account = ["mypy-boto3-account (>=1.28.0,<1.29.0)"] acm = ["mypy-boto3-acm (>=1.28.0,<1.29.0)"] acm-pca = ["mypy-boto3-acm-pca (>=1.28.0,<1.29.0)"] alexaforbusiness = ["mypy-boto3-alexaforbusiness (>=1.28.0,<1.29.0)"] -all = ["mypy-boto3-accessanalyzer (>=1.28.0,<1.29.0)", "mypy-boto3-account (>=1.28.0,<1.29.0)", "mypy-boto3-acm (>=1.28.0,<1.29.0)", "mypy-boto3-acm-pca (>=1.28.0,<1.29.0)", "mypy-boto3-alexaforbusiness (>=1.28.0,<1.29.0)", "mypy-boto3-amp (>=1.28.0,<1.29.0)", "mypy-boto3-amplify (>=1.28.0,<1.29.0)", "mypy-boto3-amplifybackend (>=1.28.0,<1.29.0)", "mypy-boto3-amplifyuibuilder (>=1.28.0,<1.29.0)", "mypy-boto3-apigateway (>=1.28.0,<1.29.0)", "mypy-boto3-apigatewaymanagementapi (>=1.28.0,<1.29.0)", "mypy-boto3-apigatewayv2 (>=1.28.0,<1.29.0)", "mypy-boto3-appconfig (>=1.28.0,<1.29.0)", "mypy-boto3-appconfigdata (>=1.28.0,<1.29.0)", "mypy-boto3-appfabric (>=1.28.0,<1.29.0)", "mypy-boto3-appflow (>=1.28.0,<1.29.0)", "mypy-boto3-appintegrations (>=1.28.0,<1.29.0)", "mypy-boto3-application-autoscaling (>=1.28.0,<1.29.0)", "mypy-boto3-application-insights (>=1.28.0,<1.29.0)", "mypy-boto3-applicationcostprofiler (>=1.28.0,<1.29.0)", "mypy-boto3-appmesh (>=1.28.0,<1.29.0)", "mypy-boto3-apprunner (>=1.28.0,<1.29.0)", "mypy-boto3-appstream (>=1.28.0,<1.29.0)", "mypy-boto3-appsync (>=1.28.0,<1.29.0)", "mypy-boto3-arc-zonal-shift (>=1.28.0,<1.29.0)", "mypy-boto3-athena (>=1.28.0,<1.29.0)", "mypy-boto3-auditmanager (>=1.28.0,<1.29.0)", "mypy-boto3-autoscaling (>=1.28.0,<1.29.0)", "mypy-boto3-autoscaling-plans (>=1.28.0,<1.29.0)", "mypy-boto3-backup (>=1.28.0,<1.29.0)", "mypy-boto3-backup-gateway (>=1.28.0,<1.29.0)", "mypy-boto3-backupstorage (>=1.28.0,<1.29.0)", "mypy-boto3-batch (>=1.28.0,<1.29.0)", "mypy-boto3-billingconductor (>=1.28.0,<1.29.0)", "mypy-boto3-braket (>=1.28.0,<1.29.0)", "mypy-boto3-budgets (>=1.28.0,<1.29.0)", "mypy-boto3-ce (>=1.28.0,<1.29.0)", "mypy-boto3-chime (>=1.28.0,<1.29.0)", "mypy-boto3-chime-sdk-identity (>=1.28.0,<1.29.0)", 
"mypy-boto3-chime-sdk-media-pipelines (>=1.28.0,<1.29.0)", "mypy-boto3-chime-sdk-meetings (>=1.28.0,<1.29.0)", "mypy-boto3-chime-sdk-messaging (>=1.28.0,<1.29.0)", "mypy-boto3-chime-sdk-voice (>=1.28.0,<1.29.0)", "mypy-boto3-cleanrooms (>=1.28.0,<1.29.0)", "mypy-boto3-cloud9 (>=1.28.0,<1.29.0)", "mypy-boto3-cloudcontrol (>=1.28.0,<1.29.0)", "mypy-boto3-clouddirectory (>=1.28.0,<1.29.0)", "mypy-boto3-cloudformation (>=1.28.0,<1.29.0)", "mypy-boto3-cloudfront (>=1.28.0,<1.29.0)", "mypy-boto3-cloudhsm (>=1.28.0,<1.29.0)", "mypy-boto3-cloudhsmv2 (>=1.28.0,<1.29.0)", "mypy-boto3-cloudsearch (>=1.28.0,<1.29.0)", "mypy-boto3-cloudsearchdomain (>=1.28.0,<1.29.0)", "mypy-boto3-cloudtrail (>=1.28.0,<1.29.0)", "mypy-boto3-cloudtrail-data (>=1.28.0,<1.29.0)", "mypy-boto3-cloudwatch (>=1.28.0,<1.29.0)", "mypy-boto3-codeartifact (>=1.28.0,<1.29.0)", "mypy-boto3-codebuild (>=1.28.0,<1.29.0)", "mypy-boto3-codecatalyst (>=1.28.0,<1.29.0)", "mypy-boto3-codecommit (>=1.28.0,<1.29.0)", "mypy-boto3-codedeploy (>=1.28.0,<1.29.0)", "mypy-boto3-codeguru-reviewer (>=1.28.0,<1.29.0)", "mypy-boto3-codeguru-security (>=1.28.0,<1.29.0)", "mypy-boto3-codeguruprofiler (>=1.28.0,<1.29.0)", "mypy-boto3-codepipeline (>=1.28.0,<1.29.0)", "mypy-boto3-codestar (>=1.28.0,<1.29.0)", "mypy-boto3-codestar-connections (>=1.28.0,<1.29.0)", "mypy-boto3-codestar-notifications (>=1.28.0,<1.29.0)", "mypy-boto3-cognito-identity (>=1.28.0,<1.29.0)", "mypy-boto3-cognito-idp (>=1.28.0,<1.29.0)", "mypy-boto3-cognito-sync (>=1.28.0,<1.29.0)", "mypy-boto3-comprehend (>=1.28.0,<1.29.0)", "mypy-boto3-comprehendmedical (>=1.28.0,<1.29.0)", "mypy-boto3-compute-optimizer (>=1.28.0,<1.29.0)", "mypy-boto3-config (>=1.28.0,<1.29.0)", "mypy-boto3-connect (>=1.28.0,<1.29.0)", "mypy-boto3-connect-contact-lens (>=1.28.0,<1.29.0)", "mypy-boto3-connectcampaigns (>=1.28.0,<1.29.0)", "mypy-boto3-connectcases (>=1.28.0,<1.29.0)", "mypy-boto3-connectparticipant (>=1.28.0,<1.29.0)", "mypy-boto3-controltower (>=1.28.0,<1.29.0)", "mypy-boto3-cur (>=1.28.0,<1.29.0)", "mypy-boto3-customer-profiles (>=1.28.0,<1.29.0)", "mypy-boto3-databrew (>=1.28.0,<1.29.0)", "mypy-boto3-dataexchange (>=1.28.0,<1.29.0)", "mypy-boto3-datapipeline (>=1.28.0,<1.29.0)", "mypy-boto3-datasync (>=1.28.0,<1.29.0)", "mypy-boto3-dax (>=1.28.0,<1.29.0)", "mypy-boto3-detective (>=1.28.0,<1.29.0)", "mypy-boto3-devicefarm (>=1.28.0,<1.29.0)", "mypy-boto3-devops-guru (>=1.28.0,<1.29.0)", "mypy-boto3-directconnect (>=1.28.0,<1.29.0)", "mypy-boto3-discovery (>=1.28.0,<1.29.0)", "mypy-boto3-dlm (>=1.28.0,<1.29.0)", "mypy-boto3-dms (>=1.28.0,<1.29.0)", "mypy-boto3-docdb (>=1.28.0,<1.29.0)", "mypy-boto3-docdb-elastic (>=1.28.0,<1.29.0)", "mypy-boto3-drs (>=1.28.0,<1.29.0)", "mypy-boto3-ds (>=1.28.0,<1.29.0)", "mypy-boto3-dynamodb (>=1.28.0,<1.29.0)", "mypy-boto3-dynamodbstreams (>=1.28.0,<1.29.0)", "mypy-boto3-ebs (>=1.28.0,<1.29.0)", "mypy-boto3-ec2 (>=1.28.0,<1.29.0)", "mypy-boto3-ec2-instance-connect (>=1.28.0,<1.29.0)", "mypy-boto3-ecr (>=1.28.0,<1.29.0)", "mypy-boto3-ecr-public (>=1.28.0,<1.29.0)", "mypy-boto3-ecs (>=1.28.0,<1.29.0)", "mypy-boto3-efs (>=1.28.0,<1.29.0)", "mypy-boto3-eks (>=1.28.0,<1.29.0)", "mypy-boto3-elastic-inference (>=1.28.0,<1.29.0)", "mypy-boto3-elasticache (>=1.28.0,<1.29.0)", "mypy-boto3-elasticbeanstalk (>=1.28.0,<1.29.0)", "mypy-boto3-elastictranscoder (>=1.28.0,<1.29.0)", "mypy-boto3-elb (>=1.28.0,<1.29.0)", "mypy-boto3-elbv2 (>=1.28.0,<1.29.0)", "mypy-boto3-emr (>=1.28.0,<1.29.0)", "mypy-boto3-emr-containers (>=1.28.0,<1.29.0)", "mypy-boto3-emr-serverless 
(>=1.28.0,<1.29.0)", "mypy-boto3-entityresolution (>=1.28.0,<1.29.0)", "mypy-boto3-es (>=1.28.0,<1.29.0)", "mypy-boto3-events (>=1.28.0,<1.29.0)", "mypy-boto3-evidently (>=1.28.0,<1.29.0)", "mypy-boto3-finspace (>=1.28.0,<1.29.0)", "mypy-boto3-finspace-data (>=1.28.0,<1.29.0)", "mypy-boto3-firehose (>=1.28.0,<1.29.0)", "mypy-boto3-fis (>=1.28.0,<1.29.0)", "mypy-boto3-fms (>=1.28.0,<1.29.0)", "mypy-boto3-forecast (>=1.28.0,<1.29.0)", "mypy-boto3-forecastquery (>=1.28.0,<1.29.0)", "mypy-boto3-frauddetector (>=1.28.0,<1.29.0)", "mypy-boto3-fsx (>=1.28.0,<1.29.0)", "mypy-boto3-gamelift (>=1.28.0,<1.29.0)", "mypy-boto3-gamesparks (>=1.28.0,<1.29.0)", "mypy-boto3-glacier (>=1.28.0,<1.29.0)", "mypy-boto3-globalaccelerator (>=1.28.0,<1.29.0)", "mypy-boto3-glue (>=1.28.0,<1.29.0)", "mypy-boto3-grafana (>=1.28.0,<1.29.0)", "mypy-boto3-greengrass (>=1.28.0,<1.29.0)", "mypy-boto3-greengrassv2 (>=1.28.0,<1.29.0)", "mypy-boto3-groundstation (>=1.28.0,<1.29.0)", "mypy-boto3-guardduty (>=1.28.0,<1.29.0)", "mypy-boto3-health (>=1.28.0,<1.29.0)", "mypy-boto3-healthlake (>=1.28.0,<1.29.0)", "mypy-boto3-honeycode (>=1.28.0,<1.29.0)", "mypy-boto3-iam (>=1.28.0,<1.29.0)", "mypy-boto3-identitystore (>=1.28.0,<1.29.0)", "mypy-boto3-imagebuilder (>=1.28.0,<1.29.0)", "mypy-boto3-importexport (>=1.28.0,<1.29.0)", "mypy-boto3-inspector (>=1.28.0,<1.29.0)", "mypy-boto3-inspector2 (>=1.28.0,<1.29.0)", "mypy-boto3-internetmonitor (>=1.28.0,<1.29.0)", "mypy-boto3-iot (>=1.28.0,<1.29.0)", "mypy-boto3-iot-data (>=1.28.0,<1.29.0)", "mypy-boto3-iot-jobs-data (>=1.28.0,<1.29.0)", "mypy-boto3-iot-roborunner (>=1.28.0,<1.29.0)", "mypy-boto3-iot1click-devices (>=1.28.0,<1.29.0)", "mypy-boto3-iot1click-projects (>=1.28.0,<1.29.0)", "mypy-boto3-iotanalytics (>=1.28.0,<1.29.0)", "mypy-boto3-iotdeviceadvisor (>=1.28.0,<1.29.0)", "mypy-boto3-iotevents (>=1.28.0,<1.29.0)", "mypy-boto3-iotevents-data (>=1.28.0,<1.29.0)", "mypy-boto3-iotfleethub (>=1.28.0,<1.29.0)", "mypy-boto3-iotfleetwise (>=1.28.0,<1.29.0)", "mypy-boto3-iotsecuretunneling (>=1.28.0,<1.29.0)", "mypy-boto3-iotsitewise (>=1.28.0,<1.29.0)", "mypy-boto3-iotthingsgraph (>=1.28.0,<1.29.0)", "mypy-boto3-iottwinmaker (>=1.28.0,<1.29.0)", "mypy-boto3-iotwireless (>=1.28.0,<1.29.0)", "mypy-boto3-ivs (>=1.28.0,<1.29.0)", "mypy-boto3-ivs-realtime (>=1.28.0,<1.29.0)", "mypy-boto3-ivschat (>=1.28.0,<1.29.0)", "mypy-boto3-kafka (>=1.28.0,<1.29.0)", "mypy-boto3-kafkaconnect (>=1.28.0,<1.29.0)", "mypy-boto3-kendra (>=1.28.0,<1.29.0)", "mypy-boto3-kendra-ranking (>=1.28.0,<1.29.0)", "mypy-boto3-keyspaces (>=1.28.0,<1.29.0)", "mypy-boto3-kinesis (>=1.28.0,<1.29.0)", "mypy-boto3-kinesis-video-archived-media (>=1.28.0,<1.29.0)", "mypy-boto3-kinesis-video-media (>=1.28.0,<1.29.0)", "mypy-boto3-kinesis-video-signaling (>=1.28.0,<1.29.0)", "mypy-boto3-kinesis-video-webrtc-storage (>=1.28.0,<1.29.0)", "mypy-boto3-kinesisanalytics (>=1.28.0,<1.29.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.28.0,<1.29.0)", "mypy-boto3-kinesisvideo (>=1.28.0,<1.29.0)", "mypy-boto3-kms (>=1.28.0,<1.29.0)", "mypy-boto3-lakeformation (>=1.28.0,<1.29.0)", "mypy-boto3-lambda (>=1.28.0,<1.29.0)", "mypy-boto3-lex-models (>=1.28.0,<1.29.0)", "mypy-boto3-lex-runtime (>=1.28.0,<1.29.0)", "mypy-boto3-lexv2-models (>=1.28.0,<1.29.0)", "mypy-boto3-lexv2-runtime (>=1.28.0,<1.29.0)", "mypy-boto3-license-manager (>=1.28.0,<1.29.0)", "mypy-boto3-license-manager-linux-subscriptions (>=1.28.0,<1.29.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.28.0,<1.29.0)", "mypy-boto3-lightsail (>=1.28.0,<1.29.0)", "mypy-boto3-location 
(>=1.28.0,<1.29.0)", "mypy-boto3-logs (>=1.28.0,<1.29.0)", "mypy-boto3-lookoutequipment (>=1.28.0,<1.29.0)", "mypy-boto3-lookoutmetrics (>=1.28.0,<1.29.0)", "mypy-boto3-lookoutvision (>=1.28.0,<1.29.0)", "mypy-boto3-m2 (>=1.28.0,<1.29.0)", "mypy-boto3-machinelearning (>=1.28.0,<1.29.0)", "mypy-boto3-macie (>=1.28.0,<1.29.0)", "mypy-boto3-macie2 (>=1.28.0,<1.29.0)", "mypy-boto3-managedblockchain (>=1.28.0,<1.29.0)", "mypy-boto3-managedblockchain-query (>=1.28.0,<1.29.0)", "mypy-boto3-marketplace-catalog (>=1.28.0,<1.29.0)", "mypy-boto3-marketplace-entitlement (>=1.28.0,<1.29.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.28.0,<1.29.0)", "mypy-boto3-mediaconnect (>=1.28.0,<1.29.0)", "mypy-boto3-mediaconvert (>=1.28.0,<1.29.0)", "mypy-boto3-medialive (>=1.28.0,<1.29.0)", "mypy-boto3-mediapackage (>=1.28.0,<1.29.0)", "mypy-boto3-mediapackage-vod (>=1.28.0,<1.29.0)", "mypy-boto3-mediapackagev2 (>=1.28.0,<1.29.0)", "mypy-boto3-mediastore (>=1.28.0,<1.29.0)", "mypy-boto3-mediastore-data (>=1.28.0,<1.29.0)", "mypy-boto3-mediatailor (>=1.28.0,<1.29.0)", "mypy-boto3-medical-imaging (>=1.28.0,<1.29.0)", "mypy-boto3-memorydb (>=1.28.0,<1.29.0)", "mypy-boto3-meteringmarketplace (>=1.28.0,<1.29.0)", "mypy-boto3-mgh (>=1.28.0,<1.29.0)", "mypy-boto3-mgn (>=1.28.0,<1.29.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.28.0,<1.29.0)", "mypy-boto3-migrationhub-config (>=1.28.0,<1.29.0)", "mypy-boto3-migrationhuborchestrator (>=1.28.0,<1.29.0)", "mypy-boto3-migrationhubstrategy (>=1.28.0,<1.29.0)", "mypy-boto3-mobile (>=1.28.0,<1.29.0)", "mypy-boto3-mq (>=1.28.0,<1.29.0)", "mypy-boto3-mturk (>=1.28.0,<1.29.0)", "mypy-boto3-mwaa (>=1.28.0,<1.29.0)", "mypy-boto3-neptune (>=1.28.0,<1.29.0)", "mypy-boto3-neptunedata (>=1.28.0,<1.29.0)", "mypy-boto3-network-firewall (>=1.28.0,<1.29.0)", "mypy-boto3-networkmanager (>=1.28.0,<1.29.0)", "mypy-boto3-nimble (>=1.28.0,<1.29.0)", "mypy-boto3-oam (>=1.28.0,<1.29.0)", "mypy-boto3-omics (>=1.28.0,<1.29.0)", "mypy-boto3-opensearch (>=1.28.0,<1.29.0)", "mypy-boto3-opensearchserverless (>=1.28.0,<1.29.0)", "mypy-boto3-opsworks (>=1.28.0,<1.29.0)", "mypy-boto3-opsworkscm (>=1.28.0,<1.29.0)", "mypy-boto3-organizations (>=1.28.0,<1.29.0)", "mypy-boto3-osis (>=1.28.0,<1.29.0)", "mypy-boto3-outposts (>=1.28.0,<1.29.0)", "mypy-boto3-panorama (>=1.28.0,<1.29.0)", "mypy-boto3-payment-cryptography (>=1.28.0,<1.29.0)", "mypy-boto3-payment-cryptography-data (>=1.28.0,<1.29.0)", "mypy-boto3-pca-connector-ad (>=1.28.0,<1.29.0)", "mypy-boto3-personalize (>=1.28.0,<1.29.0)", "mypy-boto3-personalize-events (>=1.28.0,<1.29.0)", "mypy-boto3-personalize-runtime (>=1.28.0,<1.29.0)", "mypy-boto3-pi (>=1.28.0,<1.29.0)", "mypy-boto3-pinpoint (>=1.28.0,<1.29.0)", "mypy-boto3-pinpoint-email (>=1.28.0,<1.29.0)", "mypy-boto3-pinpoint-sms-voice (>=1.28.0,<1.29.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.28.0,<1.29.0)", "mypy-boto3-pipes (>=1.28.0,<1.29.0)", "mypy-boto3-polly (>=1.28.0,<1.29.0)", "mypy-boto3-pricing (>=1.28.0,<1.29.0)", "mypy-boto3-privatenetworks (>=1.28.0,<1.29.0)", "mypy-boto3-proton (>=1.28.0,<1.29.0)", "mypy-boto3-qldb (>=1.28.0,<1.29.0)", "mypy-boto3-qldb-session (>=1.28.0,<1.29.0)", "mypy-boto3-quicksight (>=1.28.0,<1.29.0)", "mypy-boto3-ram (>=1.28.0,<1.29.0)", "mypy-boto3-rbin (>=1.28.0,<1.29.0)", "mypy-boto3-rds (>=1.28.0,<1.29.0)", "mypy-boto3-rds-data (>=1.28.0,<1.29.0)", "mypy-boto3-redshift (>=1.28.0,<1.29.0)", "mypy-boto3-redshift-data (>=1.28.0,<1.29.0)", "mypy-boto3-redshift-serverless (>=1.28.0,<1.29.0)", "mypy-boto3-rekognition (>=1.28.0,<1.29.0)", 
"mypy-boto3-resiliencehub (>=1.28.0,<1.29.0)", "mypy-boto3-resource-explorer-2 (>=1.28.0,<1.29.0)", "mypy-boto3-resource-groups (>=1.28.0,<1.29.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.28.0,<1.29.0)", "mypy-boto3-robomaker (>=1.28.0,<1.29.0)", "mypy-boto3-rolesanywhere (>=1.28.0,<1.29.0)", "mypy-boto3-route53 (>=1.28.0,<1.29.0)", "mypy-boto3-route53-recovery-cluster (>=1.28.0,<1.29.0)", "mypy-boto3-route53-recovery-control-config (>=1.28.0,<1.29.0)", "mypy-boto3-route53-recovery-readiness (>=1.28.0,<1.29.0)", "mypy-boto3-route53domains (>=1.28.0,<1.29.0)", "mypy-boto3-route53resolver (>=1.28.0,<1.29.0)", "mypy-boto3-rum (>=1.28.0,<1.29.0)", "mypy-boto3-s3 (>=1.28.0,<1.29.0)", "mypy-boto3-s3control (>=1.28.0,<1.29.0)", "mypy-boto3-s3outposts (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker-edge (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker-geospatial (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker-metrics (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker-runtime (>=1.28.0,<1.29.0)", "mypy-boto3-savingsplans (>=1.28.0,<1.29.0)", "mypy-boto3-scheduler (>=1.28.0,<1.29.0)", "mypy-boto3-schemas (>=1.28.0,<1.29.0)", "mypy-boto3-sdb (>=1.28.0,<1.29.0)", "mypy-boto3-secretsmanager (>=1.28.0,<1.29.0)", "mypy-boto3-securityhub (>=1.28.0,<1.29.0)", "mypy-boto3-securitylake (>=1.28.0,<1.29.0)", "mypy-boto3-serverlessrepo (>=1.28.0,<1.29.0)", "mypy-boto3-service-quotas (>=1.28.0,<1.29.0)", "mypy-boto3-servicecatalog (>=1.28.0,<1.29.0)", "mypy-boto3-servicecatalog-appregistry (>=1.28.0,<1.29.0)", "mypy-boto3-servicediscovery (>=1.28.0,<1.29.0)", "mypy-boto3-ses (>=1.28.0,<1.29.0)", "mypy-boto3-sesv2 (>=1.28.0,<1.29.0)", "mypy-boto3-shield (>=1.28.0,<1.29.0)", "mypy-boto3-signer (>=1.28.0,<1.29.0)", "mypy-boto3-simspaceweaver (>=1.28.0,<1.29.0)", "mypy-boto3-sms (>=1.28.0,<1.29.0)", "mypy-boto3-sms-voice (>=1.28.0,<1.29.0)", "mypy-boto3-snow-device-management (>=1.28.0,<1.29.0)", "mypy-boto3-snowball (>=1.28.0,<1.29.0)", "mypy-boto3-sns (>=1.28.0,<1.29.0)", "mypy-boto3-sqs (>=1.28.0,<1.29.0)", "mypy-boto3-ssm (>=1.28.0,<1.29.0)", "mypy-boto3-ssm-contacts (>=1.28.0,<1.29.0)", "mypy-boto3-ssm-incidents (>=1.28.0,<1.29.0)", "mypy-boto3-ssm-sap (>=1.28.0,<1.29.0)", "mypy-boto3-sso (>=1.28.0,<1.29.0)", "mypy-boto3-sso-admin (>=1.28.0,<1.29.0)", "mypy-boto3-sso-oidc (>=1.28.0,<1.29.0)", "mypy-boto3-stepfunctions (>=1.28.0,<1.29.0)", "mypy-boto3-storagegateway (>=1.28.0,<1.29.0)", "mypy-boto3-sts (>=1.28.0,<1.29.0)", "mypy-boto3-support (>=1.28.0,<1.29.0)", "mypy-boto3-support-app (>=1.28.0,<1.29.0)", "mypy-boto3-swf (>=1.28.0,<1.29.0)", "mypy-boto3-synthetics (>=1.28.0,<1.29.0)", "mypy-boto3-textract (>=1.28.0,<1.29.0)", "mypy-boto3-timestream-query (>=1.28.0,<1.29.0)", "mypy-boto3-timestream-write (>=1.28.0,<1.29.0)", "mypy-boto3-tnb (>=1.28.0,<1.29.0)", "mypy-boto3-transcribe (>=1.28.0,<1.29.0)", "mypy-boto3-transfer (>=1.28.0,<1.29.0)", "mypy-boto3-translate (>=1.28.0,<1.29.0)", "mypy-boto3-verifiedpermissions (>=1.28.0,<1.29.0)", "mypy-boto3-voice-id (>=1.28.0,<1.29.0)", "mypy-boto3-vpc-lattice (>=1.28.0,<1.29.0)", "mypy-boto3-waf (>=1.28.0,<1.29.0)", "mypy-boto3-waf-regional (>=1.28.0,<1.29.0)", "mypy-boto3-wafv2 (>=1.28.0,<1.29.0)", "mypy-boto3-wellarchitected (>=1.28.0,<1.29.0)", "mypy-boto3-wisdom (>=1.28.0,<1.29.0)", "mypy-boto3-workdocs (>=1.28.0,<1.29.0)", "mypy-boto3-worklink (>=1.28.0,<1.29.0)", "mypy-boto3-workmail (>=1.28.0,<1.29.0)", 
"mypy-boto3-workmailmessageflow (>=1.28.0,<1.29.0)", "mypy-boto3-workspaces (>=1.28.0,<1.29.0)", "mypy-boto3-workspaces-web (>=1.28.0,<1.29.0)", "mypy-boto3-xray (>=1.28.0,<1.29.0)"] +all = ["mypy-boto3-accessanalyzer (>=1.28.0,<1.29.0)", "mypy-boto3-account (>=1.28.0,<1.29.0)", "mypy-boto3-acm (>=1.28.0,<1.29.0)", "mypy-boto3-acm-pca (>=1.28.0,<1.29.0)", "mypy-boto3-alexaforbusiness (>=1.28.0,<1.29.0)", "mypy-boto3-amp (>=1.28.0,<1.29.0)", "mypy-boto3-amplify (>=1.28.0,<1.29.0)", "mypy-boto3-amplifybackend (>=1.28.0,<1.29.0)", "mypy-boto3-amplifyuibuilder (>=1.28.0,<1.29.0)", "mypy-boto3-apigateway (>=1.28.0,<1.29.0)", "mypy-boto3-apigatewaymanagementapi (>=1.28.0,<1.29.0)", "mypy-boto3-apigatewayv2 (>=1.28.0,<1.29.0)", "mypy-boto3-appconfig (>=1.28.0,<1.29.0)", "mypy-boto3-appconfigdata (>=1.28.0,<1.29.0)", "mypy-boto3-appfabric (>=1.28.0,<1.29.0)", "mypy-boto3-appflow (>=1.28.0,<1.29.0)", "mypy-boto3-appintegrations (>=1.28.0,<1.29.0)", "mypy-boto3-application-autoscaling (>=1.28.0,<1.29.0)", "mypy-boto3-application-insights (>=1.28.0,<1.29.0)", "mypy-boto3-applicationcostprofiler (>=1.28.0,<1.29.0)", "mypy-boto3-appmesh (>=1.28.0,<1.29.0)", "mypy-boto3-apprunner (>=1.28.0,<1.29.0)", "mypy-boto3-appstream (>=1.28.0,<1.29.0)", "mypy-boto3-appsync (>=1.28.0,<1.29.0)", "mypy-boto3-arc-zonal-shift (>=1.28.0,<1.29.0)", "mypy-boto3-athena (>=1.28.0,<1.29.0)", "mypy-boto3-auditmanager (>=1.28.0,<1.29.0)", "mypy-boto3-autoscaling (>=1.28.0,<1.29.0)", "mypy-boto3-autoscaling-plans (>=1.28.0,<1.29.0)", "mypy-boto3-backup (>=1.28.0,<1.29.0)", "mypy-boto3-backup-gateway (>=1.28.0,<1.29.0)", "mypy-boto3-backupstorage (>=1.28.0,<1.29.0)", "mypy-boto3-batch (>=1.28.0,<1.29.0)", "mypy-boto3-bedrock (>=1.28.0,<1.29.0)", "mypy-boto3-bedrock-runtime (>=1.28.0,<1.29.0)", "mypy-boto3-billingconductor (>=1.28.0,<1.29.0)", "mypy-boto3-braket (>=1.28.0,<1.29.0)", "mypy-boto3-budgets (>=1.28.0,<1.29.0)", "mypy-boto3-ce (>=1.28.0,<1.29.0)", "mypy-boto3-chime (>=1.28.0,<1.29.0)", "mypy-boto3-chime-sdk-identity (>=1.28.0,<1.29.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.28.0,<1.29.0)", "mypy-boto3-chime-sdk-meetings (>=1.28.0,<1.29.0)", "mypy-boto3-chime-sdk-messaging (>=1.28.0,<1.29.0)", "mypy-boto3-chime-sdk-voice (>=1.28.0,<1.29.0)", "mypy-boto3-cleanrooms (>=1.28.0,<1.29.0)", "mypy-boto3-cloud9 (>=1.28.0,<1.29.0)", "mypy-boto3-cloudcontrol (>=1.28.0,<1.29.0)", "mypy-boto3-clouddirectory (>=1.28.0,<1.29.0)", "mypy-boto3-cloudformation (>=1.28.0,<1.29.0)", "mypy-boto3-cloudfront (>=1.28.0,<1.29.0)", "mypy-boto3-cloudhsm (>=1.28.0,<1.29.0)", "mypy-boto3-cloudhsmv2 (>=1.28.0,<1.29.0)", "mypy-boto3-cloudsearch (>=1.28.0,<1.29.0)", "mypy-boto3-cloudsearchdomain (>=1.28.0,<1.29.0)", "mypy-boto3-cloudtrail (>=1.28.0,<1.29.0)", "mypy-boto3-cloudtrail-data (>=1.28.0,<1.29.0)", "mypy-boto3-cloudwatch (>=1.28.0,<1.29.0)", "mypy-boto3-codeartifact (>=1.28.0,<1.29.0)", "mypy-boto3-codebuild (>=1.28.0,<1.29.0)", "mypy-boto3-codecatalyst (>=1.28.0,<1.29.0)", "mypy-boto3-codecommit (>=1.28.0,<1.29.0)", "mypy-boto3-codedeploy (>=1.28.0,<1.29.0)", "mypy-boto3-codeguru-reviewer (>=1.28.0,<1.29.0)", "mypy-boto3-codeguru-security (>=1.28.0,<1.29.0)", "mypy-boto3-codeguruprofiler (>=1.28.0,<1.29.0)", "mypy-boto3-codepipeline (>=1.28.0,<1.29.0)", "mypy-boto3-codestar (>=1.28.0,<1.29.0)", "mypy-boto3-codestar-connections (>=1.28.0,<1.29.0)", "mypy-boto3-codestar-notifications (>=1.28.0,<1.29.0)", "mypy-boto3-cognito-identity (>=1.28.0,<1.29.0)", "mypy-boto3-cognito-idp (>=1.28.0,<1.29.0)", "mypy-boto3-cognito-sync 
(>=1.28.0,<1.29.0)", "mypy-boto3-comprehend (>=1.28.0,<1.29.0)", "mypy-boto3-comprehendmedical (>=1.28.0,<1.29.0)", "mypy-boto3-compute-optimizer (>=1.28.0,<1.29.0)", "mypy-boto3-config (>=1.28.0,<1.29.0)", "mypy-boto3-connect (>=1.28.0,<1.29.0)", "mypy-boto3-connect-contact-lens (>=1.28.0,<1.29.0)", "mypy-boto3-connectcampaigns (>=1.28.0,<1.29.0)", "mypy-boto3-connectcases (>=1.28.0,<1.29.0)", "mypy-boto3-connectparticipant (>=1.28.0,<1.29.0)", "mypy-boto3-controltower (>=1.28.0,<1.29.0)", "mypy-boto3-cur (>=1.28.0,<1.29.0)", "mypy-boto3-customer-profiles (>=1.28.0,<1.29.0)", "mypy-boto3-databrew (>=1.28.0,<1.29.0)", "mypy-boto3-dataexchange (>=1.28.0,<1.29.0)", "mypy-boto3-datapipeline (>=1.28.0,<1.29.0)", "mypy-boto3-datasync (>=1.28.0,<1.29.0)", "mypy-boto3-dax (>=1.28.0,<1.29.0)", "mypy-boto3-detective (>=1.28.0,<1.29.0)", "mypy-boto3-devicefarm (>=1.28.0,<1.29.0)", "mypy-boto3-devops-guru (>=1.28.0,<1.29.0)", "mypy-boto3-directconnect (>=1.28.0,<1.29.0)", "mypy-boto3-discovery (>=1.28.0,<1.29.0)", "mypy-boto3-dlm (>=1.28.0,<1.29.0)", "mypy-boto3-dms (>=1.28.0,<1.29.0)", "mypy-boto3-docdb (>=1.28.0,<1.29.0)", "mypy-boto3-docdb-elastic (>=1.28.0,<1.29.0)", "mypy-boto3-drs (>=1.28.0,<1.29.0)", "mypy-boto3-ds (>=1.28.0,<1.29.0)", "mypy-boto3-dynamodb (>=1.28.0,<1.29.0)", "mypy-boto3-dynamodbstreams (>=1.28.0,<1.29.0)", "mypy-boto3-ebs (>=1.28.0,<1.29.0)", "mypy-boto3-ec2 (>=1.28.0,<1.29.0)", "mypy-boto3-ec2-instance-connect (>=1.28.0,<1.29.0)", "mypy-boto3-ecr (>=1.28.0,<1.29.0)", "mypy-boto3-ecr-public (>=1.28.0,<1.29.0)", "mypy-boto3-ecs (>=1.28.0,<1.29.0)", "mypy-boto3-efs (>=1.28.0,<1.29.0)", "mypy-boto3-eks (>=1.28.0,<1.29.0)", "mypy-boto3-elastic-inference (>=1.28.0,<1.29.0)", "mypy-boto3-elasticache (>=1.28.0,<1.29.0)", "mypy-boto3-elasticbeanstalk (>=1.28.0,<1.29.0)", "mypy-boto3-elastictranscoder (>=1.28.0,<1.29.0)", "mypy-boto3-elb (>=1.28.0,<1.29.0)", "mypy-boto3-elbv2 (>=1.28.0,<1.29.0)", "mypy-boto3-emr (>=1.28.0,<1.29.0)", "mypy-boto3-emr-containers (>=1.28.0,<1.29.0)", "mypy-boto3-emr-serverless (>=1.28.0,<1.29.0)", "mypy-boto3-entityresolution (>=1.28.0,<1.29.0)", "mypy-boto3-es (>=1.28.0,<1.29.0)", "mypy-boto3-events (>=1.28.0,<1.29.0)", "mypy-boto3-evidently (>=1.28.0,<1.29.0)", "mypy-boto3-finspace (>=1.28.0,<1.29.0)", "mypy-boto3-finspace-data (>=1.28.0,<1.29.0)", "mypy-boto3-firehose (>=1.28.0,<1.29.0)", "mypy-boto3-fis (>=1.28.0,<1.29.0)", "mypy-boto3-fms (>=1.28.0,<1.29.0)", "mypy-boto3-forecast (>=1.28.0,<1.29.0)", "mypy-boto3-forecastquery (>=1.28.0,<1.29.0)", "mypy-boto3-frauddetector (>=1.28.0,<1.29.0)", "mypy-boto3-fsx (>=1.28.0,<1.29.0)", "mypy-boto3-gamelift (>=1.28.0,<1.29.0)", "mypy-boto3-gamesparks (>=1.28.0,<1.29.0)", "mypy-boto3-glacier (>=1.28.0,<1.29.0)", "mypy-boto3-globalaccelerator (>=1.28.0,<1.29.0)", "mypy-boto3-glue (>=1.28.0,<1.29.0)", "mypy-boto3-grafana (>=1.28.0,<1.29.0)", "mypy-boto3-greengrass (>=1.28.0,<1.29.0)", "mypy-boto3-greengrassv2 (>=1.28.0,<1.29.0)", "mypy-boto3-groundstation (>=1.28.0,<1.29.0)", "mypy-boto3-guardduty (>=1.28.0,<1.29.0)", "mypy-boto3-health (>=1.28.0,<1.29.0)", "mypy-boto3-healthlake (>=1.28.0,<1.29.0)", "mypy-boto3-honeycode (>=1.28.0,<1.29.0)", "mypy-boto3-iam (>=1.28.0,<1.29.0)", "mypy-boto3-identitystore (>=1.28.0,<1.29.0)", "mypy-boto3-imagebuilder (>=1.28.0,<1.29.0)", "mypy-boto3-importexport (>=1.28.0,<1.29.0)", "mypy-boto3-inspector (>=1.28.0,<1.29.0)", "mypy-boto3-inspector2 (>=1.28.0,<1.29.0)", "mypy-boto3-internetmonitor (>=1.28.0,<1.29.0)", "mypy-boto3-iot (>=1.28.0,<1.29.0)", "mypy-boto3-iot-data 
(>=1.28.0,<1.29.0)", "mypy-boto3-iot-jobs-data (>=1.28.0,<1.29.0)", "mypy-boto3-iot-roborunner (>=1.28.0,<1.29.0)", "mypy-boto3-iot1click-devices (>=1.28.0,<1.29.0)", "mypy-boto3-iot1click-projects (>=1.28.0,<1.29.0)", "mypy-boto3-iotanalytics (>=1.28.0,<1.29.0)", "mypy-boto3-iotdeviceadvisor (>=1.28.0,<1.29.0)", "mypy-boto3-iotevents (>=1.28.0,<1.29.0)", "mypy-boto3-iotevents-data (>=1.28.0,<1.29.0)", "mypy-boto3-iotfleethub (>=1.28.0,<1.29.0)", "mypy-boto3-iotfleetwise (>=1.28.0,<1.29.0)", "mypy-boto3-iotsecuretunneling (>=1.28.0,<1.29.0)", "mypy-boto3-iotsitewise (>=1.28.0,<1.29.0)", "mypy-boto3-iotthingsgraph (>=1.28.0,<1.29.0)", "mypy-boto3-iottwinmaker (>=1.28.0,<1.29.0)", "mypy-boto3-iotwireless (>=1.28.0,<1.29.0)", "mypy-boto3-ivs (>=1.28.0,<1.29.0)", "mypy-boto3-ivs-realtime (>=1.28.0,<1.29.0)", "mypy-boto3-ivschat (>=1.28.0,<1.29.0)", "mypy-boto3-kafka (>=1.28.0,<1.29.0)", "mypy-boto3-kafkaconnect (>=1.28.0,<1.29.0)", "mypy-boto3-kendra (>=1.28.0,<1.29.0)", "mypy-boto3-kendra-ranking (>=1.28.0,<1.29.0)", "mypy-boto3-keyspaces (>=1.28.0,<1.29.0)", "mypy-boto3-kinesis (>=1.28.0,<1.29.0)", "mypy-boto3-kinesis-video-archived-media (>=1.28.0,<1.29.0)", "mypy-boto3-kinesis-video-media (>=1.28.0,<1.29.0)", "mypy-boto3-kinesis-video-signaling (>=1.28.0,<1.29.0)", "mypy-boto3-kinesis-video-webrtc-storage (>=1.28.0,<1.29.0)", "mypy-boto3-kinesisanalytics (>=1.28.0,<1.29.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.28.0,<1.29.0)", "mypy-boto3-kinesisvideo (>=1.28.0,<1.29.0)", "mypy-boto3-kms (>=1.28.0,<1.29.0)", "mypy-boto3-lakeformation (>=1.28.0,<1.29.0)", "mypy-boto3-lambda (>=1.28.0,<1.29.0)", "mypy-boto3-lex-models (>=1.28.0,<1.29.0)", "mypy-boto3-lex-runtime (>=1.28.0,<1.29.0)", "mypy-boto3-lexv2-models (>=1.28.0,<1.29.0)", "mypy-boto3-lexv2-runtime (>=1.28.0,<1.29.0)", "mypy-boto3-license-manager (>=1.28.0,<1.29.0)", "mypy-boto3-license-manager-linux-subscriptions (>=1.28.0,<1.29.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.28.0,<1.29.0)", "mypy-boto3-lightsail (>=1.28.0,<1.29.0)", "mypy-boto3-location (>=1.28.0,<1.29.0)", "mypy-boto3-logs (>=1.28.0,<1.29.0)", "mypy-boto3-lookoutequipment (>=1.28.0,<1.29.0)", "mypy-boto3-lookoutmetrics (>=1.28.0,<1.29.0)", "mypy-boto3-lookoutvision (>=1.28.0,<1.29.0)", "mypy-boto3-m2 (>=1.28.0,<1.29.0)", "mypy-boto3-machinelearning (>=1.28.0,<1.29.0)", "mypy-boto3-macie (>=1.28.0,<1.29.0)", "mypy-boto3-macie2 (>=1.28.0,<1.29.0)", "mypy-boto3-managedblockchain (>=1.28.0,<1.29.0)", "mypy-boto3-managedblockchain-query (>=1.28.0,<1.29.0)", "mypy-boto3-marketplace-catalog (>=1.28.0,<1.29.0)", "mypy-boto3-marketplace-entitlement (>=1.28.0,<1.29.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.28.0,<1.29.0)", "mypy-boto3-mediaconnect (>=1.28.0,<1.29.0)", "mypy-boto3-mediaconvert (>=1.28.0,<1.29.0)", "mypy-boto3-medialive (>=1.28.0,<1.29.0)", "mypy-boto3-mediapackage (>=1.28.0,<1.29.0)", "mypy-boto3-mediapackage-vod (>=1.28.0,<1.29.0)", "mypy-boto3-mediapackagev2 (>=1.28.0,<1.29.0)", "mypy-boto3-mediastore (>=1.28.0,<1.29.0)", "mypy-boto3-mediastore-data (>=1.28.0,<1.29.0)", "mypy-boto3-mediatailor (>=1.28.0,<1.29.0)", "mypy-boto3-medical-imaging (>=1.28.0,<1.29.0)", "mypy-boto3-memorydb (>=1.28.0,<1.29.0)", "mypy-boto3-meteringmarketplace (>=1.28.0,<1.29.0)", "mypy-boto3-mgh (>=1.28.0,<1.29.0)", "mypy-boto3-mgn (>=1.28.0,<1.29.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.28.0,<1.29.0)", "mypy-boto3-migrationhub-config (>=1.28.0,<1.29.0)", "mypy-boto3-migrationhuborchestrator (>=1.28.0,<1.29.0)", "mypy-boto3-migrationhubstrategy 
(>=1.28.0,<1.29.0)", "mypy-boto3-mobile (>=1.28.0,<1.29.0)", "mypy-boto3-mq (>=1.28.0,<1.29.0)", "mypy-boto3-mturk (>=1.28.0,<1.29.0)", "mypy-boto3-mwaa (>=1.28.0,<1.29.0)", "mypy-boto3-neptune (>=1.28.0,<1.29.0)", "mypy-boto3-neptunedata (>=1.28.0,<1.29.0)", "mypy-boto3-network-firewall (>=1.28.0,<1.29.0)", "mypy-boto3-networkmanager (>=1.28.0,<1.29.0)", "mypy-boto3-nimble (>=1.28.0,<1.29.0)", "mypy-boto3-oam (>=1.28.0,<1.29.0)", "mypy-boto3-omics (>=1.28.0,<1.29.0)", "mypy-boto3-opensearch (>=1.28.0,<1.29.0)", "mypy-boto3-opensearchserverless (>=1.28.0,<1.29.0)", "mypy-boto3-opsworks (>=1.28.0,<1.29.0)", "mypy-boto3-opsworkscm (>=1.28.0,<1.29.0)", "mypy-boto3-organizations (>=1.28.0,<1.29.0)", "mypy-boto3-osis (>=1.28.0,<1.29.0)", "mypy-boto3-outposts (>=1.28.0,<1.29.0)", "mypy-boto3-panorama (>=1.28.0,<1.29.0)", "mypy-boto3-payment-cryptography (>=1.28.0,<1.29.0)", "mypy-boto3-payment-cryptography-data (>=1.28.0,<1.29.0)", "mypy-boto3-pca-connector-ad (>=1.28.0,<1.29.0)", "mypy-boto3-personalize (>=1.28.0,<1.29.0)", "mypy-boto3-personalize-events (>=1.28.0,<1.29.0)", "mypy-boto3-personalize-runtime (>=1.28.0,<1.29.0)", "mypy-boto3-pi (>=1.28.0,<1.29.0)", "mypy-boto3-pinpoint (>=1.28.0,<1.29.0)", "mypy-boto3-pinpoint-email (>=1.28.0,<1.29.0)", "mypy-boto3-pinpoint-sms-voice (>=1.28.0,<1.29.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.28.0,<1.29.0)", "mypy-boto3-pipes (>=1.28.0,<1.29.0)", "mypy-boto3-polly (>=1.28.0,<1.29.0)", "mypy-boto3-pricing (>=1.28.0,<1.29.0)", "mypy-boto3-privatenetworks (>=1.28.0,<1.29.0)", "mypy-boto3-proton (>=1.28.0,<1.29.0)", "mypy-boto3-qldb (>=1.28.0,<1.29.0)", "mypy-boto3-qldb-session (>=1.28.0,<1.29.0)", "mypy-boto3-quicksight (>=1.28.0,<1.29.0)", "mypy-boto3-ram (>=1.28.0,<1.29.0)", "mypy-boto3-rbin (>=1.28.0,<1.29.0)", "mypy-boto3-rds (>=1.28.0,<1.29.0)", "mypy-boto3-rds-data (>=1.28.0,<1.29.0)", "mypy-boto3-redshift (>=1.28.0,<1.29.0)", "mypy-boto3-redshift-data (>=1.28.0,<1.29.0)", "mypy-boto3-redshift-serverless (>=1.28.0,<1.29.0)", "mypy-boto3-rekognition (>=1.28.0,<1.29.0)", "mypy-boto3-resiliencehub (>=1.28.0,<1.29.0)", "mypy-boto3-resource-explorer-2 (>=1.28.0,<1.29.0)", "mypy-boto3-resource-groups (>=1.28.0,<1.29.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.28.0,<1.29.0)", "mypy-boto3-robomaker (>=1.28.0,<1.29.0)", "mypy-boto3-rolesanywhere (>=1.28.0,<1.29.0)", "mypy-boto3-route53 (>=1.28.0,<1.29.0)", "mypy-boto3-route53-recovery-cluster (>=1.28.0,<1.29.0)", "mypy-boto3-route53-recovery-control-config (>=1.28.0,<1.29.0)", "mypy-boto3-route53-recovery-readiness (>=1.28.0,<1.29.0)", "mypy-boto3-route53domains (>=1.28.0,<1.29.0)", "mypy-boto3-route53resolver (>=1.28.0,<1.29.0)", "mypy-boto3-rum (>=1.28.0,<1.29.0)", "mypy-boto3-s3 (>=1.28.0,<1.29.0)", "mypy-boto3-s3control (>=1.28.0,<1.29.0)", "mypy-boto3-s3outposts (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker-edge (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker-geospatial (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker-metrics (>=1.28.0,<1.29.0)", "mypy-boto3-sagemaker-runtime (>=1.28.0,<1.29.0)", "mypy-boto3-savingsplans (>=1.28.0,<1.29.0)", "mypy-boto3-scheduler (>=1.28.0,<1.29.0)", "mypy-boto3-schemas (>=1.28.0,<1.29.0)", "mypy-boto3-sdb (>=1.28.0,<1.29.0)", "mypy-boto3-secretsmanager (>=1.28.0,<1.29.0)", "mypy-boto3-securityhub (>=1.28.0,<1.29.0)", "mypy-boto3-securitylake (>=1.28.0,<1.29.0)", "mypy-boto3-serverlessrepo (>=1.28.0,<1.29.0)", 
"mypy-boto3-service-quotas (>=1.28.0,<1.29.0)", "mypy-boto3-servicecatalog (>=1.28.0,<1.29.0)", "mypy-boto3-servicecatalog-appregistry (>=1.28.0,<1.29.0)", "mypy-boto3-servicediscovery (>=1.28.0,<1.29.0)", "mypy-boto3-ses (>=1.28.0,<1.29.0)", "mypy-boto3-sesv2 (>=1.28.0,<1.29.0)", "mypy-boto3-shield (>=1.28.0,<1.29.0)", "mypy-boto3-signer (>=1.28.0,<1.29.0)", "mypy-boto3-simspaceweaver (>=1.28.0,<1.29.0)", "mypy-boto3-sms (>=1.28.0,<1.29.0)", "mypy-boto3-sms-voice (>=1.28.0,<1.29.0)", "mypy-boto3-snow-device-management (>=1.28.0,<1.29.0)", "mypy-boto3-snowball (>=1.28.0,<1.29.0)", "mypy-boto3-sns (>=1.28.0,<1.29.0)", "mypy-boto3-sqs (>=1.28.0,<1.29.0)", "mypy-boto3-ssm (>=1.28.0,<1.29.0)", "mypy-boto3-ssm-contacts (>=1.28.0,<1.29.0)", "mypy-boto3-ssm-incidents (>=1.28.0,<1.29.0)", "mypy-boto3-ssm-sap (>=1.28.0,<1.29.0)", "mypy-boto3-sso (>=1.28.0,<1.29.0)", "mypy-boto3-sso-admin (>=1.28.0,<1.29.0)", "mypy-boto3-sso-oidc (>=1.28.0,<1.29.0)", "mypy-boto3-stepfunctions (>=1.28.0,<1.29.0)", "mypy-boto3-storagegateway (>=1.28.0,<1.29.0)", "mypy-boto3-sts (>=1.28.0,<1.29.0)", "mypy-boto3-support (>=1.28.0,<1.29.0)", "mypy-boto3-support-app (>=1.28.0,<1.29.0)", "mypy-boto3-swf (>=1.28.0,<1.29.0)", "mypy-boto3-synthetics (>=1.28.0,<1.29.0)", "mypy-boto3-textract (>=1.28.0,<1.29.0)", "mypy-boto3-timestream-query (>=1.28.0,<1.29.0)", "mypy-boto3-timestream-write (>=1.28.0,<1.29.0)", "mypy-boto3-tnb (>=1.28.0,<1.29.0)", "mypy-boto3-transcribe (>=1.28.0,<1.29.0)", "mypy-boto3-transfer (>=1.28.0,<1.29.0)", "mypy-boto3-translate (>=1.28.0,<1.29.0)", "mypy-boto3-verifiedpermissions (>=1.28.0,<1.29.0)", "mypy-boto3-voice-id (>=1.28.0,<1.29.0)", "mypy-boto3-vpc-lattice (>=1.28.0,<1.29.0)", "mypy-boto3-waf (>=1.28.0,<1.29.0)", "mypy-boto3-waf-regional (>=1.28.0,<1.29.0)", "mypy-boto3-wafv2 (>=1.28.0,<1.29.0)", "mypy-boto3-wellarchitected (>=1.28.0,<1.29.0)", "mypy-boto3-wisdom (>=1.28.0,<1.29.0)", "mypy-boto3-workdocs (>=1.28.0,<1.29.0)", "mypy-boto3-worklink (>=1.28.0,<1.29.0)", "mypy-boto3-workmail (>=1.28.0,<1.29.0)", "mypy-boto3-workmailmessageflow (>=1.28.0,<1.29.0)", "mypy-boto3-workspaces (>=1.28.0,<1.29.0)", "mypy-boto3-workspaces-web (>=1.28.0,<1.29.0)", "mypy-boto3-xray (>=1.28.0,<1.29.0)"] amp = ["mypy-boto3-amp (>=1.28.0,<1.29.0)"] amplify = ["mypy-boto3-amplify (>=1.28.0,<1.29.0)"] amplifybackend = ["mypy-boto3-amplifybackend (>=1.28.0,<1.29.0)"] @@ -113,8 +113,10 @@ backup = ["mypy-boto3-backup (>=1.28.0,<1.29.0)"] backup-gateway = ["mypy-boto3-backup-gateway (>=1.28.0,<1.29.0)"] backupstorage = ["mypy-boto3-backupstorage (>=1.28.0,<1.29.0)"] batch = ["mypy-boto3-batch (>=1.28.0,<1.29.0)"] +bedrock = ["mypy-boto3-bedrock (>=1.28.0,<1.29.0)"] +bedrock-runtime = ["mypy-boto3-bedrock-runtime (>=1.28.0,<1.29.0)"] billingconductor = ["mypy-boto3-billingconductor (>=1.28.0,<1.29.0)"] -boto3 = ["boto3 (==1.28.56)", "botocore (==1.31.56)"] +boto3 = ["boto3 (==1.28.57)", "botocore (==1.31.57)"] braket = ["mypy-boto3-braket (>=1.28.0,<1.29.0)"] budgets = ["mypy-boto3-budgets (>=1.28.0,<1.29.0)"] ce = ["mypy-boto3-ce (>=1.28.0,<1.29.0)"] @@ -440,13 +442,13 @@ xray = ["mypy-boto3-xray (>=1.28.0,<1.29.0)"] [[package]] name = "botocore" -version = "1.31.56" +version = "1.31.57" description = "Low-level, data-driven core of boto 3." 
optional = false python-versions = ">= 3.7" files = [ - {file = "botocore-1.31.56-py3-none-any.whl", hash = "sha256:66c686e4eda7051ffcc9357d9075390c8ab2f95a2977669039618ee186fb533b"}, - {file = "botocore-1.31.56.tar.gz", hash = "sha256:70252cd8abc2fe9b791328e187620f5a3911545e2520486b01ecfad31f41b9cb"}, + {file = "botocore-1.31.57-py3-none-any.whl", hash = "sha256:af006248276ff8e19e3ec7214478f6257035eb40aed865e405486500471ae71b"}, + {file = "botocore-1.31.57.tar.gz", hash = "sha256:301436174635bec739b225b840fc365ca00e5c1a63e5b2a19ee679d204e01b78"}, ] [package.dependencies] @@ -459,13 +461,13 @@ crt = ["awscrt (==0.16.26)"] [[package]] name = "botocore-stubs" -version = "1.31.56" +version = "1.31.57" description = "Type annotations and code completion for botocore" optional = false python-versions = ">=3.7,<4.0" files = [ - {file = "botocore_stubs-1.31.56-py3-none-any.whl", hash = "sha256:e8f7273f57c09ef47f8bdc89765b151f43a46b3c29c52fb58e9982115ed14d84"}, - {file = "botocore_stubs-1.31.56.tar.gz", hash = "sha256:e1510bd361acf755ecace42ee984281adfea6f96695ed22a08a4ceeabfe66f5f"}, + {file = "botocore_stubs-1.31.57-py3-none-any.whl", hash = "sha256:bce87eb261f6ad9a43ae1d6946fa48582a99685642a9edb9a56b50ac113b3177"}, + {file = "botocore_stubs-1.31.57.tar.gz", hash = "sha256:e28f3ca7a6279f01dcec4663980be80aa93dc4eb2efcc2396859ceb647623b83"}, ] [package.dependencies] @@ -485,75 +487,63 @@ files = [ [[package]] name = "cffi" -version = "1.15.1" +version = "1.16.0" description = "Foreign Function Interface for Python calling C code." optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, - {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, - {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, - {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, - {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, - {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, - {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, - {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, - {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, - {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = 
"sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, - {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, - {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, - {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, - {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, - {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, - {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, - {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, - {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, - {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, + {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"}, + {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7"}, + {file = "cffi-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614"}, + {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743"}, + {file = "cffi-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d"}, + {file = "cffi-1.16.0-cp310-cp310-win32.whl", hash = "sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a"}, + {file = "cffi-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1"}, + {file = "cffi-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404"}, + {file = "cffi-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56"}, + {file = "cffi-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e"}, + {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc"}, + {file = "cffi-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb"}, + {file = "cffi-1.16.0-cp311-cp311-win32.whl", hash = "sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab"}, + {file = "cffi-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba"}, + {file = "cffi-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956"}, + {file = "cffi-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6"}, + {file = "cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969"}, + {file = "cffi-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520"}, + {file = "cffi-1.16.0-cp312-cp312-win32.whl", hash = "sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b"}, + {file = "cffi-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235"}, + {file = "cffi-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b"}, + {file = "cffi-1.16.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324"}, + {file = "cffi-1.16.0-cp38-cp38-win32.whl", hash = "sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a"}, + {file = "cffi-1.16.0-cp38-cp38-win_amd64.whl", hash = 
"sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36"}, + {file = "cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed"}, + {file = "cffi-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4"}, + {file = "cffi-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098"}, + {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000"}, + {file = "cffi-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe"}, + {file = "cffi-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4"}, + {file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"}, + {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"}, ] [package.dependencies] @@ -917,7 +907,6 @@ files = [ name = "jsonc-parser" version = "1.1.5" description = "A lightweight, native tool for parsing .jsonc files" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -1554,13 +1543,13 @@ testing = ["coverage (>=5.0)", "pytest", "pytest-cover"] [[package]] name = "wcwidth" -version = "0.2.6" +version = "0.2.7" description = "Measures the displayed width of unicode strings in a terminal" optional = false python-versions = "*" files = [ - {file = "wcwidth-0.2.6-py2.py3-none-any.whl", hash = "sha256:795b138f6875577cd91bba52baf9e445cd5118fd32723b460e30a0af30ea230e"}, - {file = "wcwidth-0.2.6.tar.gz", hash = "sha256:a5220780a404dbe3353789870978e472cfe477761f06ee55077256e509b156d0"}, + {file = "wcwidth-0.2.7-py2.py3-none-any.whl", hash = "sha256:fabf3e32999d9b0dab7d19d845149f326f04fe29bac67709ee071dbd92640a36"}, + {file = "wcwidth-0.2.7.tar.gz", hash = "sha256:1b6d30a98ddd5ce9bbdb33658191fd2423fc9da203fe3ef1855407dcb7ee4e26"}, ] [[package]] @@ -1618,4 +1607,4 @@ tests = ["PasteDeploy", "WSGIProxy2", "coverage", "mock", "nose (<1.3.0)", "pyqu [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.12" -content-hash = "6cdcf4e9fab04bbcaa83c3054b61356b4830b55c67e2a362dd1af635ae75a688" +content-hash = "cece532417080e22830d01f30a6589df2f8e94465896e57cd14e77ae91f89b4a" diff --git a/pyproject.toml b/pyproject.toml index 96a3fe66a..0b5f9d7fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,8 +38,8 @@ classifiers = [ [tool.poetry.dependencies] python = ">=3.8,<3.12" -boto3 = "^1.28.54" -botocore = "^1.31.54" +boto3 = "^1.28.57" +botocore = "^1.31.57" # The DCIC portals (cgap-portal 
and fourfront) are very particular about which ElasticSearch version. # This value is intentionally pinned and must not be changed casually. elasticsearch = "7.13.4" @@ -64,8 +64,8 @@ tqdm = "^4.65.0" [tool.poetry.dev-dependencies] -boto3-stubs = "^1.28.54" -botocore-stubs = "^1.31.54" +boto3-stubs = "^1.28.57" +botocore-stubs = "^1.31.57" coverage = ">=7.2.3" # Loaded manually in GA workflow for coverage because a dependency on 2to3 # in its docopts dependency makes a problem for laoding it here in poetry. -kmp 7-Apr-2023 From 3d25f5616e39647c90b791fb1e73ea306feedf23 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Sat, 14 Oct 2023 18:26:24 -0400 Subject: [PATCH 074/101] WIP --- dcicutils/bundle_utils.py | 641 ++++++++------ dcicutils/lang_utils.py | 4 +- dcicutils/misc_utils.py | 7 + dcicutils/sheet_utils.py | 31 +- .../sample_inserts/institution.json | 1 + test/data_files/sample_inserts/project.json | 1 + test/data_files/sample_inserts/user.json | 1 + .../sample_schemas/institution.json | 340 ++++++++ test/data_files/sample_schemas/project.json | 592 +++++++++++++ test/data_files/sample_schemas/user.json | 817 ++++++++++++++++++ test/test_bundle_utils.py | 480 +++++----- 11 files changed, 2430 insertions(+), 485 deletions(-) create mode 100644 test/data_files/sample_inserts/institution.json create mode 100644 test/data_files/sample_inserts/project.json create mode 100644 test/data_files/sample_inserts/user.json create mode 100644 test/data_files/sample_schemas/institution.json create mode 100644 test/data_files/sample_schemas/project.json create mode 100644 test/data_files/sample_schemas/user.json diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 2462dedb6..9cac46157 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -1,44 +1,67 @@ import contextlib import copy # import os -import uuid +# import uuid +# import warnings -from typing import Any, Dict, List, Optional, Type, Union +from typing import Any, Dict, List, Optional, Union # , Type from .common import AnyJsonData # , Regexp, CsvReader from .env_utils import EnvUtils, public_env_name -from .ff_utils import get_schema +from .ff_utils import get_schema, get_metadata from .lang_utils import there_are -from .misc_utils import AbstractVirtualApp, ignored, PRINT +from .misc_utils import AbstractVirtualApp, ignored, PRINT, to_camel_case from .sheet_utils import ( - Header, Headers, ParsedHeader, ParsedHeaders, SheetCellValue, SheetRow, TabbedSheetData, # SheetData, - prefer_number, - LoadTableError, - TableSetManagerRegistry, AbstractTableSetManager, BasicTableSetManager, - CsvManager, TsvManager, XlsxManager, - SimpleJsonInsertsManager, SimpleYamlInsertsManager, SimpleJsonLinesInsertsManager, - TabbedJsonInsertsManager, TabbedYamlInsertsManager, - InsertsDirectoryManager, + TabbedSchemas, LoadTableError, prefer_number, + Header, Headers, TabbedHeaders, + ParsedHeader, ParsedHeaders, TabbedParsedHeaders, + SheetCellValue, TabbedSheetData, # SheetRow, SheetData, + TableSetManagerRegistry, AbstractTableSetManager, # BasicTableSetManager, + # CsvManager, TsvManager, XlsxManager, + # SimpleJsonInsertsManager, SimpleYamlInsertsManager, SimpleJsonLinesInsertsManager, + # TabbedJsonInsertsManager, TabbedYamlInsertsManager, + # InsertsDirectoryManager, + load_table_set ) from .task_utils import pmap -@contextlib.contextmanager -def deferred_problems(): - problems = [] +# @contextlib.contextmanager +# def deferred_problems(): +# problems = [] +# +# def note_problem(problem): +# problems.append(problem) 
+# +# yield note_problem +# +# if problems: +# for problem in problems: +# PRINT(f"Problem: {problem}") +# raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False)) - def note_problems(problem): - problems.append(problem) - yield note_problems +class TypeHintContext: - if problems: - for problem in problems: - PRINT(f"Problem: {problem}") - raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False)) + @classmethod + def schema_exists(cls, schema_name: str) -> bool: # noQA - PyCharm complains wrongly about return value + ignored(schema_name) + raise NotImplementedError(f"{cls.__name__}.schema_exists(...) is not implemented.") + @classmethod + def validate_ref(cls, item_type: str, item_ref: str) -> str: # noQA - PyCharm complains wrongly about return value + ignored(item_type, item_ref) + raise NotImplementedError(f"{cls.__name__}.validate_ref(...) is not implemented.") -class AbstractItemManager(AbstractTableSetManager): + @classmethod + def note_problem(cls, problem: str): + ignored(problem) + raise NotImplementedError(f"{cls.__name__}.note_problem(...) is not implemented.") + def __str__(self): + return f"<{self.__class__.__name__} {id(self)}>" + + +class ValidationProblem(Exception): pass @@ -55,15 +78,37 @@ def __repr__(self): class BoolHint(TypeHint): + # We could use other ways to do this, such as initial substring, but this is more likely to be right. + # Then again, we might want to consder accepting athers like 'yes/no', 'y/n', 'on/off', '1/0'. + TRUE_VALUES = ['true', 't'] + FALSE_VALUES = ['false', 'f'] + def apply_hint(self, value): if isinstance(value, str) and value: - if 'true'.startswith(value.lower()): + l_value = value.lower() + if l_value in self.TRUE_VALUES: return True - elif 'false'.startswith(value.lower()): + elif l_value in self.FALSE_VALUES: return False return super().apply_hint(value) +class NumHint(TypeHint): + + PREFERENCE_MAP = {'number': 'num', 'integer': 'int', 'float': 'float'} + + def __init__(self, declared_type): + self.preferred_type = self.PREFERENCE_MAP.get(declared_type) + + def apply_hint(self, value): + if isinstance(value, str) and value: + if self.preferred_type: + return prefer_number(value, kind=self.preferred_type) + else: + return value + return super().apply_hint(value) + + class EnumHint(TypeHint): def __str__(self): @@ -90,9 +135,29 @@ def apply_hint(self, value): return super().apply_hint(value) +class RefHint(TypeHint): + + def __str__(self): + return f"" + + def __init__(self, schema_name: str, context: TypeHintContext): + self.schema_name = schema_name + self.context = context + + def apply_hint(self, value): + if not self.context.validate_ref(item_type=self.schema_name, item_ref=value): + raise ValidationProblem(f"Unable to validate {self.schema_name} reference: {value!r}") + return value + + OptionalTypeHints = List[Optional[TypeHint]] +class AbstractStructureManager(AbstractTableSetManager): + + pass + + class ItemTools: """ Implements operations on table-related data without pre-supposing the specific representation of the table. 
@@ -131,7 +196,7 @@ def parse_sheet_header(cls, header: Header) -> ParsedHeader: return result @classmethod - def parse_sheet_headers(cls, headers: Headers): + def parse_sheet_headers(cls, headers: Headers) -> ParsedHeaders: return [cls.parse_sheet_header(header) for header in headers] @@ -171,7 +236,7 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: Parsed INSTAGUIDS_ENABLED = False # Experimental feature not enabled by default @classmethod - def parse_item_value(cls, value: SheetCellValue, context=None) -> AnyJsonData: + def parse_item_value(cls, value: SheetCellValue) -> AnyJsonData: # TODO: Remodularize this for easier testing and more Schema-driven effect # Doug asks that this be broken up into different mechanisms, more modular and separately testable. # I pretty much agree with that. I'm just waiting for suggestions on what kinds of features are desired. @@ -190,27 +255,15 @@ def parse_item_value(cls, value: SheetCellValue, context=None) -> AnyJsonData: else: if value.endswith("|"): # Use 'foo|' for ['foo'] value = value[:-1] - return [cls.parse_item_value(subvalue, context=context) for subvalue in value.split('|')] - elif cls.INSTAGUIDS_ENABLED and context is not None and value.startswith('#'): - # Note that this clause MUST follow '|' clause above so '#foo|#bar' isn't seen as instaguid - return cls.get_instaguid(value, context=context) + return [cls.parse_item_value(subvalue) for subvalue in value.split('|')] else: # Doug points out that the schema might not agree, might want a string representation of a number. # At this semantic layer, this might be a bad choice. - return prefer_number(value) + # return prefer_number(value) + return value else: # presumably a number (int or float) return value - @classmethod - def get_instaguid(cls, guid_placeholder: str, *, context: Optional[Dict] = None): - if context is None: - return guid_placeholder - else: - referent = context.get(guid_placeholder) - if not referent: - context[guid_placeholder] = referent = str(uuid.uuid4()) - return referent - @classmethod def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any, force: bool = False): if (value is None or value == '') and not force: @@ -222,7 +275,7 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any cls.set_path_value(datum[key], more_path, value) @classmethod - def find_type_hint(cls, parsed_header: Optional[ParsedHeader], schema: Any): + def find_type_hint(cls, parsed_header: Optional[ParsedHeader], schema: Any, context: Optional[TypeHintContext] = None): def finder(subheader, subschema): if not parsed_header: @@ -240,6 +293,11 @@ def finder(subheader, subschema): if enum: mapping = {e.lower(): e for e in enum} return EnumHint(mapping) + link_to = def1.get('linkTo') + if link_to and context.schema_exists(link_to): + return RefHint(schema_name=link_to, context=context) + elif t in ('integer', 'float', 'number'): + return NumHint(declared_type=t) elif t == 'boolean': return BoolHint() else: @@ -252,51 +310,53 @@ def finder(subheader, subschema): return finder(subheader=parsed_header, subschema=schema) -ITEM_MANAGER_REGISTRY = TableSetManagerRegistry() - - -class SchemaAutoloadMixin(AbstractTableSetManager): +class SchemaManager: SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. 
- CACHE_SCHEMAS = True # Controls whether we're doing caching at all - AUTOLOAD_SCHEMAS_DEFAULT = True - - def __init__(self, filename: str, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, - portal_vapp: Optional[AbstractVirtualApp] = None, **kwargs): - # This setup must be in place before the class initialization is done (via the super call). - self.autoload_schemas: bool = self.AUTOLOAD_SCHEMAS_DEFAULT if autoload_schemas is None else autoload_schemas - if self.autoload_schemas: # If autoload_schemas is False, we don't care about doing this defaulting. - if portal_env is None and portal_vapp is None: - portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) - PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.") - self.portal_env: Optional[str] = portal_env - self.portal_vapp: Optional[AbstractVirtualApp] = portal_vapp - super().__init__(filename=filename, **kwargs) - - def fetch_relevant_schemas(self, schema_names: List[str]): + + @classmethod + @contextlib.contextmanager + def fresh_schema_manager_context_for_testing(cls): + old_schema_cache = cls.SCHEMA_CACHE + try: + cls.SCHEMA_CACHE = {} + yield + finally: + cls.SCHEMA_CACHE = old_schema_cache + + def __init__(self, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): + if portal_env is None and portal_vapp is None: + portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) + PRINT(f"The portal_env was not explicitly supplied. Schemas will come from portal_env={portal_env!r}.") + self.portal_env = portal_env + self.portal_vapp = portal_vapp + + def fetch_relevant_schemas(self, schema_names: List[str], schemas: Optional[TabbedSchemas] = None): + if schemas is None: + schemas = {} # The schema_names argument is not normally given, but it is there for easier testing def fetch_schema(schema_name): - schema = self.fetch_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) + schema = schemas.get(schema_name) + schema = (self.fetch_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) + if schema is None + else schema) return schema_name, schema - if self.autoload_schemas and (self.portal_env or self.portal_vapp): - autoloaded = {tab_name: schema - for tab_name, schema in pmap(fetch_schema, schema_names)} - return autoloaded - else: - return {} + return {schema_name: schema + for schema_name, schema in pmap(fetch_schema, schema_names)} + + @classmethod + def schema_exists(cls, schema_name: str, *, portal_env: Optional[str] = None, + portal_vapp: Optional[AbstractVirtualApp] = None): + return bool(cls.fetch_schema(schema_name=schema_name, portal_env=portal_env, portal_vapp=portal_vapp)) @classmethod def fetch_schema(cls, schema_name: str, *, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): - def just_fetch_it(): - return get_schema(schema_name, portal_env=portal_env, portal_vapp=portal_vapp) - if cls.CACHE_SCHEMAS: - schema: Optional[AnyJsonData] = cls.SCHEMA_CACHE.get(schema_name) - if schema is None: - cls.SCHEMA_CACHE[schema_name] = schema = just_fetch_it() - return schema - else: - return just_fetch_it() + schema: Optional[AnyJsonData] = cls.SCHEMA_CACHE.get(schema_name) + if schema is None and schema_name not in cls.SCHEMA_CACHE: # If None is already stored, don't look it up again + schema = get_schema(schema_name, portal_env=portal_env, portal_vapp=portal_vapp) + cls.SCHEMA_CACHE[schema_name] = schema + return schema @classmethod def 
clear_schema_cache(cls):
@@ -304,35 +364,248 @@ def clear_schema_cache(cls):
         cls.SCHEMA_CACHE.pop(key, None)
 
 
-class ItemManagerMixin(SchemaAutoloadMixin, AbstractItemManager, BasicTableSetManager):
+ITEM_MANAGER_REGISTRY = TableSetManagerRegistry()
+
+PatchPrototype = Dict
+TabbedPatchPrototypes = Dict[str, PatchPrototype]
+
+
+def extract_tabbed_headers(data: TabbedSheetData) -> TabbedHeaders:
+    result: TabbedHeaders = {}
+    for tab, rows in data.items():
+        if rows:
+            # Data is homogeneous, so whatever headers the first row has should be the same for all rows
+            headers: List[str] = list(rows[0].keys())
+        else:
+            # If there's no data in the tab, there are also no headers
+            headers: List[str] = []
+        result[tab] = headers
+    return result
+
+
+class TableInflater:
     """
-    This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows
-    get handled like Items instead of just flat table rows.
+    This tool can be used independently of the item tools. It doesn't involve schemas, but it does allow the
+    inflation of a table with dotted names to structures. e.g., a table with headers mother.name, mother.age,
+    father.name, and father.age, as in
+      data = load_table_set(<some-file>)
+    to bring in the flat representation with:
+      {"mother.name": <mother.name>, "mother.age": <mother.age>, ...}
+    one can use inflate(data) to get:
+      {"mother": {"name": <mother.name>, "age": <mother.age>},
+       "father": {"name": <father.name>, "age": <father.age>}}
+    Note, too, that although data != inflate(data), once inflated, inflate(inflate(data)) == inflate(data).
    """
 
-    def __init__(self, filename: str, schemas: Optional[Dict[str, AnyJsonData]] = None, **kwargs):
-        super().__init__(filename=filename, **kwargs)
-        self.patch_prototypes_by_tab_name: Dict[str, Dict] = {}
-        self.parsed_headers_by_tab_name: Dict[str, ParsedHeaders] = {}
-        self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = {}
-        self._schemas = schemas
-        self._instaguid_context_table: Dict[str, str] = {}
+    def __init__(self, tabbed_sheet_data: TabbedSheetData):
+        self.tabbed_sheet_data: TabbedSheetData = tabbed_sheet_data
+        self.headers_by_tab_name: TabbedHeaders = extract_tabbed_headers(tabbed_sheet_data)
+        self.parsed_headers_by_tab_name: TabbedParsedHeaders = {
+            tab_name: ItemTools.parse_sheet_headers(headers)
+            for tab_name, headers in self.headers_by_tab_name.items()
+        }
+        self.patch_prototypes_by_tab_name: TabbedPatchPrototypes = {
+            tab_name: ItemTools.compute_patch_prototype(parsed_headers)
+            for tab_name, parsed_headers in self.parsed_headers_by_tab_name.items()
+        }
 
     @property
-    def schemas(self):
-        schemas = self._schemas
-        if schemas is None:
-            self._schemas = schemas = self.fetch_relevant_schemas(self.tab_names)
-        return schemas
+    def tab_names(self):
+        return list(self.tabbed_sheet_data.keys())
+
+    def inflate_tabs(self):
+        return {tab_name: self.inflate_tab(tab_name)
+                for tab_name in self.tab_names}
+
+    def inflate_tab(self, tab_name: str):
+        prototype = self.patch_prototypes_by_tab_name[tab_name]
+        parsed_headers = self.parsed_headers_by_tab_name[tab_name]
+        result = [self.inflate_row(row, prototype=prototype, parsed_headers=parsed_headers)
+                  for row in self.tabbed_sheet_data[tab_name]]
+        return result
+
+    @classmethod
+    def inflate_row(cls, row: Dict, *, prototype: Dict, parsed_headers: ParsedHeaders):
+        patch_item = copy.deepcopy(prototype)
+        for column_number, column_value in enumerate(row.values()):
+            parsed_value = ItemTools.parse_item_value(column_value)
+            ItemTools.set_path_value(patch_item, parsed_headers[column_number], parsed_value)
+        return patch_item
 
-    def sheet_patch_prototype(self, tab_name: str) -> Dict:
-        return self.patch_prototypes_by_tab_name[tab_name]
+    @classmethod
+    def inflate(cls, tabbed_sheet_data: TabbedSheetData):
+        inflater = cls(tabbed_sheet_data)
+        inflated = inflater.inflate_tabs()
+        return inflated
+
+
+inflate = TableInflater.inflate
+
+
+def load_table_structures(filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
+                          prefer_number: bool = True, **kwargs):
+    """This differs from load_table_set only in that it inflates the content. It does not apply schemas."""
+    tabbed_rows = load_table_set(filename=filename, tab_name=tab_name, escaping=escaping, prefer_number=prefer_number,
+                                 **kwargs)
+    tabbed_structures = inflate(tabbed_rows)
+    return tabbed_structures
+
+
+class TabbedItemTable:
+
+    def __init__(self, tabbed_sheet_data: TabbedSheetData,
+                 portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None):
+        self.portal_env = portal_env
+        self.portal_vapp = portal_vapp
+        self.headers_by_tab_name: Dict[str, str] = {
+            tab_name: list(rows[0].keys()) if rows else []
+            for tab_name, rows in tabbed_sheet_data.items()
+        }
+        self.lookup_tables_by_tab_name: Dict[str, Dict[str, Dict]] = {
+            tab_name: self.build_lookup_table_for_tab(tab_name, rows=rows)
+            for tab_name, rows in tabbed_sheet_data.items()
+        }
+
+    def build_lookup_table_for_tab(self, tab_name: str, *, rows: List[Dict]) -> Dict[str, Dict]:
+        # TODO: It might be enough to just return the keys as a set, not a full dict
+        schema = get_schema(tab_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp)
+        possible_identifying_properties = set(schema.get("identifyingProperties") or []) | {'uuid'}
+        identifying_properties = [prop
+                                  for prop in self.headers_by_tab_name[tab_name]
+                                  if prop in possible_identifying_properties]
+        if not identifying_properties:
+            # Maybe issue a warning here that we're going to lose
+            empty_lookup_table: Dict[str, Dict] = {}
+            return empty_lookup_table
+        lookup_table: Dict[str, Dict] = {}
+        for row in rows:
+            for identifying_property in identifying_properties:
+                value = row.get(identifying_property)
+                if value != '' and value is not None:
+                    lookup_table[str(value)] = row
+        return lookup_table
+
+    def contains_ref(self, item_type, item_ref):
+        ref = self.resolve_ref(item_type=item_type, item_ref=item_ref)
+        if ref is None:
+            return False
+        else:
+            return True
+
+    def resolve_ref(self, item_type, item_ref):
+        lookup_table = self.lookup_tables_by_tab_name.get(item_type)
+        if lookup_table:  # Is it a type we're tracking?
+ return lookup_table.get(item_ref) or None + else: # Apparently some stray type not in our tables + return None + + +class TableChecker(TableInflater, TypeHintContext): + + def __init__(self, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedSchemas] = None, + portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): + self.portal_env = portal_env + self.portal_vapp = portal_vapp + self._problems: List[str] = [] + super().__init__(tabbed_sheet_data=tabbed_sheet_data) + self.schema_manager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp) + self.schemas = self.schema_manager.fetch_relevant_schemas(self.tab_names, schemas=schemas) + self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = { + tab_name: self.compile_type_hints(tab_name) + for tab_name in self.tab_names + } + self.tabbed_item_table = TabbedItemTable(tabbed_sheet_data, portal_env=portal_env, portal_vapp=portal_vapp) + # self.lookup_tables_by_tab_name: Dict[str, Dict[str, Dict]] = { + # tab_name: self.build_lookup_table_for_tab(tab_name, rows=rows) + # for tab_name, rows in tabbed_sheet_data.items() + # } + + def note_problem(self, problem: str): + self._problems.append(problem) + + def raise_any_pending_problems(self): + problems = self._problems + if problems: + for problem in problems: + PRINT(f"Problem: {problem}") + raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False)) + + def check_tabs(self): + result = {tab_name: self.check_tab(tab_name) + for tab_name in self.tab_names} + # At this point, doing the checking will have already raised certain errors, if those errors interfere + # with continued checking, but some smaller problems may have been deferred until the end, so we have to + # check for and raise an error for any such pending problems now. + self.raise_any_pending_problems() + return result - def sheet_parsed_headers(self, tab_name: str) -> ParsedHeaders: - return self.parsed_headers_by_tab_name[tab_name] + def validate_ref(self, item_type, item_ref): + if self.tabbed_item_table.contains_ref(item_type=item_type, item_ref=item_ref): + return True + # lookup_table = self.lookup_tables_by_tab_name.get(item_type) + # if lookup_table: + # if item_ref in lookup_table: + # return True + try: + info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}") + # Basically return True if there's a value at all, + # but still check it's not an error message that didn't get raised. 
+ return isinstance(info, dict) and 'uuid' in info + except Exception: + return False + + def schema_exists(self, schema_name: str) -> bool: + return self.schema_manager.schema_exists(schema_name) + + # def build_lookup_table_for_tab(self, tab_name: str, *, rows: List[Dict]) -> Dict[str, Dict]: + # # TODO: It might be enough to just return the keys as a set, not a full dict + # schema = get_schema(tab_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) + # possible_identifying_properties = set(schema.get("identifyingProperties") or []) | {'uuid'} + # identifying_properties = [prop + # for prop in self.headers_by_tab_name[tab_name] + # if prop in possible_identifying_properties] + # if not identifying_properties: + # # Maybe issue a warning here that we're going to lose + # empty_lookup_table: Dict[str, Dict] = {} + # return empty_lookup_table + # lookup_table: Dict[str, Dict] = {} + # for row in rows: + # for identifying_property in identifying_properties: + # value = row.get(identifying_property) + # if value is not '' and value is not None: + # lookup_table[str(value)] = row + # return lookup_table + + def check_tab(self, tab_name: str): + prototype = self.patch_prototypes_by_tab_name[tab_name] + parsed_headers = self.parsed_headers_by_tab_name[tab_name] + type_hints = self.type_hints_by_tab_name[tab_name] + result = [self.check_row(row, tab_name=tab_name, row_number=row_number, prototype=prototype, + parsed_headers=parsed_headers, type_hints=type_hints) + for row_number, row in enumerate(self.tabbed_sheet_data[tab_name])] + return result + + def check_row(self, row: Dict, *, tab_name: str, row_number: int, prototype: Dict, + parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints): + patch_item = copy.deepcopy(prototype) + for column_number, column_value in enumerate(row.values()): + parsed_value = ItemTools.parse_item_value(column_value) + type_hint = type_hints[column_number] + if type_hint: + try: + parsed_value = type_hint.apply_hint(parsed_value) + except ValidationProblem as e: + headers = self.headers_by_tab_name[tab_name] + column_name = headers[column_number] + self.note_problem(f"{tab_name}[{row_number}].{column_name}: {e}") + ItemTools.set_path_value(patch_item, parsed_headers[column_number], parsed_value) + return patch_item - def sheet_type_hints(self, tab_name: str) -> OptionalTypeHints: - return self.type_hints_by_tab_name[tab_name] + @classmethod + def check(cls, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedSchemas] = None, **kwargs): + checker = cls(tabbed_sheet_data, schemas=schemas, **kwargs) + checked = checker.check_tabs() + return checked class SheetState: @@ -340,165 +613,33 @@ def __init__(self, parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints) self.parsed_headers = parsed_headers self.type_hints = type_hints - def _compile_type_hints(self, tab_name: str): - parsed_headers = self.sheet_parsed_headers(tab_name) + def compile_type_hints(self, tab_name: str) -> OptionalTypeHints: + parsed_headers = self.parsed_headers_by_tab_name[tab_name] schema = self.schemas.get(tab_name) - with deferred_problems() as note_problem: - for required_header in self._schema_required_headers(schema): - if required_header not in parsed_headers: - note_problem("Missing required header") - type_hints = [ItemTools.find_type_hint(parsed_header, schema) if schema else None + for required_header in self._schema_required_headers(schema): + if required_header not in parsed_headers: + self.note_problem("Missing required header") + type_hints = 
[ItemTools.find_type_hint(parsed_header, schema, context=self) if schema else None for parsed_header in parsed_headers] - self.type_hints_by_tab_name[tab_name] = type_hints + return type_hints @classmethod def _schema_required_headers(cls, schema): ignored(schema) return [] # TODO: Make this compute a list of required headers (in parsed header form) - def _compile_sheet_headers(self, tab_name: str): - headers = self.headers_by_tab_name[tab_name] - parsed_headers = ItemTools.parse_sheet_headers(headers) - self.parsed_headers_by_tab_name[tab_name] = parsed_headers - prototype = ItemTools.compute_patch_prototype(parsed_headers) - self.patch_prototypes_by_tab_name[tab_name] = prototype - - def _create_tab_processor_state(self, tab_name: str) -> SheetState: - super()._create_tab_processor_state(tab_name) - # This will create state that allows us to efficiently assign values in the right place on each row - # by setting up a prototype we can copy and then drop values into. - self._compile_sheet_headers(tab_name) - self._compile_type_hints(tab_name) - return self.SheetState(parsed_headers=self.sheet_parsed_headers(tab_name), - type_hints=self.sheet_type_hints(tab_name)) - - def _process_row(self, tab_name: str, state: SheetState, row_data: SheetRow) -> AnyJsonData: - parsed_headers = state.parsed_headers - type_hints = state.type_hints - patch_item = copy.deepcopy(self.sheet_patch_prototype(tab_name)) - for i, value in enumerate(row_data): - parsed_value = self.parse_cell_value(value) - type_hint = type_hints[i] - if type_hint: - parsed_value = type_hint.apply_hint(parsed_value) - ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value) - return patch_item - - def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: - return ItemTools.parse_item_value(value, context=self._instaguid_context_table) - - -class InsertsItemMixin(AbstractItemManager): # ItemManagerMixin isn't really appropriate here - """ - This class is used for inserts directories and other JSON-like data that will be literally used as an Item - without semantic pre-processing. In other words, these classes will not be pre-checked for semantic correctness - but instead assumed to have been checked by other means. - """ - - AUTOLOAD_SCHEMAS_DEFAULT = False # Has no effect, but someone might inspect the value. - - def __init__(self, filename: str, *, autoload_schemas: Optional[bool] = None, portal_env: Optional[str] = None, - portal_vapp: Optional[AbstractVirtualApp] = None, schemas: Optional[Dict[str, AnyJsonData]] = None, - **kwargs): - ignored(portal_env, portal_vapp) # Would only be used if autoload_schemas was true, and we don't allow that. 
- if schemas not in [None, {}]: - raise ValueError(f"{self.__class__.__name__} does not allow schemas={schemas!r}.") - if autoload_schemas not in [None, False]: - raise ValueError(f"{self.__class__.__name__} does not allow autoload_schemas={autoload_schemas!r}.") - super().__init__(filename=filename, **kwargs) - - -@ITEM_MANAGER_REGISTRY.register() -class TabbedJsonInsertsItemManager(InsertsItemMixin, TabbedJsonInsertsManager): - pass - - -@ITEM_MANAGER_REGISTRY.register() -class SimpleJsonInsertsItemManager(InsertsItemMixin, SimpleJsonInsertsManager): - pass - - -@ITEM_MANAGER_REGISTRY.register() -class TabbedYamlInsertsItemManager(InsertsItemMixin, TabbedYamlInsertsManager): - pass - - -@ITEM_MANAGER_REGISTRY.register() -class SimpleYamlInsertsItemManager(InsertsItemMixin, SimpleYamlInsertsManager): - pass - - -@ITEM_MANAGER_REGISTRY.register() -class XlsxItemManager(ItemManagerMixin, XlsxManager): - """ - This layers item-style row processing functionality on an XLSX file. - """ - pass + def create_tab_processor_state(self, tab_name: str) -> SheetState: + # This will create state that allows us to efficiently assign values in the right place on each row + return self.SheetState(parsed_headers=self.parsed_headers_by_tab_name[tab_name], + type_hints=self.type_hints_by_tab_name[tab_name]) -@ITEM_MANAGER_REGISTRY.register() -class SimpleJsonLinesInsertsItemManager(InsertsItemMixin, SimpleJsonLinesInsertsManager): - pass - - -@ITEM_MANAGER_REGISTRY.register(regexp="^(.*/)?(|[^/]*[-_])inserts/?$") -class InsertsDirectoryItemManager(InsertsItemMixin, InsertsDirectoryManager): - pass +check = TableChecker.check -@ITEM_MANAGER_REGISTRY.register() -class CsvItemManager(ItemManagerMixin, CsvManager): - """ - This layers item-style row processing functionality on a CSV file. - """ - pass - - -@ITEM_MANAGER_REGISTRY.register() -class TsvItemManager(ItemManagerMixin, TsvManager): - """ - This layers item-style row processing functionality on a TSV file. - """ - pass - - -class ItemManager(AbstractTableSetManager): - """ - This class will open a .xlsx or .csv file and load its content in our standard format. - (See more detailed description in AbstractTableManager.) - """ - - @classmethod - def create_implementation_manager(cls, filename: str, **kwargs) -> AbstractItemManager: - reader_agent_class: Type[AbstractTableSetManager] = ITEM_MANAGER_REGISTRY.manager_for_filename(filename) - if not issubclass(reader_agent_class, AbstractItemManager): - raise ValueError(f"ItemManager unexpectedly found reader agent class {reader_agent_class}.") - reader_agent_class: Type[AbstractItemManager] - reader_agent = reader_agent_class(filename=filename, **kwargs) - return reader_agent - - @classmethod - def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, - schemas: Optional[Dict] = None, autoload_schemas: Optional[bool] = None, - portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None, - **kwargs) -> TabbedSheetData: - """ - Given a filename and various options, loads the items associated with that filename. - - :param filename: The name of the file to load. - :param tab_name: For files that lack multiple tabs (such as .csv or .tsv), - the tab name to associate with the data. - :param escaping: Whether to perform escape processing on backslashes. - :param schemas: A set of schemas to use instead of trying to load them. - :param autoload_schemas: Whether to try autoloading schemas. 
-        :param portal_env: A portal to consult to find schemas (usually if calling from the outside of a portal).
-        :param portal_vapp: A vapp to use (usually if calling from within a portal).
-        """
-        manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping,
-                                                    schemas=schemas, autoload_schemas=autoload_schemas,
-                                                    portal_env=portal_env, portal_vapp=portal_vapp,
-                                                    **kwargs)
-        return manager.load_content()
-
-
-load_items = ItemManager.load
+def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None,
+               schemas: Optional[TabbedSchemas] = None, **kwargs):
+    tabbed_rows = load_table_set(filename=filename, tab_name=tab_name, escaping=escaping, prefer_number=False,
+                                 **kwargs)
+    checked_items = check(tabbed_rows, schemas=schemas)
+    return checked_items
diff --git a/dcicutils/lang_utils.py b/dcicutils/lang_utils.py
index 2fda7dc9f..753241120 100644
--- a/dcicutils/lang_utils.py
+++ b/dcicutils/lang_utils.py
@@ -107,7 +107,7 @@ def _adjust_ending(cls, word, strip_chars, add_suffix):
                                             f"^an?[ -]+([^ -].*)$", re.IGNORECASE)
 
-    _NOUN_WITH_THAT_OR_WHICH_QUALIFIER = re.compile("^(.*[^,])(,|)[ ]+(that|which)[ ]+(.*)$", re.IGNORECASE)
+    _NOUN_WITH_CLAUSE_QUALIFIER = re.compile("^(.*[^,])(,|)[ ]+(that|which|while)[ ]+(.*)$", re.IGNORECASE)
     _IS_QUALIFIER = re.compile("^(is|was|has)[ ]+(.*)$", re.IGNORECASE)
 
     @classmethod
@@ -125,7 +125,7 @@ def string_pluralize(cls, word: str, allow_some=False) -> str:
         capitalize = word[0].isupper()
         upcase = word.isupper()  # capitalize and not any(ch.islower() for ch in word)
 
-        qual_matched = cls._NOUN_WITH_THAT_OR_WHICH_QUALIFIER.match(word)
+        qual_matched = cls._NOUN_WITH_CLAUSE_QUALIFIER.match(word)
         if qual_matched:
             qualified, comma, connective, qualifier = qual_matched.groups()
             word = qualified
diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
index d232fa67e..a520e3700 100644
--- a/dcicutils/misc_utils.py
+++ b/dcicutils/misc_utils.py
@@ -1358,6 +1358,13 @@ def to_camel_case(s):
     return snake_case_to_camel_case(s)
 
 
+def to_snake_case(s):
+    """
+    Converts a string that might be in snake_case or CamelCase into snake_case.
+    """
+    return camel_case_to_snake_case(to_camel_case(s))
+
+
 def capitalize1(s):
     """
     Capitalizes the first letter of a string and leaves the others alone.
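The machinery reworked above (ItemTools.parse_sheet_headers, compute_patch_prototype, set_path_value, and TableInflater) all serves the header-inflation idea described in the TableInflater docstring: dotted or numbered column headers such as mother.name or aliases#0 become nested dicts and lists. Below is a minimal standalone sketch of that idea, assuming only the Python standard library; the names parse_header, make_prototype, and inflate_rows are illustrative only and are not dcicutils API.

# Standalone sketch (not dcicutils code) of inflating dotted/numbered headers into nested structures.
import copy
from typing import Any, Dict, List, Union

def parse_header(header: str) -> List[Union[int, str]]:
    # "mother.name" -> ["mother", "name"]; "aliases#0" -> ["aliases", 0]
    return [int(part) if part.isdigit() else part
            for part in header.replace('#', '.').split('.')]

def set_path_value(datum: Union[Dict, List], path: List[Union[int, str]], value: Any) -> None:
    # Walk to the parent of the final key, then assign.
    for key in path[:-1]:
        datum = datum[key]
    datum[path[-1]] = value

def make_prototype(parsed_headers: List[List[Union[int, str]]]) -> Dict:
    # Build an empty skeleton so each row can be deep-copied and filled in.
    # Assumes numeric parts appear sequentially (0, 1, ...), as in the patch above.
    prototype: Dict = {}
    for path in parsed_headers:
        node: Union[Dict, List] = prototype
        for key, nxt in zip(path, path[1:] + [None]):
            placeholder: Any = [] if isinstance(nxt, int) else {} if isinstance(nxt, str) else None
            if isinstance(key, int):
                if key == len(node):
                    node.append(placeholder)
            elif key not in node:
                node[key] = placeholder
            if nxt is not None:
                node = node[key]
    return prototype

def inflate_rows(rows: List[Dict[str, Any]]) -> List[Dict]:
    # Headers come from the first row; every row is assumed to share them.
    headers = list(rows[0].keys()) if rows else []
    parsed = [parse_header(h) for h in headers]
    prototype = make_prototype(parsed)
    result = []
    for row in rows:
        item = copy.deepcopy(prototype)
        for path, value in zip(parsed, row.values()):
            set_path_value(item, path, value)
        result.append(item)
    return result

if __name__ == '__main__':
    flat = [{"mother.name": "mary", "mother.age": 58, "aliases#0": "mimi"}]
    print(inflate_rows(flat))
    # [{'mother': {'name': 'mary', 'age': 58}, 'aliases': ['mimi']}]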
diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index ef66abad1..a2fa9670c 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -28,13 +28,16 @@ Header = str Headers = List[str] +TabbedHeaders = Dict[str, Headers] ParsedHeader = List[Union[str, int]] ParsedHeaders = List[ParsedHeader] +TabbedParsedHeaders = Dict[str, ParsedHeaders] SheetCellValue = Union[int, float, str] SheetRow = List[SheetCellValue] CsvReader = type(csv.reader(TemporaryFile())) SheetData = List[dict] TabbedSheetData = Dict[str, SheetData] +TabbedSchemas = Dict[str, Dict] class LoadFailure(Exception): @@ -72,21 +75,23 @@ def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.") -def prefer_number(value: SheetCellValue): +def prefer_number(value: SheetCellValue, kind='num'): if isinstance(value, str): # the given value might be an int or float, in which case just fall through if not value: return None value = value ch0 = value[0] if ch0 == '+' or ch0 == '-' or ch0.isdigit(): - try: - return int(value) - except Exception: - pass - try: - return float(value) - except Exception: - pass + if kind == 'num' or kind == 'int': + try: + return int(value) + except Exception: + pass + if kind == 'num' or kind == 'float': + try: + return float(value) + except Exception: + pass # If we couldn't parse it as an int or float, fall through to returning the original value pass return value @@ -247,7 +252,8 @@ def load(cls, filename: str, **kwargs) -> AnyJsonData: table_set_manager: SemanticTableSetManager = cls(filename=filename, **kwargs) return table_set_manager.load_content() - def __init__(self, filename: str, **kwargs): + def __init__(self, filename: str, prefer_number: bool = True, **kwargs): + self.prefer_number: bool = prefer_number super().__init__(filename=filename, **kwargs) def _raw_row_generator_for_tab_name(self, tab_name: str) -> Iterable[SheetRow]: @@ -274,9 +280,8 @@ def load_content(self) -> AnyJsonData: self.content_by_tab_name[tab_name] = sheet_content return self.content_by_tab_name - @classmethod - def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: - return prefer_number(value) + def parse_cell_value(self, value: SheetCellValue) -> AnyJsonData: + return prefer_number(value) if self.prefer_number else value class TableSetManagerRegistry: diff --git a/test/data_files/sample_inserts/institution.json b/test/data_files/sample_inserts/institution.json new file mode 100644 index 000000000..0637a088a --- /dev/null +++ b/test/data_files/sample_inserts/institution.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/test/data_files/sample_inserts/project.json b/test/data_files/sample_inserts/project.json new file mode 100644 index 000000000..0637a088a --- /dev/null +++ b/test/data_files/sample_inserts/project.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/test/data_files/sample_inserts/user.json b/test/data_files/sample_inserts/user.json new file mode 100644 index 000000000..fe51488c7 --- /dev/null +++ b/test/data_files/sample_inserts/user.json @@ -0,0 +1 @@ +[] diff --git a/test/data_files/sample_schemas/institution.json b/test/data_files/sample_schemas/institution.json new file mode 100644 index 000000000..c46ebe4bf --- /dev/null +++ b/test/data_files/sample_schemas/institution.json @@ -0,0 +1,340 @@ +{ + "title": "Institution", + "$id": "/profiles/institution.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + 
"required": [ + "name", + "title" + ], + "identifyingProperties": [ + "name", + "uuid", + "title", + "aliases" + ], + "additionalProperties": false, + "mixinProperties": [ + { + "$ref": "mixins.json#/schema_version" + }, + { + "$ref": "mixins.json#/uuid" + }, + { + "$ref": "mixins.json#/aliases" + }, + { + "$ref": "mixins.json#/submitted" + }, + { + "$ref": "mixins.json#/modified" + }, + { + "$ref": "mixins.json#/status" + }, + { + "$ref": "mixins.json#/static_embeds" + } + ], + "properties": { + "static_headers": { + "title": "Static Headers", + "description": "Array of linkTos for static sections to be displayed at the top of an item page", + "type": "array", + "uniqueItems": true, + "permission": "restricted_fields", + "items": { + "title": "Static Header", + "description": "Static section displayed at the top of an item page", + "type": "string", + "linkTo": "UserContent" + } + }, + "static_content": { + "title": "Static Content", + "description": "Array of objects containing linkTo UserContent and 'position' to be placed on Item view(s).", + "type": "array", + "uniqueItems": true, + "permission": "restricted_fields", + "items": { + "title": "Static Content Definition", + "description": "Link to UserContent Item plus location.", + "type": "object", + "required": [ + "location", + "content" + ], + "properties": { + "content": { + "type": "string", + "linkTo": "UserContent", + "title": "Link to Content", + "description": "A UserContent Item." + }, + "location": { + "type": "string", + "title": "Location of Content", + "description": "Where this content should be displayed. Item schemas could potentially define an enum to contrain values.", + "default": "header" + }, + "description": { + "type": "string", + "title": "Description", + "description": "Description or note about this content. Might be displayed as a footnote or caption, if applicable for view." + } + } + } + }, + "status": { + "title": "Status", + "type": "string", + "default": "shared", + "permission": "restricted_fields", + "enum": [ + "shared", + "obsolete", + "current", + "inactive", + "in review", + "deleted" + ] + }, + "last_modified": { + "title": "Last Modified", + "exclude_from": [ + "FFedit-create" + ], + "type": "object", + "additionalProperties": false, + "lookup": 1000, + "properties": { + "date_modified": { + "title": "Date Modified", + "description": "Do not submit, value is assigned by the server. The date the object is modified.", + "type": "string", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + } + ], + "permission": "restricted_fields" + }, + "modified_by": { + "title": "Modified By", + "description": "Do not submit, value is assigned by the server. The user that modfied the object.", + "type": "string", + "linkTo": "User", + "permission": "restricted_fields" + } + } + }, + "date_created": { + "rdfs:subPropertyOf": "dc:created", + "title": "Date Created", + "lookup": 1000, + "exclude_from": [ + "FFedit-create" + ], + "type": "string", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + } + ], + "serverDefault": "now", + "permission": "restricted_fields" + }, + "submitted_by": { + "rdfs:subPropertyOf": "dc:creator", + "title": "Submitted By", + "exclude_from": [ + "FFedit-create" + ], + "type": "string", + "linkTo": "User", + "lookup": 1000, + "serverDefault": "userid", + "permission": "restricted_fields" + }, + "aliases": { + "title": "Aliases", + "description": "Institution-specific ID (e.g. 
bgm:cohort-1234-a).", + "type": "array", + "comment": "Colon separated lab name and lab identifier, no slash. (e.g. dcic-lab:42).", + "lookup": 1, + "uniqueItems": true, + "ff_flag": "clear clone", + "items": { + "uniqueKey": "alias", + "title": "ID Alias", + "description": "Institution-specific ID (e.g. bgm:cohort-1234-a).", + "type": "string", + "pattern": "^[^\\s\\\\\\/]+:[^\\s\\\\\\/]+$" + } + }, + "uuid": { + "title": "UUID", + "type": "string", + "format": "uuid", + "exclude_from": [ + "FFedit-create" + ], + "serverDefault": "uuid4", + "permission": "restricted_fields", + "requestMethod": "POST" + }, + "schema_version": { + "title": "Schema Version", + "internal_comment": "Do not submit, value is assigned by the server. The version of the JSON schema that the server uses to validate the object. Schema version indicates generation of schema used to save version to to enable upgrade steps to work. Individual schemas should set the default.", + "type": "string", + "exclude_from": [ + "FFedit-create" + ], + "pattern": "^\\d+(\\.\\d+)*$", + "requestMethod": [], + "default": "1" + }, + "name": { + "title": "Name", + "description": "A unique name for the institution.", + "type": "string", + "uniqueKey": true, + "permission": "restricted_fields", + "pattern": "^[A-Za-z0-9\\-]+$" + }, + "title": { + "title": "Title", + "description": "A unique title for the instituion.", + "type": "string", + "permission": "restricted_fields" + }, + "pi": { + "title": "P.I.", + "description": "Principle Investigator of the institution.", + "type": "string", + "linkTo": "User" + }, + "contact_persons": { + "title": "Institution Contacts", + "description": "Users designated as points of contact for this institution.", + "type": "array", + "uniqueItems": true, + "items": { + "title": "Institution Contact", + "description": "A User associated with the institution who is also a point of contact.", + "type": "string", + "linkTo": "User" + } + }, + "address1": { + "title": "Address line 1", + "type": "string" + }, + "address2": { + "title": "Address line 2", + "type": "string" + }, + "city": { + "title": "City", + "type": "string" + }, + "state": { + "title": "State/Province/Region", + "type": "string" + }, + "country": { + "title": "Country", + "type": "string" + }, + "postal_code": { + "title": "ZIP/Postal code", + "type": "string", + "format": "postal-code" + }, + "fax": { + "title": "Fax number", + "description": "A fax number for the lab (with country code).", + "type": "string", + "format": "phone" + }, + "phone1": { + "title": "Primary phone number", + "description": "Primary phone number (with country code).", + "type": "string", + "format": "phone" + }, + "phone2": { + "title": "Alternate phone number", + "description": "Alternative phone number (with country code).", + "type": "string", + "format": "phone" + }, + "url": { + "title": "Website URL", + "description": "An external resource with additional information about the instiution.", + "type": "string", + "format": "uri" + }, + "@id": { + "title": "ID", + "type": "string", + "calculatedProperty": true + }, + "@type": { + "title": "Type", + "type": "array", + "items": { + "type": "string" + }, + "calculatedProperty": true + }, + "principals_allowed": { + "title": "principals_allowed", + "description": "Calculated permissions used for ES filtering", + "type": "object", + "properties": { + "view": { + "type": "string" + }, + "edit": { + "type": "string" + } + }, + "calculatedProperty": true + }, + "display_title": { + "title": "Display Title", + 
"description": "A calculated title for every object in 4DN", + "type": "string", + "calculatedProperty": true + } + }, + "columns": { + "pi": { + "title": "P.I." + }, + "state": { + "title": "State" + }, + "country": { + "title": "Country" + } + }, + "excludedColumns": [ + "institution.display_title" + ], + "@type": [ + "JSONSchema" + ], + "rdfs:seeAlso": "/terms/Institution", + "children": [], + "rdfs:subClassOf": "/profiles/Item.json", + "isAbstract": false +} \ No newline at end of file diff --git a/test/data_files/sample_schemas/project.json b/test/data_files/sample_schemas/project.json new file mode 100644 index 000000000..5a24e2f9d --- /dev/null +++ b/test/data_files/sample_schemas/project.json @@ -0,0 +1,592 @@ +{ + "title": "Project", + "$id": "/profiles/project.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "required": [ + "name", + "title" + ], + "identifyingProperties": [ + "uuid", + "name", + "title", + "aliases" + ], + "additionalProperties": false, + "mixinProperties": [ + { + "$ref": "mixins.json#/schema_version" + }, + { + "$ref": "mixins.json#/uuid" + }, + { + "$ref": "mixins.json#/aliases" + }, + { + "$ref": "mixins.json#/submitted" + }, + { + "$ref": "mixins.json#/modified" + }, + { + "$ref": "mixins.json#/tags" + }, + { + "$ref": "mixins.json#/status" + }, + { + "$ref": "mixins.json#/static_embeds" + } + ], + "type": "object", + "properties": { + "static_headers": { + "title": "Static Headers", + "description": "Array of linkTos for static sections to be displayed at the top of an item page", + "type": "array", + "uniqueItems": true, + "permission": "restricted_fields", + "items": { + "title": "Static Header", + "description": "Static section displayed at the top of an item page", + "type": "string", + "linkTo": "UserContent" + } + }, + "static_content": { + "title": "Static Content", + "description": "Array of objects containing linkTo UserContent and 'position' to be placed on Item view(s).", + "type": "array", + "uniqueItems": true, + "permission": "restricted_fields", + "items": { + "title": "Static Content Definition", + "description": "Link to UserContent Item plus location.", + "type": "object", + "required": [ + "location", + "content" + ], + "properties": { + "content": { + "type": "string", + "linkTo": "UserContent", + "title": "Link to Content", + "description": "A UserContent Item." + }, + "location": { + "type": "string", + "title": "Location of Content", + "description": "Where this content should be displayed. Item schemas could potentially define an enum to contrain values.", + "default": "header" + }, + "description": { + "type": "string", + "title": "Description", + "description": "Description or note about this content. Might be displayed as a footnote or caption, if applicable for view." 
+ } + } + } + }, + "status": { + "title": "Status", + "type": "string", + "default": "shared", + "permission": "restricted_fields", + "enum": [ + "shared", + "obsolete", + "current", + "inactive", + "in review", + "deleted" + ] + }, + "tags": { + "title": "Tags", + "description": "Key words that can tag an item - useful for filtering.", + "type": "array", + "lookup": 1000, + "uniqueItems": true, + "ff_flag": "clear clone", + "items": { + "title": "Tag", + "description": "A tag for the item.", + "type": "string", + "minLength": 1, + "maxLength": 50, + "pattern": "^[a-zA-Z0-9_\\-][a-zA-Z0-9_\\-\\s]+[a-zA-Z0-9_\\-]$" + } + }, + "last_modified": { + "title": "Last Modified", + "exclude_from": [ + "FFedit-create" + ], + "type": "object", + "additionalProperties": false, + "lookup": 1000, + "properties": { + "date_modified": { + "title": "Date Modified", + "description": "Do not submit, value is assigned by the server. The date the object is modified.", + "type": "string", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + } + ], + "permission": "restricted_fields" + }, + "modified_by": { + "title": "Modified By", + "description": "Do not submit, value is assigned by the server. The user that modfied the object.", + "type": "string", + "linkTo": "User", + "permission": "restricted_fields" + } + } + }, + "date_created": { + "rdfs:subPropertyOf": "dc:created", + "title": "Date Created", + "lookup": 1000, + "exclude_from": [ + "FFedit-create" + ], + "type": "string", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + } + ], + "serverDefault": "now", + "permission": "restricted_fields" + }, + "submitted_by": { + "rdfs:subPropertyOf": "dc:creator", + "title": "Submitted By", + "exclude_from": [ + "FFedit-create" + ], + "type": "string", + "linkTo": "User", + "lookup": 1000, + "serverDefault": "userid", + "permission": "restricted_fields" + }, + "aliases": { + "title": "Aliases", + "description": "Institution-specific ID (e.g. bgm:cohort-1234-a).", + "type": "array", + "comment": "Colon separated lab name and lab identifier, no slash. (e.g. dcic-lab:42).", + "lookup": 1, + "uniqueItems": true, + "ff_flag": "clear clone", + "items": { + "uniqueKey": "alias", + "title": "ID Alias", + "description": "Institution-specific ID (e.g. bgm:cohort-1234-a).", + "type": "string", + "pattern": "^[^\\s\\\\\\/]+:[^\\s\\\\\\/]+$" + } + }, + "uuid": { + "title": "UUID", + "type": "string", + "format": "uuid", + "exclude_from": [ + "FFedit-create" + ], + "serverDefault": "uuid4", + "permission": "restricted_fields", + "requestMethod": "POST" + }, + "schema_version": { + "title": "Schema Version", + "internal_comment": "Do not submit, value is assigned by the server. The version of the JSON schema that the server uses to validate the object. Schema version indicates generation of schema used to save version to to enable upgrade steps to work. 
Individual schemas should set the default.", + "type": "string", + "exclude_from": [ + "FFedit-create" + ], + "pattern": "^\\d+(\\.\\d+)*$", + "requestMethod": [], + "default": "1" + }, + "name": { + "title": "Name", + "description": "The name of the project.", + "type": "string", + "lookup": 20, + "uniqueKey": true, + "permission": "restricted_fields", + "pattern": "^[A-Za-z0-9\\-]+$" + }, + "title": { + "title": "Title", + "description": "A title for the instituion.", + "type": "string", + "permission": "restricted_fields" + }, + "description": { + "title": "Description", + "type": "string", + "formInput": "textarea", + "lookup": 40 + }, + "start_date": { + "title": "Start date", + "description": "YYYY-MM-DD formatted date.", + "comment": "Date can be submitted as YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSTZD (TZD is the time zone designator; use Z to express time in UTC or for time expressed in local time add a time zone offset froaam UTC +HH:MM or -HH:MM).", + "type": "string", + "lookup": 50, + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + } + ] + }, + "end_date": { + "title": "End date", + "description": "YYYY-MM-DD formatted date.", + "comment": "Date can be submitted as YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSTZD (TZD is the time zone designator; use Z to express time in UTC or for time expressed in local time add a time zone offset from UTC +HH:MM or -HH:MM).", + "type": "string", + "lookup": 60, + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + } + ] + }, + "url": { + "@type": "@id", + "title": "URL", + "description": "An external resource with additional information about the project.", + "type": "string", + "format": "uri", + "lookup": 70 + }, + "pi": { + "title": "Project Lead", + "description": "The leader of the project.", + "type": "string", + "lookup": 80, + "linkTo": "User" + }, + "lifecycle_management_active": { + "title": "Lifecycle management", + "type": "boolean", + "description": "Whether lifecycle management is active or not", + "default": false + }, + "lifecycle_policy": { + "title": "Lifecycle policy", + "description": "Contains rules for how different categories of files are managed during their lifetime", + "type": "object", + "patternProperties": { + "^(?!(ignore)$).+$": { + "type": "object", + "description": "Custom lifecycle policy", + "comment": "Ensure custom lifecycle policies have the right structure", + "minProperties": 1, + "additionalProperties": false, + "properties": { + "move_to_infrequent_access_after": { + "title": "Move to Infrequent Access after", + "description": "Time (in months) after which files are moved to Infrequent Access", + "type": "integer", + "minimum": 0 + }, + "move_to_glacier_after": { + "title": "Move to Glacier Instant Retrieval after", + "description": "Time (in months) after which files are moved to Glacier Instant Retrieval", + "type": "integer", + "minimum": 0 + }, + "move_to_deep_archive_after": { + "title": "Move to Deep Archive after", + "description": "Time (in months) after which files are moved to Glacier Deep Archive", + "type": "integer", + "minimum": 0 + }, + "expire_after": { + "title": "Expire after", + "description": "Time (in months) after which files are deleted", + "type": "integer", + "minimum": 0 + } + } + } + }, + "additionalProperties": false, + "properties": { + "short_term_access_long_term_archive": { + "title": "short_term_access_long_term_archive", + "description": "Files that require shorter term accessibility and are long term archived after that", + "type": "object", + 
"minProperties": 1, + "additionalProperties": false, + "properties": { + "move_to_infrequent_access_after": { + "title": "Move to Infrequent Access after", + "description": "Time (in months) after which files are moved to Infrequent Access", + "type": "integer", + "minimum": 0 + }, + "move_to_glacier_after": { + "title": "Move to Glacier Instant Retrieval after", + "description": "Time (in months) after which files are moved to Glacier Instant Retrieval", + "type": "integer", + "minimum": 0 + }, + "move_to_deep_archive_after": { + "title": "Move to Deep Archive after", + "description": "Time (in months) after which files are moved to Glacier Deep Archive", + "type": "integer", + "minimum": 0 + }, + "expire_after": { + "title": "Expire after", + "description": "Time (in months) after which files are deleted", + "type": "integer", + "minimum": 0 + } + } + }, + "long_term_access_long_term_archive": { + "title": "long_term_access_long_term_archive", + "description": "Files that require longer term accessibility and are long term archived after that", + "type": "object", + "minProperties": 1, + "additionalProperties": false, + "properties": { + "move_to_infrequent_access_after": { + "title": "Move to Infrequent Access after", + "description": "Time (in months) after which files are moved to Infrequent Access", + "type": "integer", + "minimum": 0 + }, + "move_to_glacier_after": { + "title": "Move to Glacier Instant Retrieval after", + "description": "Time (in months) after which files are moved to Glacier Instant Retrieval", + "type": "integer", + "minimum": 0 + }, + "move_to_deep_archive_after": { + "title": "Move to Deep Archive after", + "description": "Time (in months) after which files are moved to Glacier Deep Archive", + "type": "integer", + "minimum": 0 + }, + "expire_after": { + "title": "Expire after", + "description": "Time (in months) after which files are deleted", + "type": "integer", + "minimum": 0 + } + } + }, + "long_term_access": { + "title": "long_term_access", + "description": "Files that require long term accessibility and are deleted after that", + "type": "object", + "minProperties": 1, + "additionalProperties": false, + "properties": { + "move_to_infrequent_access_after": { + "title": "Move to Infrequent Access after", + "description": "Time (in months) after which files are moved to Infrequent Access", + "type": "integer", + "minimum": 0 + }, + "move_to_glacier_after": { + "title": "Move to Glacier Instant Retrieval after", + "description": "Time (in months) after which files are moved to Glacier Instant Retrieval", + "type": "integer", + "minimum": 0 + }, + "move_to_deep_archive_after": { + "title": "Move to Deep Archive after", + "description": "Time (in months) after which files are moved to Glacier Deep Archive", + "type": "integer", + "minimum": 0 + }, + "expire_after": { + "title": "Expire after", + "description": "Time (in months) after which files are deleted", + "type": "integer", + "minimum": 0 + } + } + }, + "short_term_access": { + "title": "short_term_access", + "description": "Files that require short term accessibility and are deleted after that", + "type": "object", + "minProperties": 1, + "additionalProperties": false, + "properties": { + "move_to_infrequent_access_after": { + "title": "Move to Infrequent Access after", + "description": "Time (in months) after which files are moved to Infrequent Access", + "type": "integer", + "minimum": 0 + }, + "move_to_glacier_after": { + "title": "Move to Glacier Instant Retrieval after", + "description": "Time (in 
months) after which files are moved to Glacier Instant Retrieval", + "type": "integer", + "minimum": 0 + }, + "move_to_deep_archive_after": { + "title": "Move to Deep Archive after", + "description": "Time (in months) after which files are moved to Glacier Deep Archive", + "type": "integer", + "minimum": 0 + }, + "expire_after": { + "title": "Expire after", + "description": "Time (in months) after which files are deleted", + "type": "integer", + "minimum": 0 + } + } + }, + "long_term_archive": { + "title": "long_term_archive", + "description": "Files that are immediately archived long term and deleted after that", + "type": "object", + "minProperties": 1, + "additionalProperties": false, + "properties": { + "move_to_infrequent_access_after": { + "title": "Move to Infrequent Access after", + "description": "Time (in months) after which files are moved to Infrequent Access", + "type": "integer", + "minimum": 0 + }, + "move_to_glacier_after": { + "title": "Move to Glacier Instant Retrieval after", + "description": "Time (in months) after which files are moved to Glacier Instant Retrieval", + "type": "integer", + "minimum": 0 + }, + "move_to_deep_archive_after": { + "title": "Move to Deep Archive after", + "description": "Time (in months) after which files are moved to Glacier Deep Archive", + "type": "integer", + "minimum": 0 + }, + "expire_after": { + "title": "Expire after", + "description": "Time (in months) after which files are deleted", + "type": "integer", + "minimum": 0 + } + } + }, + "short_term_archive": { + "title": "short_term_archive", + "description": "Files that are immediately archived short term and deleted after that", + "type": "object", + "minProperties": 1, + "additionalProperties": false, + "properties": { + "move_to_infrequent_access_after": { + "title": "Move to Infrequent Access after", + "description": "Time (in months) after which files are moved to Infrequent Access", + "type": "integer", + "minimum": 0 + }, + "move_to_glacier_after": { + "title": "Move to Glacier Instant Retrieval after", + "description": "Time (in months) after which files are moved to Glacier Instant Retrieval", + "type": "integer", + "minimum": 0 + }, + "move_to_deep_archive_after": { + "title": "Move to Deep Archive after", + "description": "Time (in months) after which files are moved to Glacier Deep Archive", + "type": "integer", + "minimum": 0 + }, + "expire_after": { + "title": "Expire after", + "description": "Time (in months) after which files are deleted", + "type": "integer", + "minimum": 0 + } + } + }, + "no_storage": { + "title": "no_storage", + "description": "Files that are deleted immediately", + "type": "object", + "minProperties": 1, + "additionalProperties": false, + "properties": { + "expire_after": { + "title": "Expire after", + "description": "Time (in months) after which files are deleted", + "type": "integer", + "minimum": 0 + } + } + } + } + }, + "@id": { + "title": "ID", + "type": "string", + "calculatedProperty": true + }, + "@type": { + "title": "Type", + "type": "array", + "items": { + "type": "string" + }, + "calculatedProperty": true + }, + "principals_allowed": { + "title": "principals_allowed", + "description": "Calculated permissions used for ES filtering", + "type": "object", + "properties": { + "view": { + "type": "string" + }, + "edit": { + "type": "string" + } + }, + "calculatedProperty": true + }, + "display_title": { + "title": "Display Title", + "description": "A calculated title for every object in 4DN", + "type": "string", + "calculatedProperty": true + 
} + }, + "@type": [ + "JSONSchema" + ], + "rdfs:seeAlso": "/terms/Project", + "children": [], + "rdfs:subClassOf": "/profiles/Item.json", + "isAbstract": false +} \ No newline at end of file diff --git a/test/data_files/sample_schemas/user.json b/test/data_files/sample_schemas/user.json new file mode 100644 index 000000000..e9aad7532 --- /dev/null +++ b/test/data_files/sample_schemas/user.json @@ -0,0 +1,817 @@ +{ + "title": "User", + "$id": "/profiles/user.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "required": [ + "email", + "first_name", + "last_name" + ], + "identifyingProperties": [ + "uuid", + "email", + "aliases" + ], + "additionalProperties": false, + "mixinProperties": [ + { + "$ref": "mixins.json#/schema_version" + }, + { + "$ref": "mixins.json#/uuid" + }, + { + "$ref": "mixins.json#/aliases" + }, + { + "$ref": "mixins.json#/submitted" + }, + { + "$ref": "mixins.json#/modified" + } + ], + "properties": { + "last_modified": { + "title": "Last Modified", + "exclude_from": [ + "FFedit-create" + ], + "type": "object", + "additionalProperties": false, + "lookup": 1000, + "properties": { + "date_modified": { + "title": "Date Modified", + "description": "Do not submit, value is assigned by the server. The date the object is modified.", + "type": "string", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + } + ], + "permission": "restricted_fields" + }, + "modified_by": { + "title": "Modified By", + "description": "Do not submit, value is assigned by the server. The user that modfied the object.", + "type": "string", + "linkTo": "User", + "permission": "restricted_fields" + } + } + }, + "date_created": { + "rdfs:subPropertyOf": "dc:created", + "title": "Date Created", + "lookup": 1000, + "exclude_from": [ + "FFedit-create" + ], + "type": "string", + "anyOf": [ + { + "format": "date-time" + }, + { + "format": "date" + } + ], + "serverDefault": "now", + "permission": "restricted_fields" + }, + "submitted_by": { + "rdfs:subPropertyOf": "dc:creator", + "title": "Submitted By", + "exclude_from": [ + "FFedit-create" + ], + "type": "string", + "linkTo": "User", + "lookup": 1000, + "serverDefault": "userid", + "permission": "restricted_fields" + }, + "aliases": { + "title": "Aliases", + "description": "Institution-specific ID (e.g. bgm:cohort-1234-a).", + "type": "array", + "comment": "Colon separated lab name and lab identifier, no slash. (e.g. dcic-lab:42).", + "lookup": 1, + "uniqueItems": true, + "ff_flag": "clear clone", + "items": { + "uniqueKey": "alias", + "title": "ID Alias", + "description": "Institution-specific ID (e.g. bgm:cohort-1234-a).", + "type": "string", + "pattern": "^[^\\s\\\\\\/]+:[^\\s\\\\\\/]+$" + } + }, + "uuid": { + "title": "UUID", + "type": "string", + "format": "uuid", + "exclude_from": [ + "FFedit-create" + ], + "serverDefault": "uuid4", + "permission": "restricted_fields", + "requestMethod": "POST" + }, + "schema_version": { + "title": "Schema Version", + "internal_comment": "Do not submit, value is assigned by the server. The version of the JSON schema that the server uses to validate the object. Schema version indicates generation of schema used to save version to to enable upgrade steps to work. 
Individual schemas should set the default.", + "type": "string", + "exclude_from": [ + "FFedit-create" + ], + "pattern": "^\\d+(\\.\\d+)*$", + "requestMethod": [], + "default": "1" + }, + "status": { + "title": "Status", + "type": "string", + "default": "current", + "permission": "restricted_fields", + "enum": [ + "current", + "deleted", + "inactive", + "revoked" + ] + }, + "institution": { + "title": "Institution", + "description": "Institution of whomever created this User Item in the system.", + "exclude_from": [ + "FFedit-create" + ], + "type": "string", + "linkTo": "Institution" + }, + "project": { + "title": "Project", + "description": "Project of whomever created this User Item in the system.", + "comment": "The value of this property determines who can potentially edit the User", + "exclude_from": [ + "FFedit-create" + ], + "type": "string", + "linkTo": "Project" + }, + "project_roles": { + "title": "Projects and Roles", + "description": "Projects that a User is part of and the role(s) within that Project a User has", + "type": "array", + "permission": "restricted_fields", + "items": { + "title": "Project and Role", + "type": "object", + "additionalProperties": false, + "required": [ + "project" + ], + "properties": { + "project": { + "title": "Project", + "description": "Values here are used to authorize view and edit permissions of other Items in the database - eventually in combination with role", + "type": "string", + "linkTo": "Project" + }, + "role": { + "title": "Role", + "description": "The role of the user in the project.", + "type": "string", + "default": "unknown", + "enum": [ + "clinician", + "scientist", + "developer", + "director", + "project_member", + "patient", + "other", + "unknown" + ] + } + } + } + }, + "user_institution": { + "title": "User's Institution", + "description": "The primary institution the user is associated with", + "type": "string", + "linkTo": "Institution" + }, + "email": { + "title": "Account Email", + "description": "Email used to log in to the 4DN Portal.", + "type": "string", + "format": "email", + "lookup": 20, + "uniqueKey": true + }, + "first_name": { + "title": "First name", + "description": "The user's first (given) name.", + "type": "string", + "lookup": 30 + }, + "last_name": { + "title": "Last name", + "description": "The user's last (family) name.", + "type": "string", + "lookup": 40 + }, + "job_title": { + "title": "Job Title", + "type": "string", + "comment": "Can be user supplied - purely informational", + "lookup": 41 + }, + "groups": { + "title": "Groups", + "description": "Additional access control groups", + "note": "USE WITH CAUTION - currently how we add admin access to a user", + "type": "array", + "lookup": 80, + "uniqueItems": true, + "permission": "restricted_fields", + "items": { + "type": "string" + } + }, + "preferred_email": { + "title": "Preferred Contact Email", + "description": "Email to contact by, if different from account/sign-in e-mail address", + "type": "string", + "format": "email", + "lookup": 45 + }, + "phone1": { + "title": "Primary phone number", + "description": "The user's primary phone number (with country code).", + "type": "string", + "format": "phone", + "lookup": 100, + "pattern": "[+]?[\\d]{10,36}((\\sx|\\sext|\\sextension)(\\s)?[\\d]{1,7})?$" + }, + "phone2": { + "title": "Alternate phone number", + "description": "The user's secondary phone number (with country code).", + "type": "string", + "format": "phone" + }, + "fax": { + "title": "Fax number", + "description": "A FAX number for the user 
(with country code).", + "type": "string", + "format": "phone" + }, + "skype": { + "title": "Skype ID", + "type": "string", + "lookup": 110 + }, + "google": { + "title": "Google ID", + "type": "string", + "lookup": 120 + }, + "timezone": { + "title": "Timezone", + "description": "The timezone the user is associated with.", + "type": "string", + "lookup": 130, + "default": "US/Eastern", + "enum": [ + "Africa/Abidjan", + "Africa/Accra", + "Africa/Addis_Ababa", + "Africa/Algiers", + "Africa/Asmara", + "Africa/Bamako", + "Africa/Bangui", + "Africa/Banjul", + "Africa/Bissau", + "Africa/Blantyre", + "Africa/Brazzaville", + "Africa/Bujumbura", + "Africa/Cairo", + "Africa/Casablanca", + "Africa/Ceuta", + "Africa/Conakry", + "Africa/Dakar", + "Africa/Dar_es_Salaam", + "Africa/Djibouti", + "Africa/Douala", + "Africa/El_Aaiun", + "Africa/Freetown", + "Africa/Gaborone", + "Africa/Harare", + "Africa/Johannesburg", + "Africa/Juba", + "Africa/Kampala", + "Africa/Khartoum", + "Africa/Kigali", + "Africa/Kinshasa", + "Africa/Lagos", + "Africa/Libreville", + "Africa/Lome", + "Africa/Luanda", + "Africa/Lubumbashi", + "Africa/Lusaka", + "Africa/Malabo", + "Africa/Maputo", + "Africa/Maseru", + "Africa/Mbabane", + "Africa/Mogadishu", + "Africa/Monrovia", + "Africa/Nairobi", + "Africa/Ndjamena", + "Africa/Niamey", + "Africa/Nouakchott", + "Africa/Ouagadougou", + "Africa/Porto-Novo", + "Africa/Sao_Tome", + "Africa/Tripoli", + "Africa/Tunis", + "Africa/Windhoek", + "America/Adak", + "America/Anchorage", + "America/Anguilla", + "America/Antigua", + "America/Araguaina", + "America/Argentina/Buenos_Aires", + "America/Argentina/Catamarca", + "America/Argentina/Cordoba", + "America/Argentina/Jujuy", + "America/Argentina/La_Rioja", + "America/Argentina/Mendoza", + "America/Argentina/Rio_Gallegos", + "America/Argentina/Salta", + "America/Argentina/San_Juan", + "America/Argentina/San_Luis", + "America/Argentina/Tucuman", + "America/Argentina/Ushuaia", + "America/Aruba", + "America/Asuncion", + "America/Atikokan", + "America/Bahia", + "America/Bahia_Banderas", + "America/Barbados", + "America/Belem", + "America/Belize", + "America/Blanc-Sablon", + "America/Boa_Vista", + "America/Bogota", + "America/Boise", + "America/Cambridge_Bay", + "America/Campo_Grande", + "America/Cancun", + "America/Caracas", + "America/Cayenne", + "America/Cayman", + "America/Chicago", + "America/Chihuahua", + "America/Costa_Rica", + "America/Creston", + "America/Cuiaba", + "America/Curacao", + "America/Danmarkshavn", + "America/Dawson", + "America/Dawson_Creek", + "America/Denver", + "America/Detroit", + "America/Dominica", + "America/Edmonton", + "America/Eirunepe", + "America/El_Salvador", + "America/Fortaleza", + "America/Glace_Bay", + "America/Godthab", + "America/Goose_Bay", + "America/Grand_Turk", + "America/Grenada", + "America/Guadeloupe", + "America/Guatemala", + "America/Guayaquil", + "America/Guyana", + "America/Halifax", + "America/Havana", + "America/Hermosillo", + "America/Indiana/Indianapolis", + "America/Indiana/Knox", + "America/Indiana/Marengo", + "America/Indiana/Petersburg", + "America/Indiana/Tell_City", + "America/Indiana/Vevay", + "America/Indiana/Vincennes", + "America/Indiana/Winamac", + "America/Inuvik", + "America/Iqaluit", + "America/Jamaica", + "America/Juneau", + "America/Kentucky/Louisville", + "America/Kentucky/Monticello", + "America/Kralendijk", + "America/La_Paz", + "America/Lima", + "America/Los_Angeles", + "America/Lower_Princes", + "America/Maceio", + "America/Managua", + "America/Manaus", + "America/Marigot", + 
"America/Martinique", + "America/Matamoros", + "America/Mazatlan", + "America/Menominee", + "America/Merida", + "America/Metlakatla", + "America/Mexico_City", + "America/Miquelon", + "America/Moncton", + "America/Monterrey", + "America/Montevideo", + "America/Montreal", + "America/Montserrat", + "America/Nassau", + "America/New_York", + "America/Nipigon", + "America/Nome", + "America/Noronha", + "America/North_Dakota/Beulah", + "America/North_Dakota/Center", + "America/North_Dakota/New_Salem", + "America/Ojinaga", + "America/Panama", + "America/Pangnirtung", + "America/Paramaribo", + "America/Phoenix", + "America/Port-au-Prince", + "America/Port_of_Spain", + "America/Porto_Velho", + "America/Puerto_Rico", + "America/Rainy_River", + "America/Rankin_Inlet", + "America/Recife", + "America/Regina", + "America/Resolute", + "America/Rio_Branco", + "America/Santa_Isabel", + "America/Santarem", + "America/Santiago", + "America/Santo_Domingo", + "America/Sao_Paulo", + "America/Scoresbysund", + "America/Shiprock", + "America/Sitka", + "America/St_Barthelemy", + "America/St_Johns", + "America/St_Kitts", + "America/St_Lucia", + "America/St_Thomas", + "America/St_Vincent", + "America/Swift_Current", + "America/Tegucigalpa", + "America/Thule", + "America/Thunder_Bay", + "America/Tijuana", + "America/Toronto", + "America/Tortola", + "America/Vancouver", + "America/Whitehorse", + "America/Winnipeg", + "America/Yakutat", + "America/Yellowknife", + "Antarctica/Casey", + "Antarctica/Davis", + "Antarctica/DumontDUrville", + "Antarctica/Macquarie", + "Antarctica/Mawson", + "Antarctica/McMurdo", + "Antarctica/Palmer", + "Antarctica/Rothera", + "Antarctica/South_Pole", + "Antarctica/Syowa", + "Antarctica/Vostok", + "Arctic/Longyearbyen", + "Asia/Aden", + "Asia/Almaty", + "Asia/Amman", + "Asia/Anadyr", + "Asia/Aqtau", + "Asia/Aqtobe", + "Asia/Ashgabat", + "Asia/Baghdad", + "Asia/Bahrain", + "Asia/Baku", + "Asia/Bangkok", + "Asia/Beirut", + "Asia/Bishkek", + "Asia/Brunei", + "Asia/Choibalsan", + "Asia/Chongqing", + "Asia/Colombo", + "Asia/Damascus", + "Asia/Dhaka", + "Asia/Dili", + "Asia/Dubai", + "Asia/Dushanbe", + "Asia/Gaza", + "Asia/Harbin", + "Asia/Hebron", + "Asia/Ho_Chi_Minh", + "Asia/Hong_Kong", + "Asia/Hovd", + "Asia/Irkutsk", + "Asia/Jakarta", + "Asia/Jayapura", + "Asia/Jerusalem", + "Asia/Kabul", + "Asia/Kamchatka", + "Asia/Karachi", + "Asia/Kashgar", + "Asia/Kathmandu", + "Asia/Khandyga", + "Asia/Kolkata", + "Asia/Krasnoyarsk", + "Asia/Kuala_Lumpur", + "Asia/Kuching", + "Asia/Kuwait", + "Asia/Macau", + "Asia/Magadan", + "Asia/Makassar", + "Asia/Manila", + "Asia/Muscat", + "Asia/Nicosia", + "Asia/Novokuznetsk", + "Asia/Novosibirsk", + "Asia/Omsk", + "Asia/Oral", + "Asia/Phnom_Penh", + "Asia/Pontianak", + "Asia/Pyongyang", + "Asia/Qatar", + "Asia/Qyzylorda", + "Asia/Rangoon", + "Asia/Riyadh", + "Asia/Sakhalin", + "Asia/Samarkand", + "Asia/Seoul", + "Asia/Shanghai", + "Asia/Singapore", + "Asia/Taipei", + "Asia/Tashkent", + "Asia/Tbilisi", + "Asia/Tehran", + "Asia/Thimphu", + "Asia/Tokyo", + "Asia/Ulaanbaatar", + "Asia/Urumqi", + "Asia/Ust-Nera", + "Asia/Vientiane", + "Asia/Vladivostok", + "Asia/Yakutsk", + "Asia/Yekaterinburg", + "Asia/Yerevan", + "Atlantic/Azores", + "Atlantic/Bermuda", + "Atlantic/Canary", + "Atlantic/Cape_Verde", + "Atlantic/Faroe", + "Atlantic/Madeira", + "Atlantic/Reykjavik", + "Atlantic/South_Georgia", + "Atlantic/St_Helena", + "Atlantic/Stanley", + "Australia/Adelaide", + "Australia/Brisbane", + "Australia/Broken_Hill", + "Australia/Currie", + "Australia/Darwin", + 
"Australia/Eucla", + "Australia/Hobart", + "Australia/Lindeman", + "Australia/Lord_Howe", + "Australia/Melbourne", + "Australia/Perth", + "Australia/Sydney", + "Canada/Atlantic", + "Canada/Central", + "Canada/Eastern", + "Canada/Mountain", + "Canada/Newfoundland", + "Canada/Pacific", + "Europe/Amsterdam", + "Europe/Andorra", + "Europe/Athens", + "Europe/Belgrade", + "Europe/Berlin", + "Europe/Bratislava", + "Europe/Brussels", + "Europe/Bucharest", + "Europe/Budapest", + "Europe/Busingen", + "Europe/Chisinau", + "Europe/Copenhagen", + "Europe/Dublin", + "Europe/Gibraltar", + "Europe/Guernsey", + "Europe/Helsinki", + "Europe/Isle_of_Man", + "Europe/Istanbul", + "Europe/Jersey", + "Europe/Kaliningrad", + "Europe/Kiev", + "Europe/Lisbon", + "Europe/Ljubljana", + "Europe/London", + "Europe/Luxembourg", + "Europe/Madrid", + "Europe/Malta", + "Europe/Mariehamn", + "Europe/Minsk", + "Europe/Monaco", + "Europe/Moscow", + "Europe/Oslo", + "Europe/Paris", + "Europe/Podgorica", + "Europe/Prague", + "Europe/Riga", + "Europe/Rome", + "Europe/Samara", + "Europe/San_Marino", + "Europe/Sarajevo", + "Europe/Simferopol", + "Europe/Skopje", + "Europe/Sofia", + "Europe/Stockholm", + "Europe/Tallinn", + "Europe/Tirane", + "Europe/Uzhgorod", + "Europe/Vaduz", + "Europe/Vatican", + "Europe/Vienna", + "Europe/Vilnius", + "Europe/Volgograd", + "Europe/Warsaw", + "Europe/Zagreb", + "Europe/Zaporozhye", + "Europe/Zurich", + "GMT", + "Indian/Antananarivo", + "Indian/Chagos", + "Indian/Christmas", + "Indian/Cocos", + "Indian/Comoro", + "Indian/Kerguelen", + "Indian/Mahe", + "Indian/Maldives", + "Indian/Mauritius", + "Indian/Mayotte", + "Indian/Reunion", + "Pacific/Apia", + "Pacific/Auckland", + "Pacific/Chatham", + "Pacific/Chuuk", + "Pacific/Easter", + "Pacific/Efate", + "Pacific/Enderbury", + "Pacific/Fakaofo", + "Pacific/Fiji", + "Pacific/Funafuti", + "Pacific/Galapagos", + "Pacific/Gambier", + "Pacific/Guadalcanal", + "Pacific/Guam", + "Pacific/Honolulu", + "Pacific/Johnston", + "Pacific/Kiritimati", + "Pacific/Kosrae", + "Pacific/Kwajalein", + "Pacific/Majuro", + "Pacific/Marquesas", + "Pacific/Midway", + "Pacific/Nauru", + "Pacific/Niue", + "Pacific/Norfolk", + "Pacific/Noumea", + "Pacific/Pago_Pago", + "Pacific/Palau", + "Pacific/Pitcairn", + "Pacific/Pohnpei", + "Pacific/Port_Moresby", + "Pacific/Rarotonga", + "Pacific/Saipan", + "Pacific/Tahiti", + "Pacific/Tarawa", + "Pacific/Tongatapu", + "Pacific/Wake", + "Pacific/Wallis", + "US/Alaska", + "US/Arizona", + "US/Central", + "US/Eastern", + "US/Hawaii", + "US/Mountain", + "US/Pacific", + "UTC" + ] + }, + "pending_institution": { + "title": "Pending Institution", + "description": "Non-linkTo identifier of an institution this user wants associate with", + "type": "string", + "permission": "restricted_fields" + }, + "was_unauthorized": { + "title": "Was Unauthorized", + "permission": "restricted_fields", + "description": "Flag that is True if user was created with create-unauthorized-user endpoint", + "type": "boolean" + }, + "@id": { + "title": "ID", + "type": "string", + "calculatedProperty": true + }, + "@type": { + "title": "Type", + "type": "array", + "items": { + "type": "string" + }, + "calculatedProperty": true + }, + "principals_allowed": { + "title": "principals_allowed", + "description": "Calculated permissions used for ES filtering", + "type": "object", + "properties": { + "view": { + "type": "string" + }, + "edit": { + "type": "string" + } + }, + "calculatedProperty": true + }, + "display_title": { + "title": "Display Title", + "description": "A 
calculated title for every object in 4DN", + "type": "string", + "calculatedProperty": true + }, + "title": { + "title": "Title", + "type": "string", + "calculatedProperty": true + }, + "contact_email": { + "title": "Contact Email", + "description": "E-Mail address by which this person should be contacted.", + "type": "string", + "format": "email", + "calculatedProperty": true + } + }, + "columns": { + "user_institution.display_title": { + "title": "Institution" + }, + "project_roles.project": { + "title": "Project" + }, + "project_roles.role": { + "title": "Project Role", + "default_hidden": true + }, + "job_title": { + "title": "Job Title", + "default_hidden": true + } + }, + "facets": { + "user_institution.display_title": { + "title": "User's Institution" + }, + "project_roles.project": { + "title": "User's Project" + } + }, + "@type": [ + "JSONSchema" + ], + "rdfs:seeAlso": "/terms/User", + "children": [], + "rdfs:subClassOf": "/profiles/Item.json", + "isAbstract": false +} \ No newline at end of file diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index 66bb589bd..4569905e7 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -1,22 +1,33 @@ +# import contextlib import contextlib import json import os import pytest +import re # from collections import namedtuple from dcicutils import bundle_utils as bundle_utils_module, ff_utils as ff_utils_module from dcicutils.common import AnyJsonData from dcicutils.env_utils import EnvUtils, public_env_name -from dcicutils.misc_utils import is_uuid, local_attrs, NamedObject, AbstractVirtualApp +from dcicutils.misc_utils import ( + ignored, is_uuid, local_attrs, NamedObject, AbstractVirtualApp, to_snake_case, json_file_contents, +) from dcicutils.qa_utils import printed_output, mock_not_called, MockResponse from dcicutils.bundle_utils import ( # High-level interfaces - ItemManager, load_items, ITEM_MANAGER_REGISTRY, + # ItemManager, + inflate, check, + load_table_structures, + load_items, # ITEM_MANAGER_REGISTRY, # Low-level implementation - SchemaAutoloadMixin, - ItemTools, XlsxItemManager, - CsvItemManager, TsvItemManager, - # TypeHint, EnumHint, + # SchemaAutoloadMixin, + SchemaManager, + ItemTools, + TableInflater, TableChecker, TabbedItemTable, + # XlsxItemManager, + # CsvItemManager, TsvItemManager, + NumHint, + TypeHint, EnumHint, BoolHint, ) from dcicutils.sheet_utils import ( @@ -30,6 +41,7 @@ LoadArgumentsError, LoadTableError, # LoadFailure, # Utilities infer_tab_name_from_filename, # prefer_number, unwanted_kwargs, expand_string_escape_sequences, + load_table_set, ) from typing import Dict, Optional from unittest import mock @@ -130,40 +142,19 @@ def test_item_tools_parse_item_value_basic(): assert ItemTools.parse_item_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] -@pytest.mark.parametrize('instaguids_enabled', [True, False]) -def test_item_tools_parse_item_value_guids(instaguids_enabled): - - with local_attrs(ItemTools, INSTAGUIDS_ENABLED=instaguids_enabled): - - sample_simple_field_input = "#foo" - - parsed = ItemTools.parse_item_value(sample_simple_field_input) - assert parsed == sample_simple_field_input +def test_item_tools_parse_item_value_guids(): - context = {} - parsed = ItemTools.parse_item_value(sample_simple_field_input, context=context) - if instaguids_enabled: - assert is_uuid(parsed) - assert parsed == context[sample_simple_field_input] - else: - assert parsed == sample_simple_field_input - assert context == {} + sample_simple_field_input = 
"#foo" - sample_compound_field_input = '#foo|#bar' - sample_compound_field_list = ['#foo', '#bar'] + parsed = ItemTools.parse_item_value(sample_simple_field_input) + assert parsed == sample_simple_field_input - parsed = ItemTools.parse_item_value(sample_compound_field_input) - assert parsed == sample_compound_field_list + sample_compound_field_input = '#foo|#bar' + sample_compound_field_list = ['#foo', '#bar'] - context = {} - parsed = ItemTools.parse_item_value(sample_compound_field_input, context=context) - assert isinstance(parsed, list) - if instaguids_enabled: - assert all(is_uuid(x) for x in parsed) - assert '#foo' in context and '#bar' in context - else: - assert parsed == sample_compound_field_list - assert context == {} + parsed = ItemTools.parse_item_value(sample_compound_field_input) + assert isinstance(parsed, list) + assert parsed == sample_compound_field_list def test_item_tools_set_path_value(): @@ -224,75 +215,19 @@ def test_item_tools_find_type_hint(): assert actual is None -def test_item_manager_registry_manager_for_filename(): - assert ITEM_MANAGER_REGISTRY.manager_for_filename("xyz/foo.csv") == CsvItemManager - - with pytest.raises(Exception) as exc: - ITEM_MANAGER_REGISTRY.manager_for_filename("xyz/foo.something.missing") - assert str(exc.value) == "Unknown file type: xyz/foo.something.missing" - - -def test_xlsx_item_manager_load_content(): - it = XlsxItemManager(SAMPLE_XLSX_FILE, autoload_schemas=False) - assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT - - -def test_xlsx_item_manager_load(): - assert XlsxItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT - - -def test_xlsx_item_manager_load_csv(): - with pytest.raises(LoadArgumentsError) as exc: - XlsxItemManager.load(SAMPLE_CSV_FILE) - assert str(exc.value).startswith('The TableSetManager subclass XlsxItemManager' - ' expects only .xlsx filenames:') - - -def test_csv_item_manager_load_content(): - it = CsvItemManager(SAMPLE_CSV_FILE, autoload_schemas=False) - assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT +def test_load_table_structures(): + assert load_table_structures(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + assert load_table_structures(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + assert load_table_structures(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT - -def test_csv_item_manager_load(): - assert CsvItemManager.load(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT - - -def test_csv_item_manager_load_csv(): - with pytest.raises(LoadArgumentsError) as exc: - CsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) - assert str(exc.value).startswith('The TableSetManager subclass CsvItemManager' - ' expects only .csv filenames:') - - -def test_tsv_item_manager_load_content(): - it = TsvItemManager(SAMPLE_TSV_FILE, autoload_schemas=False) - assert it.load_content() == SAMPLE_TSV_FILE_ITEM_CONTENT - - -def test_tsv_item_manager_load(): - assert TsvItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT - - -def test_tsv_item_manager_load_csv(): - with pytest.raises(LoadArgumentsError) as exc: - TsvItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) - assert str(exc.value).startswith('The TableSetManager subclass TsvItemManager' - ' expects only .tsv or .tsv.txt filenames:') - - -def test_item_manager_load(): - assert ItemManager.load(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT - assert ItemManager.load(SAMPLE_CSV_FILE, 
autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT - assert ItemManager.load(SAMPLE_TSV_FILE, autoload_schemas=False) == SAMPLE_TSV_FILE_ITEM_CONTENT - - loaded = ItemManager.load(SAMPLE_JSON_TABS_FILE, autoload_schemas=False) + loaded = load_table_structures(SAMPLE_JSON_TABS_FILE) print("loaded=", json.dumps(loaded, indent=2)) expected = SAMPLE_JSON_TABS_FILE_ITEM_CONTENT print("expected=", json.dumps(expected, indent=2)) assert loaded == expected with pytest.raises(LoadArgumentsError) as exc: - ItemManager.load("something.else") + load_table_structures("something.else") assert str(exc.value) == "Unknown file type: something.else" @@ -495,50 +430,28 @@ def test_load_items_with_schema_and_instaguids(instaguids_enabled): assert actual == expected # no substitution performed -class SchemaAutoloaderForTesting(SchemaAutoloadMixin): - - def __init__(self, **kwargs): - super().__init__(filename='ignored.file.name', **kwargs) - - -@contextlib.contextmanager -def schema_autoloader_for_testing(**kwargs) -> SchemaAutoloadMixin: - autoloader: Optional[SchemaAutoloadMixin] = None - success = False - try: - autoloader: SchemaAutoloadMixin = SchemaAutoloaderForTesting(**kwargs) - assert autoloader.SCHEMA_CACHE == {}, "The schema cache is not clean." - yield autoloader - success = True - finally: - if autoloader is not None: - autoloader.clear_schema_cache() - assert autoloader.SCHEMA_CACHE == SchemaAutoloadMixin.SCHEMA_CACHE == {} - if not success: - raise - - @using_fresh_ff_state_for_testing() @pytest.mark.integrated @pytest.mark.parametrize('portal_env', [None, 'data']) def test_schema_autoload_mixin_caching(portal_env): - with schema_autoloader_for_testing(portal_env=portal_env) as autoloader: + with SchemaManager.fresh_schema_manager_context_for_testing(): + + schema_manager = SchemaManager(portal_env=portal_env) - assert autoloader.portal_env == 'data' # it should have defaulted even if we didn't supply it + assert schema_manager.portal_env == 'data' # it should have defaulted even if we didn't supply it - assert autoloader.SCHEMA_CACHE == SchemaAutoloadMixin.SCHEMA_CACHE == {} + assert SchemaManager.SCHEMA_CACHE == {} sample_schema_name = 'foo' sample_schema = {'mock_schema_for': 'foo'} with mock.patch.object(bundle_utils_module, "get_schema") as mock_get_schema: mock_get_schema.return_value = sample_schema - assert autoloader.fetch_schema(sample_schema_name, portal_env=autoloader.portal_env) == sample_schema + assert schema_manager.fetch_schema(sample_schema_name, portal_env=schema_manager.portal_env) == sample_schema schema_cache_with_sample_schema = {sample_schema_name: sample_schema} - assert SchemaAutoloadMixin.SCHEMA_CACHE == schema_cache_with_sample_schema - assert autoloader.SCHEMA_CACHE == schema_cache_with_sample_schema + assert SchemaManager.SCHEMA_CACHE == schema_cache_with_sample_schema @using_fresh_ff_state_for_testing() @@ -546,11 +459,13 @@ def test_schema_autoload_mixin_caching(portal_env): @pytest.mark.parametrize('portal_env', [None, 'data']) def test_schema_autoload_mixin_fetch_schema(portal_env): - with schema_autoloader_for_testing(portal_env=portal_env) as autoloader: + with SchemaManager.fresh_schema_manager_context_for_testing(): + + schema_manager = SchemaManager(portal_env=portal_env) - assert autoloader.portal_env == 'data' + assert schema_manager.portal_env == 'data' - user_schema = autoloader.fetch_schema('user', portal_env=autoloader.portal_env) + user_schema = schema_manager.fetch_schema('user', portal_env=schema_manager.portal_env) assert user_schema['$id'] 
== '/profiles/user.json' assert user_schema['title'] == 'User' @@ -559,34 +474,23 @@ def test_schema_autoload_mixin_fetch_schema(portal_env): @using_fresh_ff_state_for_testing() @pytest.mark.integrated -@pytest.mark.parametrize('autoload_schemas', [True, False]) -@pytest.mark.parametrize('cache_schemas', [True, False]) @pytest.mark.parametrize('portal_env', [None, 'data']) -def test_schema_autoload_mixin_fetch_relevant_schemas(autoload_schemas, cache_schemas, portal_env): +def test_schema_autoload_mixin_fetch_relevant_schemas(portal_env): with printed_output() as printed: - with local_attrs(SchemaAutoloadMixin, CACHE_SCHEMAS=cache_schemas): - with schema_autoloader_for_testing(portal_env=portal_env, autoload_schemas=autoload_schemas) as autoloader: - - assert autoloader.portal_env == ('data' if autoload_schemas or portal_env else None) - - if autoload_schemas: + with SchemaManager.fresh_schema_manager_context_for_testing(): + schema_manager = SchemaManager(portal_env=portal_env) + schemas = schema_manager.fetch_relevant_schemas(['User', 'Lab']) + assert isinstance(schemas, dict) + assert len(schemas) == 2 + assert set(schemas.keys()) == {'User', 'Lab'} - schemas = autoloader.fetch_relevant_schemas(['User', 'Lab']) - assert isinstance(schemas, dict) - assert len(schemas) == 2 - assert set(schemas.keys()) == {'User', 'Lab'} - - else: - - assert autoloader.fetch_relevant_schemas(['User', 'Lab']) == {} - - if portal_env == 'data' or not autoload_schemas: - assert printed.lines == [] - else: - assert printed.lines == [ - "The portal_env was not explicitly supplied. Schemas will come from portal_env='data'." - ] + if portal_env == 'data': + assert printed.lines == [] + else: + assert printed.lines == [ + "The portal_env was not explicitly supplied. Schemas will come from portal_env='data'." 
+ ] SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_for_real_schemas.csv') @@ -598,38 +502,38 @@ def test_workbook_with_schemas(): print() # start o a fresh line - SchemaAutoloadMixin.clear_schema_cache() - - actual_data = CsvManager(filename=SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, tab_name='ExperimentSeq').load_content() - expected_data = { - "ExperimentSeq": [ - { - "accession": "foo", - "fragment_size_selection_method": "spri" - }, - { - "accession": "bar", - "fragment_size_selection_method": "blue" - } - ] - } - assert actual_data == expected_data - - actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, - tab_name='ExperimentSeq', autoload_schemas=True) - expected_items = { - "ExperimentSeq": [ - { - "accession": "foo", - "fragment_size_selection_method": "SPRI beads" - }, - { - "accession": "bar", - "fragment_size_selection_method": "BluePippin" - } - ] - } - assert actual_items == expected_items + with SchemaManager.fresh_schema_manager_context_for_testing(): + + actual_data = load_table_set(filename=SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, tab_name='ExperimentSeq') + expected_data = { + "ExperimentSeq": [ + { + "accession": "foo", + "fragment_size_selection_method": "spri" + }, + { + "accession": "bar", + "fragment_size_selection_method": "blue" + } + ] + } + assert actual_data == expected_data + + actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, + tab_name='ExperimentSeq', autoload_schemas=True) + expected_items = { + "ExperimentSeq": [ + { + "accession": "foo", + "fragment_size_selection_method": "SPRI beads" + }, + { + "accession": "bar", + "fragment_size_selection_method": "BluePippin" + } + ] + } + assert actual_items == expected_items @using_fresh_ff_state_for_testing() @@ -638,47 +542,183 @@ def test_workbook_with_schemas_and_portal_vapp(): print() # start on a fresh line - SchemaAutoloadMixin.clear_schema_cache() + with SchemaManager.fresh_schema_manager_context_for_testing(): - portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) + portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) - experiment_seq_schema = ff_utils_module.get_schema('ExperimentSeq', portal_env=portal_env) + experiment_seq_schema = ff_utils_module.get_schema('ExperimentSeq', portal_env=portal_env) - expected_items = { - "ExperimentSeq": [ - { - "accession": "foo", - "fragment_size_selection_method": "SPRI beads" - }, - { - "accession": "bar", - "fragment_size_selection_method": "BluePippin" - } - ] - } + expected_items = { + "ExperimentSeq": [ + { + "accession": "foo", + "fragment_size_selection_method": "SPRI beads" + }, + { + "accession": "bar", + "fragment_size_selection_method": "BluePippin" + } + ] + } + + class MockVapp(NamedObject, AbstractVirtualApp): + + def __init__(self, name): + super().__init__(name=name) + self.call_count = 0 + + def get(self, path_url): + assert path_url.startswith('/profiles/ExperimentSeq.json?') + self.call_count += 1 + response = MockResponse(200, json=experiment_seq_schema) + return response + + portal_vapp = MockVapp(name=f'MockVapp[{portal_env}]') - class MockVapp(NamedObject, AbstractVirtualApp): + old_count = portal_vapp.call_count - def __init__(self, name): - super().__init__(name=name) - self.call_count = 0 + with mock.patch.object(ff_utils_module, "get_authentication_with_server", + mock_not_called("get_authentication_with_server")): + with mock.patch.object(ff_utils_module, "get_metadata", + mock_not_called("get_metadata")): + actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, + 
tab_name='ExperimentSeq', autoload_schemas=True, portal_vapp=portal_vapp) - def get(self, path_url): - assert path_url.startswith('/profiles/ExperimentSeq.json?') - self.call_count += 1 - response = MockResponse(200, json=experiment_seq_schema) - return response + assert portal_vapp.call_count == old_count + 1 + assert actual_items == expected_items - portal_vapp = MockVapp(name=f'MockVapp[{portal_env}]') - old_count = portal_vapp.call_count +_SAMPLE_SCHEMA_DIR = os.path.join(TEST_DIR, "data_files", "sample_schemas") +_SAMPLE_INSERTS_DIR = os.path.join(TEST_DIR, "data_files", "sample_inserts") +_SAMPLE_INSERTS = load_table_set(_SAMPLE_INSERTS_DIR) - with mock.patch.object(ff_utils_module, "get_authentication_with_server", - mock_not_called("get_authentication_with_server")): - with mock.patch.object(ff_utils_module, "get_metadata", - mock_not_called("get_metadata")): - actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, - tab_name='ExperimentSeq', autoload_schemas=True, portal_vapp=portal_vapp) +ID_NAME_PATTERN = re.compile("^/?([^/]*)/([^/]*)/?$") + + +@contextlib.contextmanager +def mocked_schemas(mock_remotes=None, expected_portal_env=None, expected_portal_vapp=None): + def mocked_get_schema(schema_name, portal_env=None, portal_vapp=None): + if expected_portal_env is not None: + assert portal_env == expected_portal_env, (f"get_schema got ff_env={portal_env!r}," + f" but expected ff_env={expected_portal_env!r}.") + if expected_portal_vapp is not None: + assert portal_vapp == expected_portal_vapp, (f"get_schema got portal_vapp={portal_vapp!r}," + f" but expected portal_vapp={expected_portal_vapp!r}.") + snake_name = to_snake_case(schema_name) + schema_file = os.path.join(_SAMPLE_SCHEMA_DIR, f"{snake_name}.json") + if os.path.exists(schema_file): + return json_file_contents(schema_file) + else: + return None + + def mocked_get_metadata(obj_id, key=None, ff_env=None, check_queue=False, add_on=''): + ignored(key, ff_env, check_queue, add_on) + parts = ID_NAME_PATTERN.match(obj_id) + assert parts, f"mocked_get_metadata got {obj_id}, but expected //" + item_type, item_ref = parts.groups() + return _SAMPLE_INSERTS_LOOKUP_TABLE.contains_ref(item_type=item_type, item_ref=item_ref) + + with mock.patch.object(bundle_utils_module, "get_schema") as mock_get_schema: + mock_get_schema.side_effect = mocked_get_schema + with mock.patch.object(bundle_utils_module, "get_metadata") as mock_get_metadata: + mock_get_metadata.side_effect = mocked_get_metadata + _SAMPLE_INSERTS_LOOKUP_TABLE = TabbedItemTable(_SAMPLE_INSERTS) + yield + + +SAMPLE_PROJECT_UUID = "dac6d5b3-6ef6-4271-9715-a78329acf846" +SAMPLE_PROJECT_NAME = 'test-project' +SAMPLE_PROJECT_TITLE = SAMPLE_PROJECT_NAME.title().replace('-', ' ') +SAMPLE_PROJECT = { + "title": SAMPLE_PROJECT_TITLE, + "uuid": SAMPLE_PROJECT_UUID, + "description": f"This is the {SAMPLE_PROJECT_TITLE}.", + "name": SAMPLE_PROJECT_NAME, + "status": "shared", + "date_created": "2020-11-24T20:46:00.000000+00:00", +} +SAMPLE_PROJECT_SANS_UUID = SAMPLE_PROJECT.copy() # to be modified on next line +SAMPLE_PROJECT_SANS_UUID.pop('uuid') + +SAMPLE_INSTITUTION_UUID = "87199845-51b5-4352-bdea-583edae4bb6a" +SAMPLE_INSTITUTION_NAME = "cgap-backend-team" +SAMPLE_INSTITUTION_TITLE = SAMPLE_INSTITUTION_NAME.title().replace('-', ' ') +SAMPLE_INSTITUTION = { + "name": SAMPLE_INSTITUTION_NAME, + "title": SAMPLE_INSTITUTION_TITLE, + "status": "shared", + "uuid": SAMPLE_INSTITUTION_UUID, +} +SAMPLE_INSTITUTION_SANS_UUID = SAMPLE_INSTITUTION.copy() # to be modified on next line 
+SAMPLE_INSTITUTION_SANS_UUID.pop('uuid') + +SAMPLE_USER_EMAIL = "jdoe@example.com" +SAMPLE_USER_FIRST_NAME = "Jenny" +SAMPLE_USER_LAST_NAME = "Doe" +SAMPLE_USER_ROLE = "developer" +SAMPLE_USER_UUID = "e0dec518-cb0c-45f3-8c97-21b2659ec129" +SAMPLE_USER_WITH_UUID_REFS = { + "email": SAMPLE_USER_EMAIL, + "first_name": SAMPLE_USER_FIRST_NAME, + "last_name": SAMPLE_USER_LAST_NAME, + "uuid": SAMPLE_USER_UUID, + "project": SAMPLE_PROJECT_UUID, + "project_roles#0.project": SAMPLE_PROJECT_UUID, + "project_roles#0.role": SAMPLE_USER_ROLE, + "user_institution": SAMPLE_INSTITUTION_UUID, +} +SAMPLE_USER_WITH_NAME_REFS = { + "email": SAMPLE_USER_EMAIL, + "first_name": SAMPLE_USER_FIRST_NAME, + "last_name": SAMPLE_USER_LAST_NAME, + "uuid": SAMPLE_USER_UUID, + "project": SAMPLE_PROJECT_NAME, + "project_roles#0.project": SAMPLE_PROJECT_NAME, + "project_roles#0.role": SAMPLE_USER_ROLE, + "user_institution": SAMPLE_INSTITUTION_NAME, +} + + +def test_table_checker(): + + print() # start on a fresh line + + mock_ff_env = 'some-env' + + with mocked_schemas(mock_remotes=False): + + # # Here the User refers to project and institution by UUID, but we don't have the UUID in our + # sample_workbook_with_unmatched_uuid_refs = { + # "User": [SAMPLE_USER_WITH_UUID_REFS], + # "Project": [SAMPLE_PROJECT_SANS_UUID], + # "Institution": [SAMPLE_INSTITUTION_SANS_UUID], + # } + # + # with printed_output() as printed: + # with pytest.raises(Exception) as exc: + # checker = TableChecker(sample_workbook_with_unmatched_uuid_refs, portal_env=mock_ff_env) + # checker.check_tabs() + # assert str(exc.value) == "There were 2 problems while compiling hints." + # assert printed.lines == [ + # f"Problem: User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}", + # (f"Problem: User[0].user_institution: Unable to validate Institution reference:" + # f" {SAMPLE_INSTITUTION_UUID!r}") + # ] + + sample_workbook_with_matched_uuid_refs = { + "User": [SAMPLE_USER_WITH_UUID_REFS], + "Project": [SAMPLE_PROJECT], + "Institution": [SAMPLE_INSTITUTION], + } - assert portal_vapp.call_count == old_count + 1 - assert actual_items == expected_items + checker = TableChecker(sample_workbook_with_matched_uuid_refs, portal_env=mock_ff_env) + checker.check_tabs() + + # sample_workbook_with_name_refs = { + # "User": [SAMPLE_USER_WITH_NAME_REFS], + # "Project": [SAMPLE_PROJECT], + # "Institution": [SAMPLE_INSTITUTION], + # } + # + # checker = TableChecker(sample_workbook_with_name_refs, portal_env=mock_ff_env) + # checker.check_tabs() From f9c38a0618402571b5d4ff1606123452ddb9c444 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 20 Oct 2023 13:13:05 -0400 Subject: [PATCH 075/101] WIP --- dcicutils/bundle_utils.py | 302 +++++++++++----------- dcicutils/lang_utils.py | 8 +- dcicutils/sheet_utils.py | 43 +++- dcicutils/validation_utils.py | 287 +++++++++++++++++++++ test/data_files/sample_items.tabs.json | 5 +- test/data_files/sample_items.tabs.yaml | 4 +- test/helpers_for_bundles.py | 70 +++++ test/test_bundle_utils.py | 337 +++++++++++-------------- test/test_sheet_utils.py | 54 ++-- test/test_validation_utils.py | 107 ++++++++ 10 files changed, 828 insertions(+), 389 deletions(-) create mode 100644 dcicutils/validation_utils.py create mode 100644 test/helpers_for_bundles.py create mode 100644 test/test_validation_utils.py diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 9cac46157..18f342c02 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -1,17 +1,15 @@ -import 
contextlib import copy -# import os -# import uuid -# import warnings +import jsonschema from typing import Any, Dict, List, Optional, Union # , Type from .common import AnyJsonData # , Regexp, CsvReader from .env_utils import EnvUtils, public_env_name -from .ff_utils import get_schema, get_metadata +from .ff_utils import get_metadata # , get_schema from .lang_utils import there_are from .misc_utils import AbstractVirtualApp, ignored, PRINT, to_camel_case from .sheet_utils import ( - TabbedSchemas, LoadTableError, prefer_number, + TabbedJsonSchemas, LoadTableError, prefer_number, Header, Headers, TabbedHeaders, ParsedHeader, ParsedHeaders, TabbedParsedHeaders, SheetCellValue, TabbedSheetData, # SheetRow, SheetData, @@ -20,9 +18,15 @@ # SimpleJsonInsertsManager, SimpleYamlInsertsManager, SimpleJsonLinesInsertsManager, # TabbedJsonInsertsManager, TabbedYamlInsertsManager, # InsertsDirectoryManager, + InsertsManager, load_table_set ) -from .task_utils import pmap +# from .task_utils import pmap +from .validation_utils import SchemaManager + + +PatchPrototype = Dict +TabbedPatchPrototypes = Dict[str, PatchPrototype] # @contextlib.contextmanager @@ -236,7 +240,10 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: Parsed INSTAGUIDS_ENABLED = False # Experimental feature not enabled by default @classmethod - def parse_item_value(cls, value: SheetCellValue) -> AnyJsonData: + def parse_item_value(cls, value: SheetCellValue, + apply_heuristics: bool = False, split_pipe: bool = False) -> AnyJsonData: + if not apply_heuristics: + return value # TODO: Remodularize this for easier testing and more Schema-driven effect # Doug asks that this be broken up into different mechanisms, more modular and separately testable. # I pretty much agree with that. I'm just waiting for suggestions on what kinds of features are desired. @@ -249,18 +256,18 @@ def parse_item_value(cls, value: SheetCellValue) -> AnyJsonData: return False elif lvalue == 'null' or lvalue == '': return None - elif '|' in value: + elif split_pipe and '|' in value: if value == '|': # Use '|' for [] return [] else: if value.endswith("|"): # Use 'foo|' for ['foo'] value = value[:-1] - return [cls.parse_item_value(subvalue) for subvalue in value.split('|')] + return [cls.parse_item_value(subvalue, apply_heuristics=apply_heuristics, split_pipe=split_pipe) + for subvalue in value.split('|')] else: # Doug points out that the schema might not agree, might want a string representation of a number. # At this semantic layer, this might be a bad choice. - # return prefer_number(value) - return value + return prefer_number(value) else: # presumably a number (int or float) return value @@ -310,80 +317,71 @@ def finder(subheader, subschema): return finder(subheader=parsed_header, subschema=schema) -class SchemaManager: - - SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. - - @classmethod - @contextlib.contextmanager - def fresh_schema_manager_context_for_testing(cls): - old_schema_cache = cls.SCHEMA_CACHE - try: - cls.SCHEMA_CACHE = {} - yield - finally: - cls.SCHEMA_CACHE = old_schema_cache - - def __init__(self, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): - if portal_env is None and portal_vapp is None: - portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) - PRINT(f"The portal_env was not explicitly supplied. 
Schemas will come from portal_env={portal_env!r}.") - self.portal_env = portal_env - self.portal_vapp = portal_vapp - - def fetch_relevant_schemas(self, schema_names: List[str], schemas: Optional[TabbedSchemas] = None): - if schemas is None: - schemas = {} - # The schema_names argument is not normally given, but it is there for easier testing - def fetch_schema(schema_name): - schema = schemas.get(schema_name) - schema = (self.fetch_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) - if schema is None - else schema) - return schema_name, schema - return {schema_name: schema - for schema_name, schema in pmap(fetch_schema, schema_names)} - - @classmethod - def schema_exists(cls, schema_name: str, *, portal_env: Optional[str] = None, - portal_vapp: Optional[AbstractVirtualApp] = None): - return bool(cls.fetch_schema(schema_name=schema_name, portal_env=portal_env, portal_vapp=portal_vapp)) - - @classmethod - def fetch_schema(cls, schema_name: str, *, portal_env: Optional[str] = None, - portal_vapp: Optional[AbstractVirtualApp] = None): - schema: Optional[AnyJsonData] = cls.SCHEMA_CACHE.get(schema_name) - if schema is None and schema_name not in cls.SCHEMA_CACHE: # If None is already stored, don't look it up again - schema = get_schema(schema_name, portal_env=portal_env, portal_vapp=portal_vapp) - cls.SCHEMA_CACHE[schema_name] = schema - return schema - - @classmethod - def clear_schema_cache(cls): - for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first - cls.SCHEMA_CACHE.pop(key, None) +# class SchemaManager: +# +# SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. +# +# @classmethod +# @contextlib.contextmanager +# def fresh_schema_manager_context_for_testing(cls): +# old_schema_cache = cls.SCHEMA_CACHE +# try: +# cls.SCHEMA_CACHE = {} +# yield +# finally: +# cls.SCHEMA_CACHE = old_schema_cache +# +# def __init__(self, schemas: Optional[TabbedSchemas] = None, +# portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): +# if portal_env is None and portal_vapp is None: +# portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) +# PRINT(f"The portal_env was not explicitly supplied. 
Schemas will come from portal_env={portal_env!r}.") +# self.portal_env = portal_env +# self.portal_vapp = portal_vapp +# self.schemas = {} if schemas is None else schemas.copy() +# +# def fetch_relevant_schemas(self, schema_names: List[str]): # , schemas: Optional[TabbedSchemas] = None): +# # if schemas is None: +# # schemas = self.schemas +# # The schema_names argument is not normally given, but it is there for easier testing +# def fetch_schema(schema_name): +# cached_schema = self.schemas.get(schema_name) # schemas.get(schema_name) +# schema = self.fetch_schema(schema_name) if cached_schema is None else cached_schema +# return schema_name, schema +# return {schema_name: schema +# for schema_name, schema in pmap(fetch_schema, schema_names)} +# +# def schema_exists(self, schema_name: str): +# return bool(self.fetch_schema(schema_name=schema_name)) +# +# def fetch_schema(self, schema_name: str): +# schema: Optional[AnyJsonData] = self.SCHEMA_CACHE.get(schema_name) +# if schema is None and schema_name not in self.SCHEMA_CACHE: # If None is already stored, don't look it up again +# schema = get_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) +# self.SCHEMA_CACHE[schema_name] = schema +# return schema +# +# @classmethod +# def clear_schema_cache(cls): +# for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first +# cls.SCHEMA_CACHE.pop(key, None) +# +# def identifying_properties(self, schema=None, schema_name=None, among: Optional[List[str]] = None): +# schema = schema if schema is not None else self.fetch_schema(schema_name) +# possible_identifying_properties = set(schema.get("identifyingProperties") or []) | {'uuid'} +# identifying_properties = sorted(possible_identifying_properties +# if among is None +# else (prop +# for prop in among +# if prop in possible_identifying_properties)) +# return identifying_properties ITEM_MANAGER_REGISTRY = TableSetManagerRegistry() -PatchPrototype = Dict -TabbedPatchPrototypes = Dict[str, PatchPrototype] - - -def extract_tabbed_headers(data: TabbedSheetData) -> TabbedHeaders: - result: TabbedHeaders = {} - for tab, rows in data.items(): - if rows: - # Data is homogeneous, so whatever the headers for the first row should be the same for all - headers: List[str] = list(rows[0].keys()) - else: - # If there's no data in the tab, there are also no headers - headers: List[str] = [] - result[tab] = headers - return result -class TableInflater: +class InflatableTabbedDataManager: """ This tool can be used independently of the item tools. It doesn't involve schemas, but it does allow the inflation of a table with dotted names to structures. e.g., a table with headers mother.name, mother.age, @@ -397,9 +395,10 @@ class TableInflater: Note, too, that although data != inflate(data), once inflated, inflate(inflate(data)) == inflate(data). 
""" - def __init__(self, tabbed_sheet_data: TabbedSheetData): + def __init__(self, tabbed_sheet_data: TabbedSheetData, apply_heuristics: bool = False): self.tabbed_sheet_data: TabbedSheetData = tabbed_sheet_data - self.headers_by_tab_name: TabbedHeaders = extract_tabbed_headers(tabbed_sheet_data) + self.apply_heuristics = apply_heuristics + self.headers_by_tab_name: TabbedHeaders = InsertsManager.extract_tabbed_headers(tabbed_sheet_data) self.parsed_headers_by_tab_name: TabbedParsedHeaders = { tab_name: ItemTools.parse_sheet_headers(headers) for tab_name, headers in self.headers_by_tab_name.items() @@ -424,55 +423,77 @@ def inflate_tab(self, tab_name: str): for row in self.tabbed_sheet_data[tab_name]] return result - @classmethod - def inflate_row(cls, row: Dict, *, prototype: Dict, parsed_headers: ParsedHeaders): + def inflate_row(self, row: Dict, *, prototype: Dict, parsed_headers: ParsedHeaders): patch_item = copy.deepcopy(prototype) for column_number, column_value in enumerate(row.values()): - parsed_value = ItemTools.parse_item_value(column_value) + parsed_value = ItemTools.parse_item_value(column_value, apply_heuristics=self.apply_heuristics) ItemTools.set_path_value(patch_item, parsed_headers[column_number], parsed_value) return patch_item @classmethod - def inflate(cls, tabbed_sheet_data: TabbedSheetData): - inflater = cls(tabbed_sheet_data) + def inflate(cls, tabbed_sheet_data: TabbedSheetData, apply_heuristics: bool = False): + inflater = cls(tabbed_sheet_data, apply_heuristics=apply_heuristics) inflated = inflater.inflate_tabs() return inflated -inflate = TableInflater.inflate +inflate = InflatableTabbedDataManager.inflate -def load_table_structures(filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, - prefer_number: bool = True, **kwargs): +def load_table_structures(filename: str, *, apply_heuristics: bool = True, + tab_name: Optional[str] = None, escaping: Optional[bool] = None, **kwargs): """This differs from load_table_set only in that it inflates the content. 
It does not apply schemas.""" - tabbed_rows = load_table_set(filename=filename, tab_name=tab_name, escaping=escaping, prefer_number=prefer_number, - **kwargs) - tabbed_structures = inflate(tabbed_rows) + tabbed_rows = load_table_set(filename=filename, tab_name=tab_name, escaping=escaping, **kwargs) + tabbed_structures = inflate(tabbed_rows, apply_heuristics=apply_heuristics) return tabbed_structures -class TabbedItemTable: +class TableChecker(InflatableTabbedDataManager, TypeHintContext): - def __init__(self, tabbed_sheet_data: TabbedSheetData, - portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): + def __init__(self, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedJsonSchemas] = None, + portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None, + apply_heuristics: bool = False): + + if portal_env is None and portal_vapp is None: + portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) + # InflatableTabbedDataManager supplies: + # self.tabbed_sheet_data: TabbedSheetData = + # self.headers_by_tab_name: TabbedHeaders = + # self.parsed_headers_by_tab_name: TabbedParsedHeaders = + # self.patch_prototypes_by_tab_name: TabbedPatchPrototypes = + self._problems: List[str] = [] + super().__init__(tabbed_sheet_data=tabbed_sheet_data, apply_heuristics=apply_heuristics) self.portal_env = portal_env self.portal_vapp = portal_vapp - self.headers_by_tab_name: Dict[str, str] = { - tab_name: list(rows[0].keys()) if rows else [] - for tab_name, rows in tabbed_sheet_data.items() - } + self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, + schemas=schemas) + self.schemas = self.schema_manager.fetch_relevant_schemas(self.tab_names) # , schemas=schemas) self.lookup_tables_by_tab_name: Dict[str, Dict[str, Dict]] = { tab_name: self.build_lookup_table_for_tab(tab_name, rows=rows) for tab_name, rows in tabbed_sheet_data.items() } + self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = { + tab_name: self.compile_type_hints(tab_name) + for tab_name in self.tab_names + } + + def schema_for_tab(self, tab_name: str) -> dict: + # Once our class is initialized, every tab should have a schema, even if just {} + schema = self.schemas.get(tab_name) + if schema is None: + raise ValueError(f"No schema was given or fetched for tab {tab_name!r}.") + return schema + + def note_problem(self, problem: str): + self._problems.append(problem) def build_lookup_table_for_tab(self, tab_name: str, *, rows: List[Dict]) -> Dict[str, Dict]: - # TODO: It might be enough to just return the keys as a set, not a full dict - schema = get_schema(tab_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) - possible_identifying_properties = set(schema.get("identifyingProperties") or []) | {'uuid'} - identifying_properties = [prop - for prop in self.headers_by_tab_name[tab_name] - if prop in possible_identifying_properties] + # schema = self.schema_for_tab(tab_name) + # possible_identifying_properties = set(schema.get("identifyingProperties") or []) | {'uuid'} + # identifying_properties = [prop + # for prop in self.headers_by_tab_name[tab_name] + # if prop in possible_identifying_properties] + identifying_properties = self.schema_manager.identifying_properties(schema_name=tab_name) if not identifying_properties: # Maybe issue a warning here that we're going to lose empty_lookup_table: Dict[str, Dict] = {} @@ -499,30 +520,6 @@ def resolve_ref(self, item_type, item_ref): else: # Apparently some stray type not in our tables 
return None - -class TableChecker(TableInflater, TypeHintContext): - - def __init__(self, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedSchemas] = None, - portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): - self.portal_env = portal_env - self.portal_vapp = portal_vapp - self._problems: List[str] = [] - super().__init__(tabbed_sheet_data=tabbed_sheet_data) - self.schema_manager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp) - self.schemas = self.schema_manager.fetch_relevant_schemas(self.tab_names, schemas=schemas) - self.type_hints_by_tab_name: Dict[str, OptionalTypeHints] = { - tab_name: self.compile_type_hints(tab_name) - for tab_name in self.tab_names - } - self.tabbed_item_table = TabbedItemTable(tabbed_sheet_data, portal_env=portal_env, portal_vapp=portal_vapp) - # self.lookup_tables_by_tab_name: Dict[str, Dict[str, Dict]] = { - # tab_name: self.build_lookup_table_for_tab(tab_name, rows=rows) - # for tab_name, rows in tabbed_sheet_data.items() - # } - - def note_problem(self, problem: str): - self._problems.append(problem) - def raise_any_pending_problems(self): problems = self._problems if problems: @@ -540,12 +537,8 @@ def check_tabs(self): return result def validate_ref(self, item_type, item_ref): - if self.tabbed_item_table.contains_ref(item_type=item_type, item_ref=item_ref): + if self.contains_ref(item_type=item_type, item_ref=item_ref): return True - # lookup_table = self.lookup_tables_by_tab_name.get(item_type) - # if lookup_table: - # if item_ref in lookup_table: - # return True try: info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}") # Basically return True if there's a value at all, @@ -557,25 +550,6 @@ def validate_ref(self, item_type, item_ref): def schema_exists(self, schema_name: str) -> bool: return self.schema_manager.schema_exists(schema_name) - # def build_lookup_table_for_tab(self, tab_name: str, *, rows: List[Dict]) -> Dict[str, Dict]: - # # TODO: It might be enough to just return the keys as a set, not a full dict - # schema = get_schema(tab_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) - # possible_identifying_properties = set(schema.get("identifyingProperties") or []) | {'uuid'} - # identifying_properties = [prop - # for prop in self.headers_by_tab_name[tab_name] - # if prop in possible_identifying_properties] - # if not identifying_properties: - # # Maybe issue a warning here that we're going to lose - # empty_lookup_table: Dict[str, Dict] = {} - # return empty_lookup_table - # lookup_table: Dict[str, Dict] = {} - # for row in rows: - # for identifying_property in identifying_properties: - # value = row.get(identifying_property) - # if value is not '' and value is not None: - # lookup_table[str(value)] = row - # return lookup_table - def check_tab(self, tab_name: str): prototype = self.patch_prototypes_by_tab_name[tab_name] parsed_headers = self.parsed_headers_by_tab_name[tab_name] @@ -589,7 +563,7 @@ def check_row(self, row: Dict, *, tab_name: str, row_number: int, prototype: Dic parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints): patch_item = copy.deepcopy(prototype) for column_number, column_value in enumerate(row.values()): - parsed_value = ItemTools.parse_item_value(column_value) + parsed_value = ItemTools.parse_item_value(column_value, apply_heuristics=self.apply_heuristics) type_hint = type_hints[column_number] if type_hint: try: @@ -602,8 +576,11 @@ def check_row(self, row: Dict, *, tab_name: str, row_number: int, prototype: Dic return patch_item 
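(Illustrative sketch, not itself part of the patch: roughly how the inflation step above behaves, assuming the dotted / '#'-indexed header convention used throughout this diff; the tab, columns, and values below are hypothetical.)

    from dcicutils.bundle_utils import inflate

    # Flat rows as load_table_set would return them: '.' in a column name descends
    # into a sub-dict and '#<n>' indexes into a list (ItemTools.parse_sheet_headers).
    tabbed_rows = {"Person": [{"name": "alice", "age": "33", "pet.species": "cat"}]}

    inflate(tabbed_rows, apply_heuristics=True)
    # => {"Person": [{"name": "alice", "age": 33, "pet": {"species": "cat"}}]}
    # With apply_heuristics left at its new default (False), "33" would remain a
    # string; heuristic coercion now has to be requested explicitly.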
@classmethod - def check(cls, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedSchemas] = None, **kwargs): - checker = cls(tabbed_sheet_data, schemas=schemas, **kwargs) + def check(cls, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedJsonSchemas] = None, + apply_heuristics: bool = False, + portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): + checker = cls(tabbed_sheet_data, schemas=schemas, apply_heuristics=apply_heuristics, + portal_env=portal_env, portal_vapp=portal_vapp) checked = checker.check_tabs() return checked @@ -638,8 +615,15 @@ def create_tab_processor_state(self, tab_name: str) -> SheetState: def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, - schemas: Optional[TabbedSchemas] = None, **kwargs): + schemas: Optional[TabbedJsonSchemas] = None, apply_heuristics: bool = False, + portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None, + validate: bool = False, **kwargs): tabbed_rows = load_table_set(filename=filename, tab_name=tab_name, escaping=escaping, prefer_number=False, **kwargs) - checked_items = check(tabbed_rows, schemas=schemas) + checked_items = check(tabbed_rows, schemas=schemas, portal_env=portal_env, portal_vapp=portal_vapp, + apply_heuristics=apply_heuristics) + if validate: + raise NotImplementedError("Need to implement validation.") # TODO: Implement validation return checked_items + + diff --git a/dcicutils/lang_utils.py b/dcicutils/lang_utils.py index 753241120..de020f265 100644 --- a/dcicutils/lang_utils.py +++ b/dcicutils/lang_utils.py @@ -352,7 +352,7 @@ def _conjugate_be(cls, count, tense): def there_are(cls, items, *, kind: str = "thing", count: Optional[int] = None, there: str = "there", capitalize=True, joiner=None, zero: object = "no", punctuate=None, punctuate_none=None, use_article=False, show=True, context=None, tense='present', punctuation_mark: str = ".", - **joiner_options) -> str: + just_are=False, **joiner_options) -> str: """ Constructs a sentence that enumerates a set of things. @@ -372,6 +372,7 @@ def there_are(cls, items, *, kind: str = "thing", count: Optional[int] = None, t :param show: whether to show the items if there are any (default True) :param context: an optional prepositional phrase indicating the context of the item(s) (default None) :param tense: one of 'past', 'present', 'future', 'conditional', or 'hypothetical' for the verbs used + :param just_are: whether to stop at "There is" or "There are" without anything else. By far the most common uses are likely to be: @@ -403,7 +404,10 @@ def there_are(cls, items, *, kind: str = "thing", count: Optional[int] = None, t n = len(items) if count is None else count # If the items is not in the tenses table, it's assumed to be a modal like 'might', 'may', 'must', 'can' etc. 
is_or_are = cls._conjugate_be(count=n, tense=tense) - part1 = f"{there} {is_or_are} {n_of(n, kind, num_format=lambda n, thing: zero if n == 0 else None)}" + part0 = f"{there} {is_or_are}" + if just_are: + return part0 + part1 = f"{part0} {n_of(n, kind, num_format=lambda n, thing: zero if n == 0 else None)}" if context: part1 += f" {context}" if n == 0 or not show: diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index a2fa9670c..53280b5a5 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -37,7 +37,8 @@ CsvReader = type(csv.reader(TemporaryFile())) SheetData = List[dict] TabbedSheetData = Dict[str, SheetData] -TabbedSchemas = Dict[str, Dict] +JsonSchema = Dict +TabbedJsonSchemas = Dict[str, JsonSchema] class LoadFailure(Exception): @@ -166,7 +167,12 @@ class AbstractTableSetManager: ALLOWED_FILE_EXTENSIONS: List[str] = [] - def __init__(self, filename: str, **kwargs): + def __init__(self, filename: str, prefer_number: Optional[bool] = None, **kwargs): + if prefer_number: + # It's OK to pass prefer_number=None (meaning take the default) and prefer_number=False, + # since that requires no action, but if a class wants to manage such preferences, + # as happens in FlattenedTableSetManager, it will have to do it itself. + raise ValueError(f"This class {self.__class__.__name__} does not implement prefer_number={prefer_number!r}") self.filename: str = filename unwanted_kwargs(context=self.__class__.__name__, kwargs=kwargs) @@ -223,7 +229,7 @@ def _get_reader_agent(self) -> Any: raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") # noQA -class SemanticTableSetManager(BasicTableSetManager): +class FlattenedTableSetManager(BasicTableSetManager): """ This is the base class for all workbook-like data sources, i.e., that may need to apply semantic processing. Those may be: @@ -249,10 +255,12 @@ def load(cls, filename: str, **kwargs) -> AnyJsonData: raise LoadArgumentsError(f"The TableSetManager subclass {cls.__name__} expects only" f" {disjoined_list(cls.ALLOWED_FILE_EXTENSIONS)} filenames: {filename}") - table_set_manager: SemanticTableSetManager = cls(filename=filename, **kwargs) + table_set_manager: FlattenedTableSetManager = cls(filename=filename, **kwargs) return table_set_manager.load_content() - def __init__(self, filename: str, prefer_number: bool = True, **kwargs): + def __init__(self, filename: str, prefer_number: Optional[bool] = None, **kwargs): + if prefer_number is None: # i.e., no initial value specified + prefer_number = True self.prefer_number: bool = prefer_number super().__init__(filename=filename, **kwargs) @@ -331,7 +339,7 @@ def manager_for_special_filename(self, filename: str) -> Optional[Type[AbstractT @TABLE_SET_MANAGER_REGISTRY.register() -class XlsxManager(SemanticTableSetManager): +class XlsxManager(FlattenedTableSetManager): """ This implements the mechanism to get a series of rows out of the sheets in an XLSX file. 
""" @@ -424,14 +432,23 @@ def tab_names(self) -> List[str]: def _get_reader_agent(self) -> Any: return self + @classmethod + def extract_tabbed_headers(cls, data: TabbedSheetData) -> TabbedHeaders: + result: TabbedHeaders = {} + for tab, rows in data.items(): + if rows: + # Data is homogeneous, so whatever the headers for the first row should be the same for all + headers: List[str] = list(rows[0].keys()) + else: + # If there's no data in the tab, there are also no headers + headers: List[str] = [] + result[tab] = headers + return result + def load_content(self) -> Dict[str, AnyJsonData]: data = self._load_inserts_data(self.filename) - for tab_name, tab_content in data.items(): - self.content_by_tab_name[tab_name] = tab_content - if not tab_content: - self.headers_by_tab_name[tab_name] = [] - else: - self.headers_by_tab_name[tab_name] = list(tab_content[0].keys()) + self.content_by_tab_name = data + self.headers_by_tab_name = self.extract_tabbed_headers(data) return self.content_by_tab_name @@ -515,7 +532,7 @@ def _parse_inserts_data(self, filename: str) -> AnyJsonData: @TABLE_SET_MANAGER_REGISTRY.register() -class CsvManager(SingleTableMixin, SemanticTableSetManager): +class CsvManager(SingleTableMixin, FlattenedTableSetManager): """ This implements the mechanism to get a series of rows out of the sheet in a csv file, returning a result that still looks like there could have been multiple tabs. diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py new file mode 100644 index 000000000..36284ff5c --- /dev/null +++ b/dcicutils/validation_utils.py @@ -0,0 +1,287 @@ +import contextlib +import json +import jsonschema +import re + +from typing import Dict, List, Optional +from .common import AnyJsonData +from .ff_utils import get_schema +from .env_utils import EnvUtils, public_env_name +from .lang_utils import there_are, maybe_pluralize, disjoined_list +from .misc_utils import AbstractVirtualApp, PRINT +from .sheet_utils import JsonSchema, TabbedJsonSchemas, SheetData, TabbedSheetData +from .task_utils import pmap + + +class SchemaManager: + + SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. + + @classmethod + @contextlib.contextmanager + def fresh_schema_manager_context_for_testing(cls): + old_schema_cache = cls.SCHEMA_CACHE + try: + cls.SCHEMA_CACHE = {} + yield + finally: + cls.SCHEMA_CACHE = old_schema_cache + + def __init__(self, schemas: Optional[TabbedJsonSchemas] = None, + portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): + if portal_env is None and portal_vapp is None: + portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) + PRINT(f"The portal_env was not explicitly supplied. 
Schemas will come from portal_env={portal_env!r}.") + self.portal_env = portal_env + self.portal_vapp = portal_vapp + self.schemas = {} if schemas is None else schemas.copy() + + def fetch_relevant_schemas(self, schema_names: List[str]): # , schemas: Optional[TabbedSchemas] = None): + # if schemas is None: + # schemas = self.schemas + # The schema_names argument is not normally given, but it is there for easier testing + def fetch_schema(schema_name): + cached_schema = self.schemas.get(schema_name) # schemas.get(schema_name) + schema = self.fetch_schema(schema_name) if cached_schema is None else cached_schema + return schema_name, schema + return {schema_name: schema + for schema_name, schema in pmap(fetch_schema, schema_names)} + + def schema_exists(self, schema_name: str): + return bool(self.fetch_schema(schema_name=schema_name)) + + def fetch_schema(self, schema_name: str): + schema: Optional[AnyJsonData] = self.SCHEMA_CACHE.get(schema_name) + if schema is None and schema_name not in self.SCHEMA_CACHE: # If None is already stored, don't look it up again + schema = get_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) + self.SCHEMA_CACHE[schema_name] = schema + return schema + + @classmethod + def clear_schema_cache(cls): + for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first + cls.SCHEMA_CACHE.pop(key, None) + + def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_name: Optional[str] = None, + among: Optional[List[str]] = None): + schema = schema if schema is not None else self.fetch_schema(schema_name) + possible_identifying_properties = set(schema.get("identifyingProperties") or []) | {'uuid'} + identifying_properties = sorted(possible_identifying_properties + if among is None + else (prop + for prop in among + if prop in possible_identifying_properties)) + return identifying_properties + + @classmethod + def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_properties) -> AnyJsonData: + if not identifying_properties: + raise ValueError("No identifying properties were specified.") + for identifying_property in identifying_properties: + if identifying_property in data_item: + return data_item[identifying_property] + raise ValueError(f'{there_are(identifying_properties, just_are=True)}' + f' no {maybe_pluralize(identifying_properties, "identifying property")}' + f' {disjoined_list([repr(x) for x in identifying_properties])}' + f' in {json.dumps(data_item)}.') + + +def validate_data_against_schemas(data: TabbedSheetData, + portal_vapp: Optional[AbstractVirtualApp] = None, + schemas: Optional[TabbedJsonSchemas] = None) -> Optional[Dict]: + """ + Validates the given data against the corresponding schema(s). The given data is assumed to + be in a format as returned by sheet_utils, i.e. a dictionary of lists of objects where each + top-level dictionary property is the name of a data type for the contained list of objects. + If no schemas are passed then they will be fetched from the Portal using the given portal_vapp + to access them; the schemas are in a form similar to the data - a dictionary of schema objects, + where each top-level dictionary property is the name of the data type for the contained schema. + These data types are (strings) assumed to be in snake-case form, e.g. "file_submitted". 
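(Illustrative sketch, not part of the patch: a concrete picture of the `data` argument described above, since the object placeholders in the docstring's own example below appear to have been lost, likely angle-bracketed markup stripped in extraction. The identifiers here are hypothetical except where they echo the error example that follows.)

    data = {
        "file_format": [
            {"uuid": "11111111-...", "file_format": "fastq"},   # index 0
            {"uuid": "22222222-...", "file_format": "vcf_gz"},  # index 1
        ],
        "file_submitted": [
            {"uuid": "ebcfa32f-8eea-4591-a784-449fa5cd9ae9"},   # index 0
        ],
    }
    # validate_data_against_schemas(data, portal_vapp=some_vapp) returns None when
    # everything conforms, or {"errors": [...]} itemized as described below.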
+ + If there are any missing required properties, any extraneous properties, or any undentified + items in the data, then returns a dictionary with an itemized description of each of these errors, + otherwise returns None if there are no problems. Note that an unidentified item is one which has + no value for uuid nor any of the other identifying property values as defined by the schema. + + For example given data that looks something like this: + { + "file_format": [ + , + , + + ], + "file_submitted": [ + , + , + + ] + } + + This function might return someting like this (assuming these errors existed): + { + "errors": [ + { "type": "file_format", + "unidentified": true, + "index": 2 + "identifying_properties": [ "uuid", "file_format" ] + }, + { "type": "file_format", + "item": "vcf_gz", + "index": 1 + "missing_properties": [ "standard_file_format" ] + }, + { "type": "file_submitted", + "item": "ebcfa32f-8eea-4591-a784-449fa5cd9ae9", + "index": 3 + "extraneous_properties": [ "xyzzy", "foobar" ] + } + { "error": "No schema found for: some_undefined_type" + } + ] + } + + The "item" is the identifying value for the specified object (uuid or another defined by the schema). + The "index" is the (0-indexed) ordinal position of the object within the list within the type within + the given data, which can be useful in identifying the object in the source data if it is unidentified. + """ + + schema_manager = SchemaManager(portal_vapp=portal_vapp, schemas=schemas) + + # def fetch_relevant_schemas(schema_names: List, portal_vapp: VirtualApp) -> List: + # def fetch_schema(schema_name: str) -> Optional[Dict]: + # return schema_name, get_schema(schema_name, portal_vapp=portal_vapp) + # return {schema_name: schema for schema_name, schema in pmap(fetch_schema, schema_names)} + # + # errors = [] + # + # if not schemas: + # if not portal_vapp: + # raise Exception("Must specify portal_vapp if no schemas specified.") + # try: + # schema_names = [data_type for data_type in data] + # schemas = fetch_relevant_schemas(schema_names, portal_vapp=portal_vapp) + # except Exception as e: + # errors.append({"exception": f"Exception fetching relevant schemas: {get_error_message(e)}"}) + # schemas = {} + + errors = [] + schemas = schema_manager.fetch_relevant_schemas(list(data.keys())) + + for data_type in data: + schema = schemas.get(data_type) + if not schema: + errors.append({"error": f"No schema found for: {data_type}"}) + continue + data_errors = validate_data_items_against_schemas(data[data_type], data_type, schema) + errors.extend(data_errors) + return {"errors": errors} if errors else None + + +def validate_data_items_against_schemas(data_items: SheetData, data_type: str, schema: JsonSchema) -> List[Dict]: + """ + Like validate_data_against_schemas but for a simple list of data items each of the same given data type. + """ + errors = [] + for data_item_index, data_item in enumerate(data_items): + data_item_errors = validate_data_item_against_schemas(data_item, data_type, data_item_index, schema) + errors.extend(data_item_errors) + return errors + + +def validate_data_item_against_schemas(data_item: AnyJsonData, data_type: str, + data_item_index: Optional[int], schema: JsonSchema) -> List[Dict]: + """ + Like validate_data_against_schemas but for a single data item of the given data type. + The given data item index is just for informational purposes; it corresponds to the + ordinal index of the data item in its containing list. 
Uses the standard jsonschema + package to do the heavy lifting of actual schema validation, but exerts extra effort to + specifically itemize/aggregate the most common (missing and extraneous properties) errors. + """ + errors = [] + + identifying_properties = schema.get("identifyingProperties", []) + identifying_value = SchemaManager.identifying_value(data_item, identifying_properties) + if not identifying_value: + errors.append({ + "type": data_type, + "unidentified": True, + "index": data_item_index, + "identifying_properties": identifying_properties + }) + + def extract_single_quoted_strings(message: str) -> List[str]: + return re.findall(r"'(.*?)'", message) + + schema_validator = jsonschema.Draft7Validator(schema) + for schema_validation_error in schema_validator.iter_errors(data_item): + if schema_validation_error.validator == "required": + errors.append({ + "type": data_type, + "item" if identifying_value else "unidentified": identifying_value if identifying_value else True, + "index": data_item_index, + "missing_properties": schema_validation_error.validator_value}) + continue + if schema_validation_error.validator == "additionalProperties": + properties = extract_single_quoted_strings(schema_validation_error.message) + if properties: + errors.append({ + "type": data_type, + "item" if identifying_value else "unidentified": identifying_value if identifying_value else True, + "index": data_item_index, + "extraneous_properties": properties}) + continue + errors.append({ + "type": data_type, + "item" if identifying_value else "unidentified": identifying_value if identifying_value else True, + "index": data_item_index, + "unclassified_error": schema_validation_error.message}) + + return errors + + +def summary_of_data_validation_errors(data_validation_errors: Dict, # submission: SmahtSubmissionFolio, + data_file_name: str, + s3_data_file_location: str, + s3_details_location: str) -> List[str]: + """ + Summarize the given data validation errors into a simple short list of English phrases; + this will end up going into the additional_properties of the IngestionSubmission object + in the Portal database (see SubmissionFolio.record_results); this is what will get + displayed, if any errors, by the submitr tool when it detects processing has completed. 
+ """ + errors = data_validation_errors.get("errors") + if not errors: + return [] + + unidentified_count = 0 + missing_properties_count = 0 + extraneous_properties_count = 0 + unclassified_error_count = 0 + exception_count = 0 + + for error in errors: + if error.get("unidentified"): + unidentified_count += 1 + if error.get("missing_properties"): + missing_properties_count += 1 + if error.get("extraneous_properties"): + extraneous_properties_count += 1 + if error.get("unclassified_error_count"): + unclassified_error_count += 1 + if error.get("exception"): + exception_count += 1 + + return [ + f"Ingestion data validation error summary:", + # f"Data file: {submission.data_file_name}", + f"Data file: {data_file_name}", + # f"Data file in S3: {submission.s3_data_file_location}", + f"Data file in S3: {s3_data_file_location}", + f"Items unidentified: {unidentified_count}", + f"Items missing properties: {missing_properties_count}", + f"Items with extraneous properties: {extraneous_properties_count}", + f"Other errors: {unclassified_error_count}", + f"Exceptions: {exception_count}", + # f"Details: {submission.s3_details_location}" + f"Details: {s3_details_location}" + ] diff --git a/test/data_files/sample_items.tabs.json b/test/data_files/sample_items.tabs.json index f972245f0..b59118c62 100644 --- a/test/data_files/sample_items.tabs.json +++ b/test/data_files/sample_items.tabs.json @@ -18,10 +18,7 @@ "x": "alpha", "y": { "a": "beta", - "z": [ - "gamma", - "delta" - ] + "z": "gamma|delta" } } ], diff --git a/test/data_files/sample_items.tabs.yaml b/test/data_files/sample_items.tabs.yaml index f98d9259b..5ca636a4c 100644 --- a/test/data_files/sample_items.tabs.yaml +++ b/test/data_files/sample_items.tabs.yaml @@ -10,9 +10,7 @@ Sheet1: - x: alpha y: a: beta - z: - - gamma - - delta + z: gamma|delta Sheet2: - age: 23 father: diff --git a/test/helpers_for_bundles.py b/test/helpers_for_bundles.py new file mode 100644 index 000000000..922ed1eaa --- /dev/null +++ b/test/helpers_for_bundles.py @@ -0,0 +1,70 @@ +SAMPLE_PROJECT_UUID = "dac6d5b3-6ef6-4271-9715-a78329acf846" +SAMPLE_PROJECT_NAME = 'test-project' +SAMPLE_PROJECT_TITLE = SAMPLE_PROJECT_NAME.title().replace('-', ' ') +SAMPLE_PROJECT = { + "title": SAMPLE_PROJECT_TITLE, + "uuid": SAMPLE_PROJECT_UUID, + "description": f"This is the {SAMPLE_PROJECT_TITLE}.", + "name": SAMPLE_PROJECT_NAME, + "status": "shared", + "date_created": "2020-11-24T20:46:00.000000+00:00", +} +SAMPLE_PROJECT_SANS_UUID = SAMPLE_PROJECT.copy() # to be modified on next line +SAMPLE_PROJECT_SANS_UUID.pop('uuid') + +SAMPLE_INSTITUTION_UUID = "87199845-51b5-4352-bdea-583edae4bb6a" +SAMPLE_INSTITUTION_NAME = "cgap-backend-team" +SAMPLE_INSTITUTION_TITLE = SAMPLE_INSTITUTION_NAME.title().replace('-', ' ') +SAMPLE_INSTITUTION = { + "name": SAMPLE_INSTITUTION_NAME, + "title": SAMPLE_INSTITUTION_TITLE, + "status": "shared", + "uuid": SAMPLE_INSTITUTION_UUID, +} +SAMPLE_INSTITUTION_SANS_UUID = SAMPLE_INSTITUTION.copy() # to be modified on next line +SAMPLE_INSTITUTION_SANS_UUID.pop('uuid') + +SAMPLE_USER_EMAIL = "jdoe@example.com" +SAMPLE_USER_FIRST_NAME = "Jenny" +SAMPLE_USER_LAST_NAME = "Doe" +SAMPLE_USER_ROLE = "developer" +SAMPLE_USER_UUID = "e0dec518-cb0c-45f3-8c97-21b2659ec129" +SAMPLE_USER_WITH_UUID_REFS = { + "email": SAMPLE_USER_EMAIL, + "first_name": SAMPLE_USER_FIRST_NAME, + "last_name": SAMPLE_USER_LAST_NAME, + "uuid": SAMPLE_USER_UUID, + "project": SAMPLE_PROJECT_UUID, + "project_roles#0.project": SAMPLE_PROJECT_UUID, + "project_roles#0.role": SAMPLE_USER_ROLE, + 
"user_institution": SAMPLE_INSTITUTION_UUID, +} +SAMPLE_USER_WITH_NAME_REFS = { + "email": SAMPLE_USER_EMAIL, + "first_name": SAMPLE_USER_FIRST_NAME, + "last_name": SAMPLE_USER_LAST_NAME, + "uuid": SAMPLE_USER_UUID, + "project": SAMPLE_PROJECT_NAME, + "project_roles#0.project": SAMPLE_PROJECT_NAME, + "project_roles#0.role": SAMPLE_USER_ROLE, + "user_institution": SAMPLE_INSTITUTION_NAME, +} + +SAMPLE_WORKBOOK_WITH_UNMATCHED_UUID_REFS = { + # Here the User refers to project and institution by UUID, but we don't have the UUID in our local cache + "User": [SAMPLE_USER_WITH_UUID_REFS], + "Project": [SAMPLE_PROJECT_SANS_UUID], + "Institution": [SAMPLE_INSTITUTION_SANS_UUID], +} + +SAMPLE_WORKBOOK_WITH_MATCHED_UUID_REFS = { + "User": [SAMPLE_USER_WITH_UUID_REFS], + "Project": [SAMPLE_PROJECT], + "Institution": [SAMPLE_INSTITUTION], +} + +SAMPLE_WORKBOOK_WITH_NAME_REFS = { + "User": [SAMPLE_USER_WITH_NAME_REFS], + "Project": [SAMPLE_PROJECT], + "Institution": [SAMPLE_INSTITUTION], +} diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index 4569905e7..5b38b8e0a 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -1,35 +1,31 @@ -# import contextlib import contextlib +# import copy +import glob import json import os import pytest import re # from collections import namedtuple -from dcicutils import bundle_utils as bundle_utils_module, ff_utils as ff_utils_module -from dcicutils.common import AnyJsonData -from dcicutils.env_utils import EnvUtils, public_env_name -from dcicutils.misc_utils import ( - ignored, is_uuid, local_attrs, NamedObject, AbstractVirtualApp, to_snake_case, json_file_contents, +from dcicutils import ( + bundle_utils as bundle_utils_module, + ff_utils as ff_utils_module, + validation_utils as validation_utils_module ) -from dcicutils.qa_utils import printed_output, mock_not_called, MockResponse from dcicutils.bundle_utils import ( # High-level interfaces - # ItemManager, - inflate, check, - load_table_structures, - load_items, # ITEM_MANAGER_REGISTRY, + load_table_structures, load_items, # inflate, # Low-level implementation - # SchemaAutoloadMixin, - SchemaManager, - ItemTools, - TableInflater, TableChecker, TabbedItemTable, - # XlsxItemManager, - # CsvItemManager, TsvItemManager, - NumHint, - TypeHint, EnumHint, - BoolHint, + SchemaManager, ItemTools, TableChecker, + # XlsxItemManager, CsvItemManager, TsvItemManager, ... + BoolHint, # NumHint, TypeHint, EnumHint, RefHint, ... 
+) +from dcicutils.common import AnyJsonData +from dcicutils.env_utils import EnvUtils, public_env_name +from dcicutils.misc_utils import ( + ignored, is_uuid, NamedObject, AbstractVirtualApp, to_snake_case, json_file_contents, find_association, ) +from dcicutils.qa_utils import printed_output, mock_not_called, MockResponse from dcicutils.sheet_utils import ( # High-level interfaces # TABLE_SET_MANAGER_REGISTRY, @@ -43,14 +39,23 @@ infer_tab_name_from_filename, # prefer_number, unwanted_kwargs, expand_string_escape_sequences, load_table_set, ) -from typing import Dict, Optional +# from dcicutils.validation_utils import validate_data_against_schemas, summary_of_data_validation_errors +from typing import Dict from unittest import mock from .conftest_settings import TEST_DIR from .helpers import using_fresh_ff_state_for_testing +from .helpers_for_bundles import ( + SAMPLE_PROJECT_UUID, SAMPLE_INSTITUTION_UUID, + SAMPLE_WORKBOOK_WITH_UNMATCHED_UUID_REFS, SAMPLE_WORKBOOK_WITH_MATCHED_UUID_REFS, + SAMPLE_WORKBOOK_WITH_NAME_REFS, +) from .test_sheet_utils import ( SAMPLE_XLSX_FILE, SAMPLE_XLSX_FILE_ITEM_CONTENT, # SAMPLE_XLSX_FILE_RAW_CONTENT, + SAMPLE_XLSX_FILE_INFLATED_CONTENT, SAMPLE_CSV_FILE, SAMPLE_CSV_FILE_ITEM_CONTENT, # SAMPLE_CSV_FILE_RAW_CONTENT, + SAMPLE_CSV_FILE_INFLATED_CONTENT, SAMPLE_TSV_FILE, SAMPLE_TSV_FILE_ITEM_CONTENT, # SAMPLE_TSV_FILE_RAW_CONTENT, + SAMPLE_TSV_FILE_INFLATED_CONTENT, SAMPLE_JSON_TABS_FILE, SAMPLE_JSON_TABS_FILE_ITEM_CONTENT, SAMPLE_YAML_TABS_FILE, ) @@ -108,53 +113,47 @@ def test_item_tools_parse_item_value_basic(): for x in [37, 19.3, True, False, None, 'simple text']: assert ItemTools.parse_item_value(x) == x - assert ItemTools.parse_item_value('3') == 3 - assert ItemTools.parse_item_value('+3') == 3 - assert ItemTools.parse_item_value('-3') == -3 - - assert ItemTools.parse_item_value('3.5') == 3.5 - assert ItemTools.parse_item_value('+3.5') == 3.5 - assert ItemTools.parse_item_value('-3.5') == -3.5 - - assert ItemTools.parse_item_value('3.5e1') == 35.0 - assert ItemTools.parse_item_value('+3.5e1') == 35.0 - assert ItemTools.parse_item_value('-3.5e1') == -35.0 + expectations = [ - assert ItemTools.parse_item_value('') is None + # Integers + ('3', 3), ('+3', 3), ('-3', -3), - assert ItemTools.parse_item_value('null') is None - assert ItemTools.parse_item_value('Null') is None - assert ItemTools.parse_item_value('NULL') is None + # Floats + ('3.5', 3.5), ('+3.5', 3.5), ('-3.5', -3.5), + ('3.5e1', 35.0), ('+3.5e1', 35.0), ('-3.5e1', -35.0), - assert ItemTools.parse_item_value('true') is True - assert ItemTools.parse_item_value('True') is True - assert ItemTools.parse_item_value('TRUE') is True + # Nulls + (None, None), + ('', None), ('null', None), ('Null', None), ('NULL', None), - assert ItemTools.parse_item_value('false') is False - assert ItemTools.parse_item_value('False') is False - assert ItemTools.parse_item_value('FALSE') is False - - assert ItemTools.parse_item_value('|') == [] # special case: lone '|' means empty - assert ItemTools.parse_item_value('alpha|') == ['alpha'] # special case: trailing '|' means singleton - assert ItemTools.parse_item_value('|alpha|') == [None, 'alpha'] - assert ItemTools.parse_item_value('|alpha') == [None, 'alpha'] - assert ItemTools.parse_item_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] - assert ItemTools.parse_item_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] - - -def test_item_tools_parse_item_value_guids(): - - sample_simple_field_input = "#foo" - - parsed = 
ItemTools.parse_item_value(sample_simple_field_input) - assert parsed == sample_simple_field_input + # Booleans + ('true', True), ('True', True), ('TRUE', True), + ('false', False), ('False', False), ('FALSE', False), + ] - sample_compound_field_input = '#foo|#bar' - sample_compound_field_list = ['#foo', '#bar'] + for input, heuristic_result in expectations: + assert ItemTools.parse_item_value(input) == input + assert ItemTools.parse_item_value(input, apply_heuristics=False) == input + assert ItemTools.parse_item_value(input, apply_heuristics=True) == heuristic_result + assert ItemTools.parse_item_value(input, apply_heuristics=True, split_pipe=False) == heuristic_result + assert ItemTools.parse_item_value(input, apply_heuristics=True, split_pipe=True) == heuristic_result + + expectations = [ + # Lists + ('|', []), # special case: lone '|' means empty + ('alpha|', ['alpha']), ('7|', [7]), # special case: trailing '|' means singleton + # These follow from general case of '|' as separator of items recursively parsed + ('|alpha', [None, 'alpha']), ('|alpha|', [None, 'alpha']), ('|7', [None, 7]), + ('alpha|beta|gamma', ['alpha', 'beta', 'gamma']), + ('alpha|true|false|null||7|1.5', ['alpha', True, False, None, None, 7, 1.5]) + ] - parsed = ItemTools.parse_item_value(sample_compound_field_input) - assert isinstance(parsed, list) - assert parsed == sample_compound_field_list + for input, heuristic_result in expectations: + assert ItemTools.parse_item_value(input) == input + assert ItemTools.parse_item_value(input, apply_heuristics=False) == input + assert ItemTools.parse_item_value(input, apply_heuristics=True) == input + assert ItemTools.parse_item_value(input, apply_heuristics=True, split_pipe=False) == input + assert ItemTools.parse_item_value(input, apply_heuristics=True, split_pipe=True) == heuristic_result def test_item_tools_set_path_value(): @@ -216,9 +215,9 @@ def test_item_tools_find_type_hint(): def test_load_table_structures(): - assert load_table_structures(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT - assert load_table_structures(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT - assert load_table_structures(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT + assert load_table_structures(SAMPLE_XLSX_FILE, apply_heuristics=True) == SAMPLE_XLSX_FILE_INFLATED_CONTENT + assert load_table_structures(SAMPLE_CSV_FILE, apply_heuristics=True) == SAMPLE_CSV_FILE_INFLATED_CONTENT + assert load_table_structures(SAMPLE_TSV_FILE, apply_heuristics=True) == SAMPLE_TSV_FILE_INFLATED_CONTENT loaded = load_table_structures(SAMPLE_JSON_TABS_FILE) print("loaded=", json.dumps(loaded, indent=2)) @@ -232,12 +231,17 @@ def test_load_table_structures(): def test_load_items(): - assert load_items(SAMPLE_XLSX_FILE, autoload_schemas=False) == SAMPLE_XLSX_FILE_ITEM_CONTENT - assert load_items(SAMPLE_CSV_FILE, autoload_schemas=False) == SAMPLE_CSV_FILE_ITEM_CONTENT - with pytest.raises(LoadArgumentsError) as exc: - load_items("something.else") - assert str(exc.value) == "Unknown file type: something.else" + with mock.patch.object(validation_utils_module, "get_schema") as mock_get_schema: + mock_get_schema.return_value = {} + + assert load_items(SAMPLE_XLSX_FILE, apply_heuristics=True) == SAMPLE_XLSX_FILE_ITEM_CONTENT + assert load_items(SAMPLE_CSV_FILE, apply_heuristics=True) == SAMPLE_CSV_FILE_ITEM_CONTENT + assert load_items(SAMPLE_TSV_FILE, apply_heuristics=True) == SAMPLE_TSV_FILE_ITEM_CONTENT + + with pytest.raises(LoadArgumentsError) as exc: + load_items("something.else") + assert str(exc.value) == 
"Unknown file type: something.else" SAMPLE_CSV_FILE2 = os.path.join(TEST_DIR, 'data_files/sample_items2.csv') @@ -383,14 +387,17 @@ def matches_template(json1: AnyJsonData, json2: AnyJsonData, *, previous_matches def test_load_items_with_schema(): + print() # start on a fresh line + print("Case 1") expected = SAMPLE_CSV_FILE2_CONTENT actual = CsvManager.load(SAMPLE_CSV_FILE2) assert actual == expected print("Case 2") + file_base_name = os.path.splitext(os.path.basename(SAMPLE_CSV_FILE2))[0] expected = SAMPLE_CSV_FILE2_ITEM_CONTENT - actual = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS) + actual = load_items(SAMPLE_CSV_FILE2, schemas={file_base_name: {}}, apply_heuristics=True) # schemas=SAMPLE_CSV_FILE2_SCHEMAS) assert actual == expected print("Case 3") @@ -401,9 +408,9 @@ def test_load_items_with_schema(): def test_sample_items_csv_vs_json(): - csv_content = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') + csv_content = load_items(SAMPLE_CSV_FILE2, tab_name='Person', schemas=SAMPLE_CSV_FILE2_SCHEMAS) - json_content = load_items(SAMPLE_JSON_FILE2, tab_name="Person") + json_content = load_items(SAMPLE_JSON_FILE2, tab_name="Person", schemas=SAMPLE_CSV_FILE2_SCHEMAS) assert csv_content == json_content @@ -415,19 +422,19 @@ def test_sample_items_json_vs_yaml(): assert tabs_data_from_json == tabs_data_from_yaml -@pytest.mark.parametrize('instaguids_enabled', [True, False]) -def test_load_items_with_schema_and_instaguids(instaguids_enabled): - - with local_attrs(ItemTools, INSTAGUIDS_ENABLED=instaguids_enabled): - - expected = SAMPLE_CSV_FILE3_PERSON_CONTENT_HINTED - print("expected=", json.dumps(expected, indent=2)) - actual = load_items(SAMPLE_CSV_FILE3, schemas=SAMPLE_CSV_FILE3_SCHEMAS, tab_name='Person') - print("actual=", json.dumps(actual, indent=2)) - if instaguids_enabled: - assert matches_template(actual, expected) - else: - assert actual == expected # no substitution performed +# @pytest.mark.parametrize('instaguids_enabled', [True, False]) +# def test_load_items_with_schema_and_instaguids(instaguids_enabled): +# +# with local_attrs(ItemTools, INSTAGUIDS_ENABLED=instaguids_enabled): +# +# expected = SAMPLE_CSV_FILE3_PERSON_CONTENT_HINTED +# print("expected=", json.dumps(expected, indent=2)) +# actual = load_items(SAMPLE_CSV_FILE3, schemas=SAMPLE_CSV_FILE3_SCHEMAS, tab_name='Person') +# print("actual=", json.dumps(actual, indent=2)) +# if instaguids_enabled: +# assert matches_template(actual, expected) +# else: +# assert actual == expected # no substitution performed @using_fresh_ff_state_for_testing() @@ -446,9 +453,9 @@ def test_schema_autoload_mixin_caching(portal_env): sample_schema_name = 'foo' sample_schema = {'mock_schema_for': 'foo'} - with mock.patch.object(bundle_utils_module, "get_schema") as mock_get_schema: + with mock.patch.object(validation_utils_module, "get_schema") as mock_get_schema: mock_get_schema.return_value = sample_schema - assert schema_manager.fetch_schema(sample_schema_name, portal_env=schema_manager.portal_env) == sample_schema + assert schema_manager.fetch_schema(sample_schema_name) == sample_schema schema_cache_with_sample_schema = {sample_schema_name: sample_schema} assert SchemaManager.SCHEMA_CACHE == schema_cache_with_sample_schema @@ -465,7 +472,7 @@ def test_schema_autoload_mixin_fetch_schema(portal_env): assert schema_manager.portal_env == 'data' - user_schema = schema_manager.fetch_schema('user', portal_env=schema_manager.portal_env) + user_schema = schema_manager.fetch_schema('user') assert 
user_schema['$id'] == '/profiles/user.json' assert user_schema['title'] == 'User' @@ -519,8 +526,10 @@ def test_workbook_with_schemas(): } assert actual_data == expected_data + # portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) + actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, - tab_name='ExperimentSeq', autoload_schemas=True) + tab_name='ExperimentSeq') expected_items = { "ExperimentSeq": [ { @@ -576,19 +585,23 @@ def get(self, path_url): portal_vapp = MockVapp(name=f'MockVapp[{portal_env}]') old_count = portal_vapp.call_count - with mock.patch.object(ff_utils_module, "get_authentication_with_server", mock_not_called("get_authentication_with_server")): with mock.patch.object(ff_utils_module, "get_metadata", mock_not_called("get_metadata")): actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, - tab_name='ExperimentSeq', autoload_schemas=True, portal_vapp=portal_vapp) + tab_name='ExperimentSeq', portal_vapp=portal_vapp) assert portal_vapp.call_count == old_count + 1 assert actual_items == expected_items _SAMPLE_SCHEMA_DIR = os.path.join(TEST_DIR, "data_files", "sample_schemas") +_SAMPLE_SCHEMAS = { + os.path.splitext(os.path.basename(file))[0]: json_file_contents(file) + for file in glob.glob(os.path.join(_SAMPLE_SCHEMA_DIR, "*.json")) +} + _SAMPLE_INSERTS_DIR = os.path.join(TEST_DIR, "data_files", "sample_inserts") _SAMPLE_INSERTS = load_table_set(_SAMPLE_INSERTS_DIR) @@ -596,7 +609,27 @@ def get(self, path_url): @contextlib.contextmanager -def mocked_schemas(mock_remotes=None, expected_portal_env=None, expected_portal_vapp=None): +def mocked_schemas(mock_remotes: bool = True, expected_portal_env=None, expected_portal_vapp=None): + + def lookup_mock_schema(item_type): + schema = _SAMPLE_SCHEMAS.get(item_type) + assert schema, f"The item type {item_type} is not mocked." 
+ return schema + + def lookup_sample_insert(item_type, item_ref): + data = _SAMPLE_INSERTS[item_type] + schema = lookup_mock_schema(item_type) + possible_identifying_properties = set(schema.get("identifyingProperties") or []) | {'uuid'} + if not data: + return None + for prop in possible_identifying_properties: + if not prop in data[0]: + continue + found = find_association(data, **{prop: item_ref}) + if found: + return found + return None + def mocked_get_schema(schema_name, portal_env=None, portal_vapp=None): if expected_portal_env is not None: assert portal_env == expected_portal_env, (f"get_schema got ff_env={portal_env!r}," @@ -604,79 +637,26 @@ def mocked_get_schema(schema_name, portal_env=None, portal_vapp=None): if expected_portal_vapp is not None: assert portal_vapp == expected_portal_vapp, (f"get_schema got portal_vapp={portal_vapp!r}," f" but expected portal_vapp={expected_portal_vapp!r}.") - snake_name = to_snake_case(schema_name) - schema_file = os.path.join(_SAMPLE_SCHEMA_DIR, f"{snake_name}.json") - if os.path.exists(schema_file): - return json_file_contents(schema_file) - else: - return None + schema_snake_name = to_snake_case(schema_name) + return lookup_mock_schema(schema_snake_name) def mocked_get_metadata(obj_id, key=None, ff_env=None, check_queue=False, add_on=''): ignored(key, ff_env, check_queue, add_on) + if not mock_remotes: + raise Exception("No mock-remote {obj_id} was found.") parts = ID_NAME_PATTERN.match(obj_id) assert parts, f"mocked_get_metadata got {obj_id}, but expected //" item_type, item_ref = parts.groups() - return _SAMPLE_INSERTS_LOOKUP_TABLE.contains_ref(item_type=item_type, item_ref=item_ref) + return lookup_sample_insert(item_type=item_type, item_ref=item_ref) - with mock.patch.object(bundle_utils_module, "get_schema") as mock_get_schema: + with mock.patch.object(validation_utils_module, "get_schema") as mock_get_schema: mock_get_schema.side_effect = mocked_get_schema with mock.patch.object(bundle_utils_module, "get_metadata") as mock_get_metadata: mock_get_metadata.side_effect = mocked_get_metadata - _SAMPLE_INSERTS_LOOKUP_TABLE = TabbedItemTable(_SAMPLE_INSERTS) yield -SAMPLE_PROJECT_UUID = "dac6d5b3-6ef6-4271-9715-a78329acf846" -SAMPLE_PROJECT_NAME = 'test-project' -SAMPLE_PROJECT_TITLE = SAMPLE_PROJECT_NAME.title().replace('-', ' ') -SAMPLE_PROJECT = { - "title": SAMPLE_PROJECT_TITLE, - "uuid": SAMPLE_PROJECT_UUID, - "description": f"This is the {SAMPLE_PROJECT_TITLE}.", - "name": SAMPLE_PROJECT_NAME, - "status": "shared", - "date_created": "2020-11-24T20:46:00.000000+00:00", -} -SAMPLE_PROJECT_SANS_UUID = SAMPLE_PROJECT.copy() # to be modified on next line -SAMPLE_PROJECT_SANS_UUID.pop('uuid') - -SAMPLE_INSTITUTION_UUID = "87199845-51b5-4352-bdea-583edae4bb6a" -SAMPLE_INSTITUTION_NAME = "cgap-backend-team" -SAMPLE_INSTITUTION_TITLE = SAMPLE_INSTITUTION_NAME.title().replace('-', ' ') -SAMPLE_INSTITUTION = { - "name": SAMPLE_INSTITUTION_NAME, - "title": SAMPLE_INSTITUTION_TITLE, - "status": "shared", - "uuid": SAMPLE_INSTITUTION_UUID, -} -SAMPLE_INSTITUTION_SANS_UUID = SAMPLE_INSTITUTION.copy() # to be modified on next line -SAMPLE_INSTITUTION_SANS_UUID.pop('uuid') - -SAMPLE_USER_EMAIL = "jdoe@example.com" -SAMPLE_USER_FIRST_NAME = "Jenny" -SAMPLE_USER_LAST_NAME = "Doe" -SAMPLE_USER_ROLE = "developer" -SAMPLE_USER_UUID = "e0dec518-cb0c-45f3-8c97-21b2659ec129" -SAMPLE_USER_WITH_UUID_REFS = { - "email": SAMPLE_USER_EMAIL, - "first_name": SAMPLE_USER_FIRST_NAME, - "last_name": SAMPLE_USER_LAST_NAME, - "uuid": SAMPLE_USER_UUID, - "project": 
SAMPLE_PROJECT_UUID, - "project_roles#0.project": SAMPLE_PROJECT_UUID, - "project_roles#0.role": SAMPLE_USER_ROLE, - "user_institution": SAMPLE_INSTITUTION_UUID, -} -SAMPLE_USER_WITH_NAME_REFS = { - "email": SAMPLE_USER_EMAIL, - "first_name": SAMPLE_USER_FIRST_NAME, - "last_name": SAMPLE_USER_LAST_NAME, - "uuid": SAMPLE_USER_UUID, - "project": SAMPLE_PROJECT_NAME, - "project_roles#0.project SAMPLE_PROJECT_NAME," - "project_roles#0.role": SAMPLE_USER_ROLE, - "user_institution": SAMPLE_INSTITUTION_NAME, -} + def test_table_checker(): @@ -685,40 +665,23 @@ def test_table_checker(): mock_ff_env = 'some-env' - with mocked_schemas(mock_remotes=False): - - # # Here the User refers to project and institution by UUID, but we don't have the UUID in our - # sample_workbook_with_unmatched_uuid_refs = { - # "User": [SAMPLE_USER_WITH_UUID_REFS], - # "Project": [SAMPLE_PROJECT_SANS_UUID], - # "Institution": [SAMPLE_INSTITUTION_SANS_UUID], - # } - # - # with printed_output() as printed: - # with pytest.raises(Exception) as exc: - # checker = TableChecker(sample_workbook_with_unmatched_uuid_refs, portal_env=mock_ff_env) - # checker.check_tabs() - # assert str(exc.value) == "There were 2 problems while compiling hints." - # assert printed.lines == [ - # f"Problem: User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}", - # (f"Problem: User[0].user_institution: Unable to validate Institution reference:" - # f" {SAMPLE_INSTITUTION_UUID!r}") - # ] - - sample_workbook_with_matched_uuid_refs = { - "User": [SAMPLE_USER_WITH_UUID_REFS], - "Project": [SAMPLE_PROJECT], - "Institution": [SAMPLE_INSTITUTION], - } + with mocked_schemas(mock_remotes=True): + + with printed_output() as printed: + with pytest.raises(Exception) as exc: + checker = TableChecker(SAMPLE_WORKBOOK_WITH_UNMATCHED_UUID_REFS, portal_env=mock_ff_env) + checker.check_tabs() + assert str(exc.value) == "There were 2 problems while compiling hints." 
+ assert printed.lines == [ + f"Problem: User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}", + (f"Problem: User[0].user_institution: Unable to validate Institution reference:" + f" {SAMPLE_INSTITUTION_UUID!r}") + ] - checker = TableChecker(sample_workbook_with_matched_uuid_refs, portal_env=mock_ff_env) + checker = TableChecker(SAMPLE_WORKBOOK_WITH_MATCHED_UUID_REFS, portal_env=mock_ff_env) checker.check_tabs() - # sample_workbook_with_name_refs = { - # "User": [SAMPLE_USER_WITH_NAME_REFS], - # "Project": [SAMPLE_PROJECT], - # "Institution": [SAMPLE_INSTITUTION], - # } - # - # checker = TableChecker(sample_workbook_with_name_refs, portal_env=mock_ff_env) - # checker.check_tabs() + checker = TableChecker(SAMPLE_WORKBOOK_WITH_NAME_REFS, portal_env=mock_ff_env) + checker.check_tabs() + + diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index a20f6bb8c..c012772c1 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -1,25 +1,8 @@ -# import contextlib import json import os import pytest from collections import namedtuple -# from dcicutils import bundle_utils as bundle_utils_module, ff_utils as ff_utils_module -# from dcicutils.common import AnyJsonData -# from dcicutils.env_utils import EnvUtils, public_env_name -# from dcicutils.misc_utils import is_uuid, local_attrs, NamedObject, AbstractVirtualApp -# from dcicutils.qa_utils import printed_output, mock_not_called, MockResponse -# from dcicutils.bundle_utils import ( -# # High-level interfaces -# ItemManager, load_items, ITEM_MANAGER_REGISTRY, -# # Low-level implementation -# SchemaAutoloadMixin, -# ItemTools, -# XlsxItemManager, -# CsvItemManager, TsvItemManager, -# # TypeHint, EnumHint, -# BoolHint, -# ) from dcicutils.sheet_utils import ( # High-level interfaces TABLE_SET_MANAGER_REGISTRY, @@ -32,10 +15,7 @@ # Utilities prefer_number, unwanted_kwargs, expand_string_escape_sequences, infer_tab_name_from_filename, ) -# from typing import Dict, Optional -# from unittest import mock from .conftest_settings import TEST_DIR -# from .helpers import using_fresh_ff_state_for_testing TEST_SHEET_1 = 'Sheet1' @@ -160,11 +140,39 @@ def test_table_set_manager_registry_manager_for_filename(): ] } +SAMPLE_XLSX_FILE_INFLATED_CONTENT = { + "Sheet1": [ + {"x": 1, "y": {"a": 1, "z": 1}}, + {"x": 1, "y": {"a": 2, "z": 3}}, + {"x": "alpha", "y": {"a": "beta", "z": "gamma|delta"}}, # ["gamma", "delta"] + ], + "Sheet2": [ + { + "name": "bill", "age": 23, + "mother": {"name": "mary", "age": 58}, + "father": {"name": "fred", "age": 63}, + "friends": [ + {"name": "sam", "age": 22}, + {"name": "arthur", "age": 19}, + ] + }, + { + "name": "joe", "age": 9, + "mother": {"name": "estrella", "age": 35}, + "father": {"name": "anthony", "age": 34}, + "friends": [ + {"name": "anders", "age": 9}, + {"name": None, "age": None} + ] + }, + ], +} + SAMPLE_XLSX_FILE_ITEM_CONTENT = { "Sheet1": [ {"x": 1, "y": {"a": 1, "z": 1}}, {"x": 1, "y": {"a": 2, "z": 3}}, - {"x": "alpha", "y": {"a": "beta", "z": ["gamma", "delta"]}}, + {"x": "alpha", "y": {"a": "beta", "z": "gamma|delta"}}, # not ["gamma", "delta"], unless schema says so ], "Sheet2": [ { @@ -194,6 +202,8 @@ def test_table_set_manager_registry_manager_for_filename(): SAMPLE_CSV_FILE_RAW_CONTENT = {SAMPLE_CSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} +SAMPLE_CSV_FILE_INFLATED_CONTENT = {SAMPLE_CSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_INFLATED_CONTENT['Sheet2']} + SAMPLE_CSV_FILE_ITEM_CONTENT = {SAMPLE_CSV_FILE_SHEET_NAME: 
SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} SAMPLE_TSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.tsv') @@ -202,6 +212,8 @@ def test_table_set_manager_registry_manager_for_filename(): SAMPLE_TSV_FILE_RAW_CONTENT = {SAMPLE_TSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} +SAMPLE_TSV_FILE_INFLATED_CONTENT = {SAMPLE_TSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_INFLATED_CONTENT['Sheet2']} + SAMPLE_TSV_FILE_ITEM_CONTENT = {SAMPLE_TSV_FILE_SHEET_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} SAMPLE_JSON_TABS_FILE = os.path.join(TEST_DIR, 'data_files/sample_items.tabs.json') diff --git a/test/test_validation_utils.py b/test/test_validation_utils.py new file mode 100644 index 000000000..ce1730d1e --- /dev/null +++ b/test/test_validation_utils.py @@ -0,0 +1,107 @@ +import copy +import os +import pytest +import re + +from dcicutils.bundle_utils import inflate +from dcicutils.misc_utils import AbstractVirtualApp, NamedObject, json_file_contents, to_snake_case +from dcicutils.qa_utils import MockResponse +from dcicutils.validation_utils import SchemaManager, validate_data_against_schemas, summary_of_data_validation_errors +from .conftest_settings import TEST_DIR +from .helpers_for_bundles import SAMPLE_WORKBOOK_WITH_NAME_REFS + + +def test_schema_manager_identifying_value(): + + with pytest.raises(ValueError) as exc: + assert SchemaManager.identifying_value({'any': 'thing'}, []) + assert str(exc.value) == "No identifying properties were specified." + + person_named_fred = {'age': 33, 'name': 'Fred', 'favorite-color': 'yellow'} + assert SchemaManager.identifying_value(person_named_fred, ['uuid', 'name']) == 'Fred' + + person_nicknamed_fred = {'age': 33, 'nickname': 'Fred', 'favorite-color': 'yellow'} + with pytest.raises(ValueError) as exc: + SchemaManager.identifying_value(person_nicknamed_fred, ['uuid', 'name']) + assert str(exc.value) == ("""There are no identifying properties 'uuid' or 'name'""" + """ in {"age": 33, "nickname": "Fred", "favorite-color": "yellow"}.""") + + with pytest.raises(ValueError) as exc: + SchemaManager.identifying_value(person_nicknamed_fred, ['name']) + assert str(exc.value) == ("""There is no identifying property 'name'""" + """ in {"age": 33, "nickname": "Fred", "favorite-color": "yellow"}.""") + + +def test_validate_data_against_schemas(): + + with SchemaManager.fresh_schema_manager_context_for_testing(): + + class MockVapp(NamedObject, AbstractVirtualApp): + + @classmethod + def get(cls, path_url): + + m = re.match('/profiles/(.*)[.]json?', path_url) + if m: + base = to_snake_case(m.group(1)) + file = os.path.join(TEST_DIR, 'data_files', 'sample_schemas', f'{base}.json') + response_data = json_file_contents(file) + response = MockResponse(200, json=response_data, url=path_url) + return response + raise Exception(f"MockVapp can't handle this case: {path_url}") + + portal_vapp = MockVapp(name=f'MockVapp["data_files/sample_schemas"]') + + good_workbook = inflate(SAMPLE_WORKBOOK_WITH_NAME_REFS) + + assert validate_data_against_schemas(good_workbook, portal_vapp) is None + + bogus_workbook = copy.deepcopy(good_workbook) # modified immediately below + user_items = bogus_workbook['User'] + user_item0 = user_items[0] + user_item0['bogus'] = 'item' + + assert validate_data_against_schemas(bogus_workbook, portal_vapp) == { + 'errors': [ + { + 'extraneous_properties': ['bogus'], + 'index': 0, + 'item': 'e0dec518-cb0c-45f3-8c97-21b2659ec129', + 'type': 'User' + } + ] + } + + +def test_summary_of_data_validation_errors(): + + error_report_1 = { + 'errors': [ + { 
+ 'extraneous_properties': ['bogus'], + 'index': 0, + 'item': 'e0dec518-cb0c-45f3-8c97-21b2659ec129', + 'type': 'User' + } + ] + } + + sample_data_file_name = 'my-data-file' + sample_s3_data_file_location = 'my-s3-data-file-location' + sample_s3_details_location = 'my-s3-details-location' + + assert summary_of_data_validation_errors(error_report_1, + data_file_name=sample_data_file_name, + s3_data_file_location=sample_s3_data_file_location, + s3_details_location=sample_s3_details_location + ) == [ + 'Ingestion data validation error summary:', + 'Data file: my-data-file', + 'Data file in S3: my-s3-data-file-location', + 'Items unidentified: 0', + 'Items missing properties: 0', + 'Items with extraneous properties: 1', + 'Other errors: 0', + 'Exceptions: 0', + 'Details: my-s3-details-location' + ] From 549f3489377a415605c367142b7465189bbee97c Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 20 Oct 2023 13:30:36 -0400 Subject: [PATCH 076/101] Reshuffle caching in SchemaManager so it sees instance-local schemas in all methods. --- dcicutils/validation_utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py index 36284ff5c..fc46f1c9c 100644 --- a/dcicutils/validation_utils.py +++ b/dcicutils/validation_utils.py @@ -40,17 +40,20 @@ def fetch_relevant_schemas(self, schema_names: List[str]): # , schemas: Optiona # if schemas is None: # schemas = self.schemas # The schema_names argument is not normally given, but it is there for easier testing - def fetch_schema(schema_name): - cached_schema = self.schemas.get(schema_name) # schemas.get(schema_name) - schema = self.fetch_schema(schema_name) if cached_schema is None else cached_schema - return schema_name, schema + def name_and_schema(schema_name): + # cached_schema = self.schemas.get(schema_name) # schemas.get(schema_name) + # schema = self.fetch_schema(schema_name) if cached_schema is None else cached_schema + return schema_name, self.fetch_schema(schema_name) return {schema_name: schema - for schema_name, schema in pmap(fetch_schema, schema_names)} + for schema_name, schema in pmap(name_and_schema, schema_names)} def schema_exists(self, schema_name: str): return bool(self.fetch_schema(schema_name=schema_name)) def fetch_schema(self, schema_name: str): + override_schema = self.schemas.get(schema_name) + if override_schema is not None: + return override_schema schema: Optional[AnyJsonData] = self.SCHEMA_CACHE.get(schema_name) if schema is None and schema_name not in self.SCHEMA_CACHE: # If None is already stored, don't look it up again schema = get_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) From 336142cd24877c6cc1282aaf3cc19be9bf492028 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 20 Oct 2023 15:07:53 -0400 Subject: [PATCH 077/101] Fix PEP8 and some static checks. 
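(Hedged sketch, not part of the patch series, of what the caching reshuffle in PATCH 076 above appears to provide: schemas passed explicitly to SchemaManager act as instance-local overrides, consulted before the shared SCHEMA_CACHE or any portal fetch. The class and method names come from this series; the inline schema and portal_env value are illustrative.)

    from dcicutils.validation_utils import SchemaManager

    mgr = SchemaManager(schemas={"user": {"identifyingProperties": ["email"]}},
                        portal_env="data")

    mgr.fetch_schema("user")
    # -> {"identifyingProperties": ["email"]}   (instance override; no portal call,
    #    and the shared SchemaManager.SCHEMA_CACHE is left untouched)

    mgr.identifying_properties(schema_name="user")
    # -> ['email', 'uuid']   ('uuid' is always treated as a possible identifier)

    SchemaManager.identifying_value({"email": "jdoe@example.com"}, ["email", "uuid"])
    # -> 'jdoe@example.com'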
--- dcicutils/bundle_utils.py | 14 +++++--------- dcicutils/misc_utils.py | 6 +----- dcicutils/qa_checkers.py | 2 +- dcicutils/validation_utils.py | 4 ++-- docs/source/dcicutils.rst | 7 +++++++ pyproject.toml | 2 +- test/test_bundle_utils.py | 9 ++------- 7 files changed, 19 insertions(+), 25 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 18f342c02..7748a9056 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -1,6 +1,4 @@ -# import contextlib import copy -import jsonschema from typing import Any, Dict, List, Optional, Union # , Type from .common import AnyJsonData # , Regexp, CsvReader @@ -282,7 +280,8 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any cls.set_path_value(datum[key], more_path, value) @classmethod - def find_type_hint(cls, parsed_header: Optional[ParsedHeader], schema: Any, context: Optional[TypeHintContext] = None): + def find_type_hint(cls, parsed_header: Optional[ParsedHeader], schema: Any, + context: Optional[TypeHintContext] = None): def finder(subheader, subschema): if not parsed_header: @@ -356,7 +355,7 @@ def finder(subheader, subschema): # # def fetch_schema(self, schema_name: str): # schema: Optional[AnyJsonData] = self.SCHEMA_CACHE.get(schema_name) -# if schema is None and schema_name not in self.SCHEMA_CACHE: # If None is already stored, don't look it up again +# if schema is None and schema_name not in self.SCHEMA_CACHE: # If None is already stored, don't look up again # schema = get_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) # self.SCHEMA_CACHE[schema_name] = schema # return schema @@ -380,7 +379,6 @@ def finder(subheader, subschema): ITEM_MANAGER_REGISTRY = TableSetManagerRegistry() - class InflatableTabbedDataManager: """ This tool can be used independently of the item tools. 
It doesn't involve schemas, but it does allow the @@ -502,7 +500,7 @@ def build_lookup_table_for_tab(self, tab_name: str, *, rows: List[Dict]) -> Dict for row in rows: for identifying_property in identifying_properties: value = row.get(identifying_property) - if value is not '' and value is not None: + if value != '' and value is not None: lookup_table[str(value)] = row return lookup_table @@ -606,7 +604,7 @@ def _schema_required_headers(cls, schema): return [] # TODO: Make this compute a list of required headers (in parsed header form) def create_tab_processor_state(self, tab_name: str) -> SheetState: - # This will create state that allows us to efficiently assign values in the right place on each row + # This will create state that allows us to efficiently assign values in the right place on each row return self.SheetState(parsed_headers=self.parsed_headers_by_tab_name[tab_name], type_hints=self.type_hints_by_tab_name[tab_name]) @@ -625,5 +623,3 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional if validate: raise NotImplementedError("Need to implement validation.") # TODO: Implement validation return checked_items - - diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 87e60affd..97a17c21b 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -9,10 +9,6 @@ import inspect import io import json -<<<<<<< HEAD -import os -======= ->>>>>>> master import logging import math import os @@ -305,7 +301,7 @@ def app(self): return self.wrapped_app.app -VirtualAppResponse = webtest.response.TestResponse +VirtualAppResponse = webtest.response.TestResponse # NoQA - PyCharm sees a problem, but none occurs in practice def exported(*variables): diff --git a/dcicutils/qa_checkers.py b/dcicutils/qa_checkers.py index 291135bd1..6eaa13640 100644 --- a/dcicutils/qa_checkers.py +++ b/dcicutils/qa_checkers.py @@ -82,7 +82,7 @@ def _check_version(cls): RAISE_ERROR_IF_CHANGELOG_MISMATCH = True VERSION_LINE_PATTERN = re.compile("^[#* ]*([0-9]+[.][^ \t\n]*)([ \t\n].*)?$") - VERSION_IS_BETA_PATTERN = re.compile("^.*[0-9][Bb][0-9]+$") + VERSION_IS_BETA_PATTERN = re.compile("^.*[0-9]([AaBb][0-9]+|[-][A-Za-z0-9-_.]*)$") @classmethod def _check_change_history(cls, version=None): diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py index fc46f1c9c..3baff6be9 100644 --- a/dcicutils/validation_utils.py +++ b/dcicutils/validation_utils.py @@ -101,7 +101,7 @@ def validate_data_against_schemas(data: TabbedSheetData, where each top-level dictionary property is the name of the data type for the contained schema. These data types are (strings) assumed to be in snake-case form, e.g. "file_submitted". - If there are any missing required properties, any extraneous properties, or any undentified + If there are any absent required properties, any extraneous properties, or any undentified items in the data, then returns a dictionary with an itemized description of each of these errors, otherwise returns None if there are no problems. Note that an unidentified item is one which has no value for uuid nor any of the other identifying property values as defined by the schema. 
@@ -242,7 +242,7 @@ def extract_single_quoted_strings(message: str) -> List[str]: return errors -def summary_of_data_validation_errors(data_validation_errors: Dict, # submission: SmahtSubmissionFolio, +def summary_of_data_validation_errors(data_validation_errors: Dict, # submission: SmahtSubmissionFolio, data_file_name: str, s3_data_file_location: str, s3_details_location: str) -> List[str]: diff --git a/docs/source/dcicutils.rst b/docs/source/dcicutils.rst index 19569fa0c..2d5b8058c 100644 --- a/docs/source/dcicutils.rst +++ b/docs/source/dcicutils.rst @@ -323,6 +323,13 @@ trace_utils :members: +validation_utils +^^^^^^^^^^^^^^^^ + +.. automodule:: dcicutils.validation_utils + :members: + + variant_utils ^^^^^^^^^^^ diff --git a/pyproject.toml b/pyproject.toml index ef767b29a..e5367cf6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.13.0.2b10" # to become "8.0.0" +version = "7.13.0.2-alpha.10" # to become "8.0.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index 5b38b8e0a..b656c55a1 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -397,7 +397,7 @@ def test_load_items_with_schema(): print("Case 2") file_base_name = os.path.splitext(os.path.basename(SAMPLE_CSV_FILE2))[0] expected = SAMPLE_CSV_FILE2_ITEM_CONTENT - actual = load_items(SAMPLE_CSV_FILE2, schemas={file_base_name: {}}, apply_heuristics=True) # schemas=SAMPLE_CSV_FILE2_SCHEMAS) + actual = load_items(SAMPLE_CSV_FILE2, schemas={file_base_name: {}}, apply_heuristics=True) assert actual == expected print("Case 3") @@ -623,7 +623,7 @@ def lookup_sample_insert(item_type, item_ref): if not data: return None for prop in possible_identifying_properties: - if not prop in data[0]: + if prop not in data[0]: continue found = find_association(data, **{prop: item_ref}) if found: @@ -656,9 +656,6 @@ def mocked_get_metadata(obj_id, key=None, ff_env=None, check_queue=False, add_on yield - - - def test_table_checker(): print() # start on a fresh line @@ -683,5 +680,3 @@ def test_table_checker(): checker = TableChecker(SAMPLE_WORKBOOK_WITH_NAME_REFS, portal_env=mock_ff_env) checker.check_tabs() - - From fa1f2722a927db40552afbccf16ecdde2c1a6d12 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 20 Oct 2023 15:33:55 -0400 Subject: [PATCH 078/101] Make sure jsonschema support is loaded. 
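The dependency added below is ``jsonschema-serialize-fork`` (a fork of Julian Berman's ``jsonschema``); the following patch in this series swaps it for the mainline ``jsonschema`` package. Either way, the point is that JSON Schema validation is available to ``validation_utils``. A minimal smoke test, assuming the upstream ``jsonschema`` API and hypothetical data:

    # Raises jsonschema.ValidationError if the instance does not conform; returns None otherwise.
    import jsonschema
    jsonschema.validate(instance={"uuid": "abc"},
                        schema={"type": "object", "required": ["uuid"]})
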
--- poetry.lock | 13 ++++++++++++- pyproject.toml | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index b6f321321..f88755163 100644 --- a/poetry.lock +++ b/poetry.lock @@ -982,6 +982,17 @@ files = [ {file = "jsonc_parser-1.1.5-py3-none-any.whl", hash = "sha256:abd1db76a4c6d1733ec7bb5340a89c49cbc878a181a1e7947ee6719eedf2c6cc"}, ] +[[package]] +name = "jsonschema-serialize-fork" +version = "2.1.1" +description = "Fork of Julian Berman's jsonschema to include support for serializing defaults" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "jsonschema_serialize_fork-2.1.1.tar.gz", hash = "sha256:49b502326ac408729f72c95db018bf0e4d47860e3cd76e944f368f41a5483ed5"}, +] + [[package]] name = "mccabe" version = "0.7.0" @@ -1743,4 +1754,4 @@ tests = ["PasteDeploy", "WSGIProxy2", "coverage", "mock", "nose (<1.3.0)", "pyqu [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.12" -content-hash = "9ac1c380788ebe1f85d68535dfb6294056142928f89fbf317191ddc37ac5cfb4" +content-hash = "5cfcf95408526becaa3c1928dc5661d4895f0cc4c08553bcb5016ef6e6fec4a9" diff --git a/pyproject.toml b/pyproject.toml index dc98d1d94..6723bd73b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ chardet = "^5.2.0" docker = "^4.4.4" gitpython = "^3.1.2" jsonc-parser = "^1.1.5" +jsonschema_serialize_fork = "^2.1.1" openpyxl = "^3.1.2" opensearch-py = "^2.0.1" pyOpenSSL = "^23.1.1" From 2a80c957bcdf0afa15b9ef9a65824ff801fb6053 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 23 Oct 2023 08:53:30 -0400 Subject: [PATCH 079/101] Import jsonschema better. --- poetry.lock | 232 +++++++++++++++++++++++++++++++++++++++++++++++-- pyproject.toml | 2 +- 2 files changed, 227 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index f88755163..73ee5796f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -12,6 +12,25 @@ files = [ {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, ] +[[package]] +name = "attrs" +version = "23.1.0" +description = "Classes Without Boilerplate" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"}, + {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"}, +] + +[package.extras] +cov = ["attrs[tests]", "coverage[toml] (>=5.3)"] +dev = ["attrs[docs,tests]", "pre-commit"] +docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"] +tests = ["attrs[tests-no-zope]", "zope-interface"] +tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] + [[package]] name = "aws-requests-auth" version = "0.4.3" @@ -946,6 +965,25 @@ files = [ {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, ] +[[package]] +name = "importlib-resources" +version = "6.1.0" +description = "Read resources from Python packages" +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "importlib_resources-6.1.0-py3-none-any.whl", hash = "sha256:aa50258bbfa56d4e33fbd8aa3ef48ded10d1735f11532b8df95388cc6bdb7e83"}, + {file = "importlib_resources-6.1.0.tar.gz", hash = 
"sha256:9d48dcccc213325e810fd723e7fbb45ccb39f6cf5c31f00cf2b965f5f10f3cb9"}, +] + +[package.dependencies] +zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-ruff", "zipp (>=3.17)"] + [[package]] name = "iniconfig" version = "2.0.0" @@ -983,16 +1021,45 @@ files = [ ] [[package]] -name = "jsonschema-serialize-fork" -version = "2.1.1" -description = "Fork of Julian Berman's jsonschema to include support for serializing defaults" +name = "jsonschema" +version = "4.19.1" +description = "An implementation of JSON Schema validation for Python" category = "main" optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - {file = "jsonschema_serialize_fork-2.1.1.tar.gz", hash = "sha256:49b502326ac408729f72c95db018bf0e4d47860e3cd76e944f368f41a5483ed5"}, + {file = "jsonschema-4.19.1-py3-none-any.whl", hash = "sha256:cd5f1f9ed9444e554b38ba003af06c0a8c2868131e56bfbef0550fb450c0330e"}, + {file = "jsonschema-4.19.1.tar.gz", hash = "sha256:ec84cc37cfa703ef7cd4928db24f9cb31428a5d0fa77747b8b51a847458e0bbf"}, ] +[package.dependencies] +attrs = ">=22.2.0" +importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} +jsonschema-specifications = ">=2023.03.6" +pkgutil-resolve-name = {version = ">=1.3.10", markers = "python_version < \"3.9\""} +referencing = ">=0.28.4" +rpds-py = ">=0.7.1" + +[package.extras] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] + +[[package]] +name = "jsonschema-specifications" +version = "2023.7.1" +description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "jsonschema_specifications-2023.7.1-py3-none-any.whl", hash = "sha256:05adf340b659828a004220a9613be00fa3f223f2b82002e273dee62fd50524b1"}, + {file = "jsonschema_specifications-2023.7.1.tar.gz", hash = "sha256:c91a50404e88a1f6ba40636778e2ee08f6e24c5613fe4c53ac24578a5a7f72bb"}, +] + +[package.dependencies] +importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} +referencing = ">=0.28.0" + [[package]] name = "mccabe" version = "0.7.0" @@ -1090,6 +1157,18 @@ prettytable = ">=2.3.0" [package.extras] test = ["docutils", "mypy", "pytest-cov", "pytest-pycodestyle", "pytest-runner"] +[[package]] +name = "pkgutil-resolve-name" +version = "1.3.10" +description = "Resolve a name to an object." 
+category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pkgutil_resolve_name-1.3.10-py3-none-any.whl", hash = "sha256:ca27cc078d25c5ad71a9de0a7a330146c4e014c2462d9af19c6b828280649c5e"}, + {file = "pkgutil_resolve_name-1.3.10.tar.gz", hash = "sha256:357d6c9e6a755653cfd78893817c0853af365dd51ec97f3d358a819373bbd174"}, +] + [[package]] name = "pluggy" version = "1.2.0" @@ -1458,6 +1537,22 @@ async-timeout = {version = ">=4.0.2", markers = "python_full_version <= \"3.11.2 hiredis = ["hiredis (>=1.0.0)"] ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==20.0.1)", "requests (>=2.26.0)"] +[[package]] +name = "referencing" +version = "0.30.2" +description = "JSON Referencing + Python" +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "referencing-0.30.2-py3-none-any.whl", hash = "sha256:449b6669b6121a9e96a7f9e410b245d471e8d48964c67113ce9afe50c8dd7bdf"}, + {file = "referencing-0.30.2.tar.gz", hash = "sha256:794ad8003c65938edcdbc027f1933215e0d0ccc0291e3ce20a4d87432b59efc0"}, +] + +[package.dependencies] +attrs = ">=22.2.0" +rpds-py = ">=0.7.0" + [[package]] name = "requests" version = "2.31.0" @@ -1495,6 +1590,115 @@ files = [ [package.extras] idna2008 = ["idna"] +[[package]] +name = "rpds-py" +version = "0.10.6" +description = "Python bindings to Rust's persistent data structures (rpds)" +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "rpds_py-0.10.6-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:6bdc11f9623870d75692cc33c59804b5a18d7b8a4b79ef0b00b773a27397d1f6"}, + {file = "rpds_py-0.10.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:26857f0f44f0e791f4a266595a7a09d21f6b589580ee0585f330aaccccb836e3"}, + {file = "rpds_py-0.10.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7f5e15c953ace2e8dde9824bdab4bec50adb91a5663df08d7d994240ae6fa31"}, + {file = "rpds_py-0.10.6-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:61fa268da6e2e1cd350739bb61011121fa550aa2545762e3dc02ea177ee4de35"}, + {file = "rpds_py-0.10.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c48f3fbc3e92c7dd6681a258d22f23adc2eb183c8cb1557d2fcc5a024e80b094"}, + {file = "rpds_py-0.10.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c0503c5b681566e8b722fe8c4c47cce5c7a51f6935d5c7012c4aefe952a35eed"}, + {file = "rpds_py-0.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:734c41f9f57cc28658d98270d3436dba65bed0cfc730d115b290e970150c540d"}, + {file = "rpds_py-0.10.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a5d7ed104d158c0042a6a73799cf0eb576dfd5fc1ace9c47996e52320c37cb7c"}, + {file = "rpds_py-0.10.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e3df0bc35e746cce42579826b89579d13fd27c3d5319a6afca9893a9b784ff1b"}, + {file = "rpds_py-0.10.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:73e0a78a9b843b8c2128028864901f55190401ba38aae685350cf69b98d9f7c9"}, + {file = "rpds_py-0.10.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5ed505ec6305abd2c2c9586a7b04fbd4baf42d4d684a9c12ec6110deefe2a063"}, + {file = "rpds_py-0.10.6-cp310-none-win32.whl", hash = "sha256:d97dd44683802000277bbf142fd9f6b271746b4846d0acaf0cefa6b2eaf2a7ad"}, + {file = "rpds_py-0.10.6-cp310-none-win_amd64.whl", hash = "sha256:b455492cab07107bfe8711e20cd920cc96003e0da3c1f91297235b1603d2aca7"}, + {file = "rpds_py-0.10.6-cp311-cp311-macosx_10_7_x86_64.whl", hash = 
"sha256:e8cdd52744f680346ff8c1ecdad5f4d11117e1724d4f4e1874f3a67598821069"}, + {file = "rpds_py-0.10.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:66414dafe4326bca200e165c2e789976cab2587ec71beb80f59f4796b786a238"}, + {file = "rpds_py-0.10.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc435d059f926fdc5b05822b1be4ff2a3a040f3ae0a7bbbe672babb468944722"}, + {file = "rpds_py-0.10.6-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8e7f2219cb72474571974d29a191714d822e58be1eb171f229732bc6fdedf0ac"}, + {file = "rpds_py-0.10.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3953c6926a63f8ea5514644b7afb42659b505ece4183fdaaa8f61d978754349e"}, + {file = "rpds_py-0.10.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2bb2e4826be25e72013916eecd3d30f66fd076110de09f0e750163b416500721"}, + {file = "rpds_py-0.10.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bf347b495b197992efc81a7408e9a83b931b2f056728529956a4d0858608b80"}, + {file = "rpds_py-0.10.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:102eac53bb0bf0f9a275b438e6cf6904904908562a1463a6fc3323cf47d7a532"}, + {file = "rpds_py-0.10.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:40f93086eef235623aa14dbddef1b9fb4b22b99454cb39a8d2e04c994fb9868c"}, + {file = "rpds_py-0.10.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e22260a4741a0e7a206e175232867b48a16e0401ef5bce3c67ca5b9705879066"}, + {file = "rpds_py-0.10.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f4e56860a5af16a0fcfa070a0a20c42fbb2012eed1eb5ceeddcc7f8079214281"}, + {file = "rpds_py-0.10.6-cp311-none-win32.whl", hash = "sha256:0774a46b38e70fdde0c6ded8d6d73115a7c39d7839a164cc833f170bbf539116"}, + {file = "rpds_py-0.10.6-cp311-none-win_amd64.whl", hash = "sha256:4a5ee600477b918ab345209eddafde9f91c0acd931f3776369585a1c55b04c57"}, + {file = "rpds_py-0.10.6-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:5ee97c683eaface61d38ec9a489e353d36444cdebb128a27fe486a291647aff6"}, + {file = "rpds_py-0.10.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0713631d6e2d6c316c2f7b9320a34f44abb644fc487b77161d1724d883662e31"}, + {file = "rpds_py-0.10.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5a53f5998b4bbff1cb2e967e66ab2addc67326a274567697379dd1e326bded7"}, + {file = "rpds_py-0.10.6-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6a555ae3d2e61118a9d3e549737bb4a56ff0cec88a22bd1dfcad5b4e04759175"}, + {file = "rpds_py-0.10.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:945eb4b6bb8144909b203a88a35e0a03d22b57aefb06c9b26c6e16d72e5eb0f0"}, + {file = "rpds_py-0.10.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:52c215eb46307c25f9fd2771cac8135d14b11a92ae48d17968eda5aa9aaf5071"}, + {file = "rpds_py-0.10.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1b3cd23d905589cb205710b3988fc8f46d4a198cf12862887b09d7aaa6bf9b9"}, + {file = "rpds_py-0.10.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:64ccc28683666672d7c166ed465c09cee36e306c156e787acef3c0c62f90da5a"}, + {file = "rpds_py-0.10.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:516a611a2de12fbea70c78271e558f725c660ce38e0006f75139ba337d56b1f6"}, + {file = "rpds_py-0.10.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9ff93d3aedef11f9c4540cf347f8bb135dd9323a2fc705633d83210d464c579d"}, + {file 
= "rpds_py-0.10.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d858532212f0650be12b6042ff4378dc2efbb7792a286bee4489eaa7ba010586"}, + {file = "rpds_py-0.10.6-cp312-none-win32.whl", hash = "sha256:3c4eff26eddac49d52697a98ea01b0246e44ca82ab09354e94aae8823e8bda02"}, + {file = "rpds_py-0.10.6-cp312-none-win_amd64.whl", hash = "sha256:150eec465dbc9cbca943c8e557a21afdcf9bab8aaabf386c44b794c2f94143d2"}, + {file = "rpds_py-0.10.6-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:cf693eb4a08eccc1a1b636e4392322582db2a47470d52e824b25eca7a3977b53"}, + {file = "rpds_py-0.10.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4134aa2342f9b2ab6c33d5c172e40f9ef802c61bb9ca30d21782f6e035ed0043"}, + {file = "rpds_py-0.10.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e782379c2028a3611285a795b89b99a52722946d19fc06f002f8b53e3ea26ea9"}, + {file = "rpds_py-0.10.6-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2f6da6d842195fddc1cd34c3da8a40f6e99e4a113918faa5e60bf132f917c247"}, + {file = "rpds_py-0.10.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b4a9fe992887ac68256c930a2011255bae0bf5ec837475bc6f7edd7c8dfa254e"}, + {file = "rpds_py-0.10.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b788276a3c114e9f51e257f2a6f544c32c02dab4aa7a5816b96444e3f9ffc336"}, + {file = "rpds_py-0.10.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:caa1afc70a02645809c744eefb7d6ee8fef7e2fad170ffdeacca267fd2674f13"}, + {file = "rpds_py-0.10.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bddd4f91eede9ca5275e70479ed3656e76c8cdaaa1b354e544cbcf94c6fc8ac4"}, + {file = "rpds_py-0.10.6-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:775049dfa63fb58293990fc59473e659fcafd953bba1d00fc5f0631a8fd61977"}, + {file = "rpds_py-0.10.6-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:c6c45a2d2b68c51fe3d9352733fe048291e483376c94f7723458cfd7b473136b"}, + {file = "rpds_py-0.10.6-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0699ab6b8c98df998c3eacf51a3b25864ca93dab157abe358af46dc95ecd9801"}, + {file = "rpds_py-0.10.6-cp38-none-win32.whl", hash = "sha256:ebdab79f42c5961682654b851f3f0fc68e6cc7cd8727c2ac4ffff955154123c1"}, + {file = "rpds_py-0.10.6-cp38-none-win_amd64.whl", hash = "sha256:24656dc36f866c33856baa3ab309da0b6a60f37d25d14be916bd3e79d9f3afcf"}, + {file = "rpds_py-0.10.6-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:0898173249141ee99ffcd45e3829abe7bcee47d941af7434ccbf97717df020e5"}, + {file = "rpds_py-0.10.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9e9184fa6c52a74a5521e3e87badbf9692549c0fcced47443585876fcc47e469"}, + {file = "rpds_py-0.10.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5752b761902cd15073a527b51de76bbae63d938dc7c5c4ad1e7d8df10e765138"}, + {file = "rpds_py-0.10.6-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:99a57006b4ec39dbfb3ed67e5b27192792ffb0553206a107e4aadb39c5004cd5"}, + {file = "rpds_py-0.10.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09586f51a215d17efdb3a5f090d7cbf1633b7f3708f60a044757a5d48a83b393"}, + {file = "rpds_py-0.10.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e225a6a14ecf44499aadea165299092ab0cba918bb9ccd9304eab1138844490b"}, + {file = "rpds_py-0.10.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2039f8d545f20c4e52713eea51a275e62153ee96c8035a32b2abb772b6fc9e5"}, + {file = 
"rpds_py-0.10.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:34ad87a831940521d462ac11f1774edf867c34172010f5390b2f06b85dcc6014"}, + {file = "rpds_py-0.10.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dcdc88b6b01015da066da3fb76545e8bb9a6880a5ebf89e0f0b2e3ca557b3ab7"}, + {file = "rpds_py-0.10.6-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:25860ed5c4e7f5e10c496ea78af46ae8d8468e0be745bd233bab9ca99bfd2647"}, + {file = "rpds_py-0.10.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7854a207ef77319ec457c1eb79c361b48807d252d94348305db4f4b62f40f7f3"}, + {file = "rpds_py-0.10.6-cp39-none-win32.whl", hash = "sha256:e6fcc026a3f27c1282c7ed24b7fcac82cdd70a0e84cc848c0841a3ab1e3dea2d"}, + {file = "rpds_py-0.10.6-cp39-none-win_amd64.whl", hash = "sha256:e98c4c07ee4c4b3acf787e91b27688409d918212dfd34c872201273fdd5a0e18"}, + {file = "rpds_py-0.10.6-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:68fe9199184c18d997d2e4293b34327c0009a78599ce703e15cd9a0f47349bba"}, + {file = "rpds_py-0.10.6-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:3339eca941568ed52d9ad0f1b8eb9fe0958fa245381747cecf2e9a78a5539c42"}, + {file = "rpds_py-0.10.6-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a360cfd0881d36c6dc271992ce1eda65dba5e9368575663de993eeb4523d895f"}, + {file = "rpds_py-0.10.6-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:031f76fc87644a234883b51145e43985aa2d0c19b063e91d44379cd2786144f8"}, + {file = "rpds_py-0.10.6-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f36a9d751f86455dc5278517e8b65580eeee37d61606183897f122c9e51cef3"}, + {file = "rpds_py-0.10.6-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:052a832078943d2b2627aea0d19381f607fe331cc0eb5df01991268253af8417"}, + {file = "rpds_py-0.10.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:023574366002bf1bd751ebaf3e580aef4a468b3d3c216d2f3f7e16fdabd885ed"}, + {file = "rpds_py-0.10.6-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:defa2c0c68734f4a82028c26bcc85e6b92cced99866af118cd6a89b734ad8e0d"}, + {file = "rpds_py-0.10.6-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:879fb24304ead6b62dbe5034e7b644b71def53c70e19363f3c3be2705c17a3b4"}, + {file = "rpds_py-0.10.6-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:53c43e10d398e365da2d4cc0bcaf0854b79b4c50ee9689652cdc72948e86f487"}, + {file = "rpds_py-0.10.6-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:3777cc9dea0e6c464e4b24760664bd8831738cc582c1d8aacf1c3f546bef3f65"}, + {file = "rpds_py-0.10.6-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:40578a6469e5d1df71b006936ce95804edb5df47b520c69cf5af264d462f2cbb"}, + {file = "rpds_py-0.10.6-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:cf71343646756a072b85f228d35b1d7407da1669a3de3cf47f8bbafe0c8183a4"}, + {file = "rpds_py-0.10.6-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:10f32b53f424fc75ff7b713b2edb286fdbfc94bf16317890260a81c2c00385dc"}, + {file = "rpds_py-0.10.6-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:81de24a1c51cfb32e1fbf018ab0bdbc79c04c035986526f76c33e3f9e0f3356c"}, + {file = "rpds_py-0.10.6-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac17044876e64a8ea20ab132080ddc73b895b4abe9976e263b0e30ee5be7b9c2"}, + {file = 
"rpds_py-0.10.6-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5e8a78bd4879bff82daef48c14d5d4057f6856149094848c3ed0ecaf49f5aec2"}, + {file = "rpds_py-0.10.6-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78ca33811e1d95cac8c2e49cb86c0fb71f4d8409d8cbea0cb495b6dbddb30a55"}, + {file = "rpds_py-0.10.6-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c63c3ef43f0b3fb00571cff6c3967cc261c0ebd14a0a134a12e83bdb8f49f21f"}, + {file = "rpds_py-0.10.6-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:7fde6d0e00b2fd0dbbb40c0eeec463ef147819f23725eda58105ba9ca48744f4"}, + {file = "rpds_py-0.10.6-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:79edd779cfc46b2e15b0830eecd8b4b93f1a96649bcb502453df471a54ce7977"}, + {file = "rpds_py-0.10.6-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:9164ec8010327ab9af931d7ccd12ab8d8b5dc2f4c6a16cbdd9d087861eaaefa1"}, + {file = "rpds_py-0.10.6-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:d29ddefeab1791e3c751e0189d5f4b3dbc0bbe033b06e9c333dca1f99e1d523e"}, + {file = "rpds_py-0.10.6-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:30adb75ecd7c2a52f5e76af50644b3e0b5ba036321c390b8e7ec1bb2a16dd43c"}, + {file = "rpds_py-0.10.6-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd609fafdcdde6e67a139898196698af37438b035b25ad63704fd9097d9a3482"}, + {file = "rpds_py-0.10.6-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6eef672de005736a6efd565577101277db6057f65640a813de6c2707dc69f396"}, + {file = "rpds_py-0.10.6-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cf4393c7b41abbf07c88eb83e8af5013606b1cdb7f6bc96b1b3536b53a574b8"}, + {file = "rpds_py-0.10.6-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ad857f42831e5b8d41a32437f88d86ead6c191455a3499c4b6d15e007936d4cf"}, + {file = "rpds_py-0.10.6-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d7360573f1e046cb3b0dceeb8864025aa78d98be4bb69f067ec1c40a9e2d9df"}, + {file = "rpds_py-0.10.6-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d08f63561c8a695afec4975fae445245386d645e3e446e6f260e81663bfd2e38"}, + {file = "rpds_py-0.10.6-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:f0f17f2ce0f3529177a5fff5525204fad7b43dd437d017dd0317f2746773443d"}, + {file = "rpds_py-0.10.6-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:442626328600bde1d09dc3bb00434f5374948838ce75c41a52152615689f9403"}, + {file = "rpds_py-0.10.6-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:e9616f5bd2595f7f4a04b67039d890348ab826e943a9bfdbe4938d0eba606971"}, + {file = "rpds_py-0.10.6.tar.gz", hash = "sha256:4ce5a708d65a8dbf3748d2474b580d606b1b9f91b5c6ab2a316e0b0cf7a4ba50"}, +] + [[package]] name = "s3transfer" version = "0.7.0" @@ -1751,7 +1955,23 @@ WebOb = ">=1.2" docs = ["Sphinx (>=1.8.1)", "docutils", "pylons-sphinx-themes (>=1.0.8)"] tests = ["PasteDeploy", "WSGIProxy2", "coverage", "mock", "nose (<1.3.0)", "pyquery"] +[[package]] +name = "zipp" +version = "3.17.0" +description = "Backport of pathlib-compatible object wrapper for zip files" +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "zipp-3.17.0-py3-none-any.whl", hash = "sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31"}, + {file = "zipp-3.17.0.tar.gz", hash = 
"sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] + [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.12" -content-hash = "5cfcf95408526becaa3c1928dc5661d4895f0cc4c08553bcb5016ef6e6fec4a9" +content-hash = "3617e1e3d479d0955f9107113d56e04d5692e74d73d37beba083e53ce83f4795" diff --git a/pyproject.toml b/pyproject.toml index 6723bd73b..9e0b03cca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ chardet = "^5.2.0" docker = "^4.4.4" gitpython = "^3.1.2" jsonc-parser = "^1.1.5" -jsonschema_serialize_fork = "^2.1.1" +jsonschema = "^4.19.0" openpyxl = "^3.1.2" opensearch-py = "^2.0.1" pyOpenSSL = "^23.1.1" From f2a1c4f2e50531e6a113d1b56a36a78e04abb1bb Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 23 Oct 2023 12:15:26 -0400 Subject: [PATCH 080/101] Tidy things up Will's code review. --- dcicutils/bundle_utils.py | 129 ++++++++-------------------------- dcicutils/common.py | 4 +- dcicutils/sheet_utils.py | 44 +++++++----- dcicutils/validation_utils.py | 23 +----- test/test_bundle_utils.py | 37 ++-------- 5 files changed, 66 insertions(+), 171 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 7748a9056..5e37e8660 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -1,25 +1,16 @@ import copy -from typing import Any, Dict, List, Optional, Union # , Type -from .common import AnyJsonData # , Regexp, CsvReader +from typing import Any, Dict, List, Optional, Union +from .common import AnyJsonData from .env_utils import EnvUtils, public_env_name -from .ff_utils import get_metadata # , get_schema +from .ff_utils import get_metadata from .lang_utils import there_are from .misc_utils import AbstractVirtualApp, ignored, PRINT, to_camel_case from .sheet_utils import ( - TabbedJsonSchemas, LoadTableError, prefer_number, - Header, Headers, TabbedHeaders, - ParsedHeader, ParsedHeaders, TabbedParsedHeaders, - SheetCellValue, TabbedSheetData, # SheetRow, SheetData, - TableSetManagerRegistry, AbstractTableSetManager, # BasicTableSetManager, - # CsvManager, TsvManager, XlsxManager, - # SimpleJsonInsertsManager, SimpleYamlInsertsManager, SimpleJsonLinesInsertsManager, - # TabbedJsonInsertsManager, TabbedYamlInsertsManager, - # InsertsDirectoryManager, - InsertsManager, - load_table_set + LoadTableError, prefer_number, TabbedJsonSchemas, + Header, Headers, TabbedHeaders, ParsedHeader, ParsedHeaders, TabbedParsedHeaders, SheetCellValue, TabbedSheetData, + TableSetManagerRegistry, AbstractTableSetManager, InsertsManager, load_table_set, ) -# from .task_utils import pmap from .validation_utils import SchemaManager @@ -27,21 +18,6 @@ TabbedPatchPrototypes = Dict[str, PatchPrototype] -# @contextlib.contextmanager -# def deferred_problems(): -# problems = [] -# -# def note_problem(problem): -# problems.append(problem) -# -# yield note_problem -# -# if problems: -# for problem in problems: -# PRINT(f"Problem: {problem}") -# raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False)) - - class TypeHintContext: @classmethod @@ -240,14 +216,28 @@ def 
assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: Parsed @classmethod def parse_item_value(cls, value: SheetCellValue, apply_heuristics: bool = False, split_pipe: bool = False) -> AnyJsonData: + """ + Returns the item value unmodified, unless apply_heuristics=True is given, + in which case heuristics ARE applied. This is intended to be used for spreadsheet + values that look like non-strings and should perhaps be interepreted as such. + + This is a vestige of an older plan to have these things happen magically behind the scenes early in + the process. Unfortunately, that was found to impede correct processing later, so now this is disabled + by default. It may still be useful in some cases when dealing with data that has no schema, so the + functionality is still here and must be explicitly requested. + + :param value: a value in a table (such as a spreadsheet) + :param apply_heuristics: whether to apply heuristic coercions based on what the value looks like (default False) + :param split_pipe: whether to apply the 'split pipe' heuristic, changing 'a|1' to ['a', 1], even if + apply_heuristics=True was given (default False) + """ if not apply_heuristics: + # In order to not interfere with schema-driven processing, we mostly default to + # NOT applying heuristics. You have to ask for them explicitly if you want them. + # -kmp 23-Oct-2023 return value - # TODO: Remodularize this for easier testing and more Schema-driven effect - # Doug asks that this be broken up into different mechanisms, more modular and separately testable. - # I pretty much agree with that. I'm just waiting for suggestions on what kinds of features are desired. if isinstance(value, str): lvalue = value.lower() - # TODO: We could consult a schema to make this less heuristic, but this may do for now if lvalue == 'true': return True elif lvalue == 'false': @@ -316,66 +306,6 @@ def finder(subheader, subschema): return finder(subheader=parsed_header, subschema=schema) -# class SchemaManager: -# -# SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. -# -# @classmethod -# @contextlib.contextmanager -# def fresh_schema_manager_context_for_testing(cls): -# old_schema_cache = cls.SCHEMA_CACHE -# try: -# cls.SCHEMA_CACHE = {} -# yield -# finally: -# cls.SCHEMA_CACHE = old_schema_cache -# -# def __init__(self, schemas: Optional[TabbedSchemas] = None, -# portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): -# if portal_env is None and portal_vapp is None: -# portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) -# PRINT(f"The portal_env was not explicitly supplied. 
Schemas will come from portal_env={portal_env!r}.") -# self.portal_env = portal_env -# self.portal_vapp = portal_vapp -# self.schemas = {} if schemas is None else schemas.copy() -# -# def fetch_relevant_schemas(self, schema_names: List[str]): # , schemas: Optional[TabbedSchemas] = None): -# # if schemas is None: -# # schemas = self.schemas -# # The schema_names argument is not normally given, but it is there for easier testing -# def fetch_schema(schema_name): -# cached_schema = self.schemas.get(schema_name) # schemas.get(schema_name) -# schema = self.fetch_schema(schema_name) if cached_schema is None else cached_schema -# return schema_name, schema -# return {schema_name: schema -# for schema_name, schema in pmap(fetch_schema, schema_names)} -# -# def schema_exists(self, schema_name: str): -# return bool(self.fetch_schema(schema_name=schema_name)) -# -# def fetch_schema(self, schema_name: str): -# schema: Optional[AnyJsonData] = self.SCHEMA_CACHE.get(schema_name) -# if schema is None and schema_name not in self.SCHEMA_CACHE: # If None is already stored, don't look up again -# schema = get_schema(schema_name, portal_env=self.portal_env, portal_vapp=self.portal_vapp) -# self.SCHEMA_CACHE[schema_name] = schema -# return schema -# -# @classmethod -# def clear_schema_cache(cls): -# for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first -# cls.SCHEMA_CACHE.pop(key, None) -# -# def identifying_properties(self, schema=None, schema_name=None, among: Optional[List[str]] = None): -# schema = schema if schema is not None else self.fetch_schema(schema_name) -# possible_identifying_properties = set(schema.get("identifyingProperties") or []) | {'uuid'} -# identifying_properties = sorted(possible_identifying_properties -# if among is None -# else (prop -# for prop in among -# if prop in possible_identifying_properties)) -# return identifying_properties - - ITEM_MANAGER_REGISTRY = TableSetManagerRegistry() @@ -486,11 +416,6 @@ def note_problem(self, problem: str): self._problems.append(problem) def build_lookup_table_for_tab(self, tab_name: str, *, rows: List[Dict]) -> Dict[str, Dict]: - # schema = self.schema_for_tab(tab_name) - # possible_identifying_properties = set(schema.get("identifyingProperties") or []) | {'uuid'} - # identifying_properties = [prop - # for prop in self.headers_by_tab_name[tab_name] - # if prop in possible_identifying_properties] identifying_properties = self.schema_manager.identifying_properties(schema_name=tab_name) if not identifying_properties: # Maybe issue a warning here that we're going to lose @@ -621,5 +546,9 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional checked_items = check(tabbed_rows, schemas=schemas, portal_env=portal_env, portal_vapp=portal_vapp, apply_heuristics=apply_heuristics) if validate: - raise NotImplementedError("Need to implement validation.") # TODO: Implement validation + # TODO: Maybe connect validation here. Although another option is to just call validation separately + # once this is successfully loaded. Needs thought. However, David's validation_utils can do + # the validation if we decide to do it, it would just need to be connected up. 
+ # -kmp 23-Oct-2023 + raise NotImplementedError("Need to implement validation.") return checked_items diff --git a/dcicutils/common.py b/dcicutils/common.py index 876904890..1f3c163ce 100644 --- a/dcicutils/common.py +++ b/dcicutils/common.py @@ -39,8 +39,6 @@ LIBRARY_DIR = os.path.dirname(__file__) -Regexp = type(re.compile("sample")) - # ===== Auth Data ===== AuthStr = str @@ -59,6 +57,8 @@ # ===== JSON Data ===== +JsonSchema = Dict + AnyJsonData = Union[Dict[str, 'AnyJsonData'], List['AnyJsonData'], str, bool, int, float, None] KeyValueDict = Dict[Literal['Key', 'Value'], Any] diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 53280b5a5..9242bb765 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -1,8 +1,6 @@ import contextlib import chardet -# import contextlib -# import copy import csv import glob import io @@ -11,19 +9,15 @@ import os import re import subprocess -# import uuid import yaml from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook from tempfile import TemporaryFile, TemporaryDirectory from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union -from .common import AnyJsonData, Regexp -# from .env_utils import public_env_name, EnvUtils -# from .ff_utils import get_schema +from .common import AnyJsonData, Regexp, JsonSchema from .lang_utils import conjoined_list, disjoined_list, maybe_pluralize # , there_are from .misc_utils import ignored, pad_to, JsonLinesReader, remove_suffix # , PRINT, AbstractVirtualApp -# from .task_utils import pmap Header = str @@ -37,7 +31,6 @@ CsvReader = type(csv.reader(TemporaryFile())) SheetData = List[dict] TabbedSheetData = Dict[str, SheetData] -JsonSchema = Dict TabbedJsonSchemas = Dict[str, JsonSchema] @@ -77,6 +70,17 @@ def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): def prefer_number(value: SheetCellValue, kind='num'): + """ + Given a string, if the string has number syntax, returns the number it represents. Otherwise, returns its argument. + (It follows from this that if given an int or a float, it just returns that argument.) + + Using a kind= argument (as in kind='int' or kind='float') can better restrict what kind of number a string + is coerced to, but it has no effect when the argument is a number, even a number of the wrong kind. + + :param value: a string, int, or float + :param kind: one of 'num', 'int', or 'float' + :returns: the argument coerced to a number of the appropriate kind, if possible, or else the argument literally + """ if isinstance(value, str): # the given value might be an int or float, in which case just fall through if not value: return None @@ -95,10 +99,26 @@ def prefer_number(value: SheetCellValue, kind='num'): pass # If we couldn't parse it as an int or float, fall through to returning the original value pass + # NOTE WELL: + # In the case where we already have a number, even if it's the wrong type, we just leave it as we got it. + # The job of this function is not to do type enforcement or correctness checking, but rather to adjust + # for the fact that spreadsheets and csv files often pass string data where they mean to pass numbers. + # If some human has already been thinking about what to pass in a JSON or other such setting, + # this function is not trying to be smart enough to second-guess that. return value def expand_string_escape_sequences(text: str) -> str: + """ + Expands string escape sequences in a commonly used way. 
+ A backslash followed by one of the following characters is expanded as indicated: + r (return or CR) - ASCII 13 decimal, 15 octal, 0d hex + n (newline or linefeed or LF) - ASCII 10 decimal, 12 octal, 0a hex + t (tab) - ASCII 9 decimal, 11 octal, 9 hex + f (formfeed or page) - ASCII 12 decimal, 14 octal, 0c hex + \\ (backslash) - ASCII 92 decimal, 134 octal, 5c hex + In all other situations, the backslash is left uninterpreted. + """ s = io.StringIO() escaping = False for ch in text: @@ -134,13 +154,6 @@ def open_unicode_text_input_file_respecting_byte_order_mark(filename): return io.open(filename, 'r', encoding=use_encoding) -# TODO: Consider whether this might want to be an abstract base class. Some change might be needed. -# -# Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class. -# I am less certain but open to discussion. Among other things, as implemented now, -# the __init__ method here needs to run and the documentation says that ABC's won't appear -# in the method resolution order. -kmp 17-Aug-2023 -# See also discussion at https://github.com/4dn-dcic/utils/pull/276#discussion_r1297775535 class AbstractTableSetManager: """ The TableSetManager is the spanning class of anything that wants to be able to load a table set, @@ -176,7 +189,6 @@ def __init__(self, filename: str, prefer_number: Optional[bool] = None, **kwargs self.filename: str = filename unwanted_kwargs(context=self.__class__.__name__, kwargs=kwargs) - # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.) @classmethod def load(cls, filename: str, **kwargs) -> TabbedSheetData: """ diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py index 3baff6be9..3e8169000 100644 --- a/dcicutils/validation_utils.py +++ b/dcicutils/validation_utils.py @@ -150,23 +150,6 @@ def validate_data_against_schemas(data: TabbedSheetData, schema_manager = SchemaManager(portal_vapp=portal_vapp, schemas=schemas) - # def fetch_relevant_schemas(schema_names: List, portal_vapp: VirtualApp) -> List: - # def fetch_schema(schema_name: str) -> Optional[Dict]: - # return schema_name, get_schema(schema_name, portal_vapp=portal_vapp) - # return {schema_name: schema for schema_name, schema in pmap(fetch_schema, schema_names)} - # - # errors = [] - # - # if not schemas: - # if not portal_vapp: - # raise Exception("Must specify portal_vapp if no schemas specified.") - # try: - # schema_names = [data_type for data_type in data] - # schemas = fetch_relevant_schemas(schema_names, portal_vapp=portal_vapp) - # except Exception as e: - # errors.append({"exception": f"Exception fetching relevant schemas: {get_error_message(e)}"}) - # schemas = {} - errors = [] schemas = schema_manager.fetch_relevant_schemas(list(data.keys())) @@ -242,7 +225,8 @@ def extract_single_quoted_strings(message: str) -> List[str]: return errors -def summary_of_data_validation_errors(data_validation_errors: Dict, # submission: SmahtSubmissionFolio, +def summary_of_data_validation_errors(data_validation_errors: Dict, + # These next three items are available from a portal's SubmissionFolio data_file_name: str, s3_data_file_location: str, s3_details_location: str) -> List[str]: @@ -276,15 +260,12 @@ def summary_of_data_validation_errors(data_validation_errors: Dict, # submissio return [ f"Ingestion data validation error summary:", - # f"Data file: {submission.data_file_name}", f"Data file: {data_file_name}", - # f"Data file in S3: {submission.s3_data_file_location}", f"Data file in 
S3: {s3_data_file_location}", f"Items unidentified: {unidentified_count}", f"Items missing properties: {missing_properties_count}", f"Items with extraneous properties: {extraneous_properties_count}", f"Other errors: {unclassified_error_count}", f"Exceptions: {exception_count}", - # f"Details: {submission.s3_details_location}" f"Details: {s3_details_location}" ] diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index b656c55a1..ce716878f 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -1,12 +1,10 @@ import contextlib -# import copy import glob import json import os import pytest import re -# from collections import namedtuple from dcicutils import ( bundle_utils as bundle_utils_module, ff_utils as ff_utils_module, @@ -14,11 +12,11 @@ ) from dcicutils.bundle_utils import ( # High-level interfaces - load_table_structures, load_items, # inflate, + load_table_structures, load_items, # Low-level implementation SchemaManager, ItemTools, TableChecker, - # XlsxItemManager, CsvItemManager, TsvItemManager, ... - BoolHint, # NumHint, TypeHint, EnumHint, RefHint, ... + BoolHint, + # Probably we should test NumHint, TypeHint, EnumHint, RefHint, etc. as well. -kmp 23-Oct-2023 ) from dcicutils.common import AnyJsonData from dcicutils.env_utils import EnvUtils, public_env_name @@ -27,19 +25,9 @@ ) from dcicutils.qa_utils import printed_output, mock_not_called, MockResponse from dcicutils.sheet_utils import ( - # High-level interfaces - # TABLE_SET_MANAGER_REGISTRY, - # Low-level implementation - # BasicTableSetManager, - # XlsxManager, - CsvManager, # TsvManager, - # Error handling - LoadArgumentsError, LoadTableError, # LoadFailure, - # Utilities - infer_tab_name_from_filename, # prefer_number, unwanted_kwargs, expand_string_escape_sequences, - load_table_set, + CsvManager, LoadArgumentsError, LoadTableError, + infer_tab_name_from_filename, load_table_set, ) -# from dcicutils.validation_utils import validate_data_against_schemas, summary_of_data_validation_errors from typing import Dict from unittest import mock from .conftest_settings import TEST_DIR @@ -422,21 +410,6 @@ def test_sample_items_json_vs_yaml(): assert tabs_data_from_json == tabs_data_from_yaml -# @pytest.mark.parametrize('instaguids_enabled', [True, False]) -# def test_load_items_with_schema_and_instaguids(instaguids_enabled): -# -# with local_attrs(ItemTools, INSTAGUIDS_ENABLED=instaguids_enabled): -# -# expected = SAMPLE_CSV_FILE3_PERSON_CONTENT_HINTED -# print("expected=", json.dumps(expected, indent=2)) -# actual = load_items(SAMPLE_CSV_FILE3, schemas=SAMPLE_CSV_FILE3_SCHEMAS, tab_name='Person') -# print("actual=", json.dumps(actual, indent=2)) -# if instaguids_enabled: -# assert matches_template(actual, expected) -# else: -# assert actual == expected # no substitution performed - - @using_fresh_ff_state_for_testing() @pytest.mark.integrated @pytest.mark.parametrize('portal_env', [None, 'data']) From 9992ad57b9f65a5f5a49d49cf0d4fa88a3b97007 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 23 Oct 2023 13:05:22 -0400 Subject: [PATCH 081/101] Bump alpha version. 
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9e0b03cca..436b65dac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.0.0.1-alpha.11" # to become "8.1.0" +version = "8.0.0.1-alpha.12" # to become "8.1.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From e6815f770a230ba12b3c768489d73de4c411a0cb Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 23 Oct 2023 13:49:03 -0400 Subject: [PATCH 082/101] Rearrange some items in dcicutils/common.py. No functional change. --- dcicutils/common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dcicutils/common.py b/dcicutils/common.py index 1f3c163ce..a390db668 100644 --- a/dcicutils/common.py +++ b/dcicutils/common.py @@ -67,8 +67,7 @@ KeyValuestringDict = Dict[Literal['Key', 'Value'], str] KeyValuestringDictList = List[KeyValuestringDict] -S3KeyName = str -S3BucketName = str +# ===== Miscellaneous Data ===== UrlString = str @@ -80,6 +79,9 @@ # ===== AWS Data ===== +S3KeyName = str +S3BucketName = str + # Refs: # * https://docs.aws.amazon.com/AmazonS3/latest/userguide/storage-class-intro.html # * https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html From 7cb98bbcd16fcd45efbdc699c54726e874028eb5 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 23 Oct 2023 13:49:30 -0400 Subject: [PATCH 083/101] Revert some changes to glacier_utils.py --- dcicutils/glacier_utils.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/dcicutils/glacier_utils.py b/dcicutils/glacier_utils.py index bbcf77893..7609ab316 100644 --- a/dcicutils/glacier_utils.py +++ b/dcicutils/glacier_utils.py @@ -58,6 +58,10 @@ def __init__(self, env_name: str): self.env_key = self.key_manager.get_keydict_for_env(env_name) self.health_page = get_health_page(key=self.env_key, ff_env=env_name) + @property + def kms_key_id(self) -> str: + return self.health_page.get("s3_encrypt_key_id", "") + @classmethod def is_glacier_storage_class(cls, storage_class: S3StorageClass): return storage_class in S3_GLACIER_CLASSES @@ -295,6 +299,9 @@ def _do_multipart_upload(self, bucket: str, key: str, total_size: int, part_size } if tags: cmu['Tagging'] = tags + if self.kms_key_id: + cmu['ServerSideEncryption'] = 'aws:kms' + cmu['SSEKMSKeyId'] = self.kms_key_id mpu = self.s3.create_multipart_upload(**cmu) mpu_upload_id = mpu['UploadId'] except Exception as e: @@ -381,16 +388,21 @@ def copy_object_back_to_original_location(self, bucket: str, key: str, storage_c else: # Force copy the object into standard in a single operation copy_source = {'Bucket': bucket, 'Key': key} - copy_target = { + copy_args = { 'Bucket': bucket, 'Key': key, 'StorageClass': storage_class, } if version_id: copy_source['VersionId'] = version_id - copy_target['CopySourceVersionId'] = version_id + copy_args['CopySourceVersionId'] = version_id if tags: - copy_target['Tagging'] = tags - response = self.s3.copy_object(CopySource=copy_source, **copy_target) + copy_args['Tagging'] = tags + if self.kms_key_id: + copy_args['ServerSideEncryption'] = 'aws:kms' + copy_args['SSEKMSKeyId'] = self.kms_key_id + response = self.s3.copy_object( + **copy_args, CopySource=copy_source + ) PRINT(f'Response from boto3 copy:\n{response}') PRINT(f'Object {bucket}/{key} copied back to its original location in S3') return response From 
04a5aaea48b05209d2935de7ddf921260cd1016b Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 23 Oct 2023 15:22:14 -0400 Subject: [PATCH 084/101] Update changelog. --- CHANGELOG.rst | 79 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 750ece5ca..67fb24567 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,43 +6,80 @@ dcicutils Change Log ---------- -8.0.0 +8.1.0 ===== -* Update Python to 3.11; and nixed Python 3.7. -* Updated boto3/botocore versions. -* Updatad pyyaml version to ^6.0.1; Mac M1 has issues building 5.4.1 (though 5.3.1 works). - See PyYAML 6.0 change log here: https://github.com/yaml/pyyaml/blob/master/CHANGES - The only incompatible change seems to be that yaml.load now requires a Loader argument; - and searching our GitHub organizations (4dn-dcic, dbmi-bgm, smaht-dac) the only ones which might - be affected are cwltools and parliament2, neither of which are dependent on dcicutils in any way. +* New module ``bundle_utils.py`` that is intended for schema-respecting worksheets ("metadata bundle"). + There are various modular bits of functionality here, but the main entry point here is: -7.14.0 -====== + * ``load_items`` to load data from a given table set, doing certain notational canonicalizations, and + checking that things are in the appropriate format. -* New module ``sheet_utils`` for loading workbooks. +* In ``common.py``, new hint types: - * Important things of interest: + * ``CsvReader`` + * ``JsonSchema`` + * ``Regexp`` - * Class ``ItemManager`` for loading Item-style data - from any ``.xlsx``, ``.csv`` or ``.tsv`` files. +* In ``lang_utils.py``: - * Function ``load_items`` that does the same as ``ItemManager.load``. + * New arguments ``just_are=`` to ``there_are`` get verb conjugation without the details. - * Various lower-level implementation classes such as: + * Add "while" to "which" and "that" as clause handlers in the string pluralizer + (e.g., so that "error while parsing x" pluralizes as "errors while parsing x") - * Classes ``XlsxManager``, ``CsvManager`` and ``TsvManager`` for loading raw data - from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. +* In ``misc_utils.py``, miscellaneous new functionality: - * Classes ``XlsxItemManager``, ``CsvItemManager``, and ``TsvItemManager`` for loading Item-style data - from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. + * New class ``AbstractVirtualApp`` that is either an actual VirtualApp or can be used to make mocks + if the thing being called expects an ``AbstractVirtualApp`` instead of a ``VirtualApp``. -* New functionality in ``misc_utils``: + * New function ``to_snake_case`` that assumes its argument is either a CamelCase string or snake_case string + and returns the snake_case form. * New function ``is_uuid`` (migrated from Fourfront) + * New function ``pad_to`` + * New class ``JsonLinesReader`` +* In ``qa_checkers.py``: + + * Change the ``VERSION_IS_BETA_PATTERN`` to recognize alpha or beta patterns. Probably a rename would be better, + but also incompatible. As far as I know, this is used only to not fuss if you haven't made a changelog entry + for a beta (or now also alpha). + +* New module ``sheet_utils.py`` for loading workbooks in a variety of formats, but without schema interpretation. + + A lot of this is implementation classes for each of the kinds of files, but the main entry point + is intended to be ``load_table_set`` if you are not working with schemas. 
For schema-related support, + see ``bundle_utils.py``. + +* New module ``validation_utils.py`` with these facilities: + + * New class ``SchemaManager`` for managing a set of schemas so that programs asking for a schema by name + only download one time and then use a cache. There are also facilities here for populating a dictionary + with all schemas in a table set (the kind of thing returned by ``load_table_set`` in ``sheet_utils.py``) + in order to pre-process it as a metadata bundle for checking purposes. + + * New functions: + + * ``validate_data_against_schemas`` to validate that table sets (workbooks, or the equivalent) have rows + in each tab conforming to the schema for that tab. + + * ``summary_of_data_validation_errors`` to summarize the errors obtained from ``validate_data_against_schemas``. + + +8.0.0 +===== + +* Update Python to 3.11; and nixed Python 3.7. +* Updated boto3/botocore versions. +* Updatad pyyaml version to ^6.0.1; Mac M1 has issues building 5.4.1 (though 5.3.1 works). + See PyYAML 6.0 change log here: https://github.com/yaml/pyyaml/blob/master/CHANGES + The only incompatible change seems to be that yaml.load now requires a Loader argument; + and searching our GitHub organizations (4dn-dcic, dbmi-bgm, smaht-dac) the only ones which might + be affected are cwltools and parliament2, neither of which are dependent on dcicutils in any way. + 7.13.0 ====== From 68d945977e8abb70ef8bceecdf23217831f423ee Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 23 Oct 2023 15:37:31 -0400 Subject: [PATCH 085/101] Bump alpha version. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 436b65dac..7e1520b69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.0.0.1-alpha.12" # to become "8.1.0" +version = "8.0.0.1-alpha.13" # to become "8.1.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 46a2c0946f4c9ba44fbbd8237a15a32cb77ae47d Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 25 Oct 2023 05:54:37 -0400 Subject: [PATCH 086/101] Begin to address David's problems in C4-1111. 
--- dcicutils/bundle_utils.py | 55 ++++++++++++++++++++++++------ dcicutils/sheet_utils.py | 22 +++++++++++- dcicutils/validation_utils.py | 55 ++++++++++++++++++------------ pyproject.toml | 2 +- test/test_bundle_utils.py | 47 ++++++++++++++++++-------- test/test_validation_utils.py | 63 ++++++++++++++++++++++++++++++++--- 6 files changed, 192 insertions(+), 52 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 5e37e8660..fce23f71b 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -9,9 +9,9 @@ from .sheet_utils import ( LoadTableError, prefer_number, TabbedJsonSchemas, Header, Headers, TabbedHeaders, ParsedHeader, ParsedHeaders, TabbedParsedHeaders, SheetCellValue, TabbedSheetData, - TableSetManagerRegistry, AbstractTableSetManager, InsertsManager, load_table_set, + TableSetManagerRegistry, AbstractTableSetManager, InsertsManager, TableSetManager, load_table_set, ) -from .validation_utils import SchemaManager +from .validation_utils import SchemaManager, validate_data_against_schemas, summary_of_data_validation_errors PatchPrototype = Dict @@ -378,10 +378,19 @@ def load_table_structures(filename: str, *, apply_heuristics: bool = True, class TableChecker(InflatableTabbedDataManager, TypeHintContext): - def __init__(self, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedJsonSchemas] = None, + def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool, + override_schemas: Optional[TabbedJsonSchemas] = None, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None, apply_heuristics: bool = False): + self.flattened = flattened + if not flattened: + # TODO: Need to implement something that depends on this flattened attribute. + # Also, it's possible that we can default this once we see if the new strategy is general-purpose, + # rather than it being a required argument. But for now let's require it be passed. + # -kmp 25-Oct-2023 + raise ValueError("Only flattened=True is supported by TableChecker for now.") + if portal_env is None and portal_vapp is None: portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) # InflatableTabbedDataManager supplies: @@ -394,7 +403,7 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedJ self.portal_env = portal_env self.portal_vapp = portal_vapp self.schema_manager: SchemaManager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, - schemas=schemas) + override_schemas=override_schemas) self.schemas = self.schema_manager.fetch_relevant_schemas(self.tab_names) # , schemas=schemas) self.lookup_tables_by_tab_name: Dict[str, Dict[str, Dict]] = { tab_name: self.build_lookup_table_for_tab(tab_name, rows=rows) @@ -463,6 +472,7 @@ def validate_ref(self, item_type, item_ref): if self.contains_ref(item_type=item_type, item_ref=item_ref): return True try: + # TODO: This probably needs a cache info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}") # Basically return True if there's a value at all, # but still check it's not an error message that didn't get raised. 
@@ -499,10 +509,13 @@ def check_row(self, row: Dict, *, tab_name: str, row_number: int, prototype: Dic return patch_item @classmethod - def check(cls, tabbed_sheet_data: TabbedSheetData, schemas: Optional[TabbedJsonSchemas] = None, + def check(cls, tabbed_sheet_data: TabbedSheetData, *, + flattened: bool, + override_schemas: Optional[TabbedJsonSchemas] = None, apply_heuristics: bool = False, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): - checker = cls(tabbed_sheet_data, schemas=schemas, apply_heuristics=apply_heuristics, + checker = cls(tabbed_sheet_data, flattened=flattened, + override_schemas=override_schemas, apply_heuristics=apply_heuristics, portal_env=portal_env, portal_vapp=portal_vapp) checked = checker.check_tabs() return checked @@ -538,14 +551,34 @@ def create_tab_processor_state(self, tab_name: str) -> SheetState: def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, - schemas: Optional[TabbedJsonSchemas] = None, apply_heuristics: bool = False, + override_schemas: Optional[TabbedJsonSchemas] = None, apply_heuristics: bool = False, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None, - validate: bool = False, **kwargs): - tabbed_rows = load_table_set(filename=filename, tab_name=tab_name, escaping=escaping, prefer_number=False, + # TODO: validate= is presently False (i.e., disabled) by default while being debugged, + # but for production use maybe should not be? -kmp 25-Oct-2023 + validate: bool = False, + **kwargs): + annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping, prefer_number=False, **kwargs) - checked_items = check(tabbed_rows, schemas=schemas, portal_env=portal_env, portal_vapp=portal_vapp, - apply_heuristics=apply_heuristics) + tabbed_rows = annotated_data['content'] + flattened = annotated_data['flattened'] + if flattened: + checked_items = TableChecker.check(tabbed_rows, flattened=flattened, + override_schemas=override_schemas, + portal_env=portal_env, portal_vapp=portal_vapp, + apply_heuristics=apply_heuristics) + else: + # No fancy checking for things like .json, etc. for now. Only check things that came from + # spreadsheet-like data, where structural datatypes are forced into strings. + checked_items = tabbed_rows + if validate: + problems = validate_data_against_schemas(checked_items, portal_env=portal_env, portal_vapp=portal_vapp, + override_schemas=override_schemas) + error_summary = summary_of_data_validation_errors(problems) + if error_summary: + for item in error_summary: + print(item) + raise Exception("Validation problems were seen.") # TODO: Maybe connect validation here. Although another option is to just call validation separately # once this is successfully loaded. Needs thought. However, David's validation_utils can do # the validation if we decide to do it, it would just need to be connected up. 
diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 9242bb765..66ce523fa 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -682,10 +682,30 @@ def load(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[ """ Given a filename and various options """ + annotated_content = cls.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping, **kwargs) + content: TabbedSheetData = annotated_content['content'] + return content + + @classmethod + def load_annotated(cls, filename: str, tab_name: Optional[str] = None, escaping: Optional[bool] = None, + **kwargs) -> Dict: + """ + Given a filename and various options + """ + orig_filename = filename with maybe_unpack(filename) as filename: manager = cls.create_implementation_manager(filename=filename, tab_name=tab_name, escaping=escaping, **kwargs) - return manager.load_content() + content: TabbedSheetData = manager.load_content() + return { + 'filename': filename, + 'content': content, + 'tab_name': tab_name, + 'escaping': escaping, + 'singleton': isinstance(manager, SingleTableMixin), + 'flattened': isinstance(manager, FlattenedTableSetManager), + 'packed': orig_filename != filename, # tar or zip file that had to be unpacked somehow + } load_table_set = TableSetManager.load diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py index 3e8169000..c4da5a856 100644 --- a/dcicutils/validation_utils.py +++ b/dcicutils/validation_utils.py @@ -8,33 +8,39 @@ from .ff_utils import get_schema from .env_utils import EnvUtils, public_env_name from .lang_utils import there_are, maybe_pluralize, disjoined_list -from .misc_utils import AbstractVirtualApp, PRINT +from .misc_utils import AbstractVirtualApp, PRINT, to_snake_case from .sheet_utils import JsonSchema, TabbedJsonSchemas, SheetData, TabbedSheetData from .task_utils import pmap class SchemaManager: - SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. - @classmethod @contextlib.contextmanager def fresh_schema_manager_context_for_testing(cls): - old_schema_cache = cls.SCHEMA_CACHE - try: - cls.SCHEMA_CACHE = {} - yield - finally: - cls.SCHEMA_CACHE = old_schema_cache - - def __init__(self, schemas: Optional[TabbedJsonSchemas] = None, + # TODO: Remove references to this once reimplementation using an instance variable for SCHEMA_CACHE is working. + yield + # old_schema_cache = cls.SCHEMA_CACHE + # try: + # cls.SCHEMA_CACHE = {} + # yield + # finally: + # cls.SCHEMA_CACHE = old_schema_cache + + def __init__(self, *, override_schemas: Optional[TabbedJsonSchemas] = None, portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None): + self.SCHEMA_CACHE = {} # Shared cache. Do not override. Use .clear_schema_cache() to clear it. if portal_env is None and portal_vapp is None: portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) PRINT(f"The portal_env was not explicitly supplied. 
Schemas will come from portal_env={portal_env!r}.") self.portal_env = portal_env self.portal_vapp = portal_vapp - self.schemas = {} if schemas is None else schemas.copy() + self.override_schemas = ( + {} + if override_schemas is None + else {to_snake_case(key): value # important to both canonicalize the case and copy the dict + for key, value in override_schemas.items()} + ) def fetch_relevant_schemas(self, schema_names: List[str]): # , schemas: Optional[TabbedSchemas] = None): # if schemas is None: @@ -51,7 +57,8 @@ def schema_exists(self, schema_name: str): return bool(self.fetch_schema(schema_name=schema_name)) def fetch_schema(self, schema_name: str): - override_schema = self.schemas.get(schema_name) + schema_name = to_snake_case(schema_name) + override_schema = self.override_schemas.get(schema_name) if override_schema is not None: return override_schema schema: Optional[AnyJsonData] = self.SCHEMA_CACHE.get(schema_name) @@ -60,10 +67,12 @@ def fetch_schema(self, schema_name: str): self.SCHEMA_CACHE[schema_name] = schema return schema - @classmethod - def clear_schema_cache(cls): - for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first - cls.SCHEMA_CACHE.pop(key, None) + # Should not be needed given SCHEMA_CACHE is an instance variable. + # + # @classmethod + # def clear_schema_cache(cls): + # for key in list(cls.SCHEMA_CACHE.keys()): # important to get the list of keys as a separate object first + # cls.SCHEMA_CACHE.pop(key, None) def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_name: Optional[str] = None, among: Optional[List[str]] = None): @@ -76,7 +85,7 @@ def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_nam if prop in possible_identifying_properties)) return identifying_properties - @classmethod + @classmethod # This operation doesn't actually use the schemas so is safe as a class method def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_properties) -> AnyJsonData: if not identifying_properties: raise ValueError("No identifying properties were specified.") @@ -89,9 +98,10 @@ def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_proper f' in {json.dumps(data_item)}.') -def validate_data_against_schemas(data: TabbedSheetData, +def validate_data_against_schemas(data: TabbedSheetData, *, + portal_env: Optional[str] = None, portal_vapp: Optional[AbstractVirtualApp] = None, - schemas: Optional[TabbedJsonSchemas] = None) -> Optional[Dict]: + override_schemas: Optional[TabbedJsonSchemas] = None) -> Optional[Dict]: """ Validates the given data against the corresponding schema(s). The given data is assumed to be in a format as returned by sheet_utils, i.e. a dictionary of lists of objects where each @@ -148,7 +158,7 @@ def validate_data_against_schemas(data: TabbedSheetData, the given data, which can be useful in identifying the object in the source data if it is unidentified. 
""" - schema_manager = SchemaManager(portal_vapp=portal_vapp, schemas=schemas) + schema_manager = SchemaManager(portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas) errors = [] schemas = schema_manager.fetch_relevant_schemas(list(data.keys())) @@ -156,7 +166,8 @@ def validate_data_against_schemas(data: TabbedSheetData, for data_type in data: schema = schemas.get(data_type) if not schema: - errors.append({"error": f"No schema found for: {data_type}"}) + if schema is None: # if Schema is {}, we're deliberately suppressing schema checking (not an error) + errors.append({"error": f"No schema found for: {data_type}"}) continue data_errors = validate_data_items_against_schemas(data[data_type], data_type, schema) errors.extend(data_errors) diff --git a/pyproject.toml b/pyproject.toml index 7e1520b69..0d661cd71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.0.0.1-alpha.13" # to become "8.1.0" +version = "8.0.0.1-alpha.14" # to become "8.1.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index ce716878f..8b5777404 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -218,10 +218,19 @@ def test_load_table_structures(): assert str(exc.value) == "Unknown file type: something.else" -def test_load_items(): +@contextlib.contextmanager +def no_schemas(): with mock.patch.object(validation_utils_module, "get_schema") as mock_get_schema: mock_get_schema.return_value = {} + yield + + +def test_load_items(): + + # with mock.patch.object(validation_utils_module, "get_schema") as mock_get_schema: + # mock_get_schema.return_value = {} + with no_schemas(): assert load_items(SAMPLE_XLSX_FILE, apply_heuristics=True) == SAMPLE_XLSX_FILE_ITEM_CONTENT assert load_items(SAMPLE_CSV_FILE, apply_heuristics=True) == SAMPLE_CSV_FILE_ITEM_CONTENT @@ -385,29 +394,35 @@ def test_load_items_with_schema(): print("Case 2") file_base_name = os.path.splitext(os.path.basename(SAMPLE_CSV_FILE2))[0] expected = SAMPLE_CSV_FILE2_ITEM_CONTENT - actual = load_items(SAMPLE_CSV_FILE2, schemas={file_base_name: {}}, apply_heuristics=True) + actual = load_items(SAMPLE_CSV_FILE2, override_schemas={file_base_name: {}}, apply_heuristics=True) assert actual == expected print("Case 3") expected = SAMPLE_CSV_FILE2_PERSON_CONTENT_HINTED - actual = load_items(SAMPLE_CSV_FILE2, schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') + actual = load_items(SAMPLE_CSV_FILE2, override_schemas=SAMPLE_CSV_FILE2_SCHEMAS, tab_name='Person') assert actual == expected def test_sample_items_csv_vs_json(): - csv_content = load_items(SAMPLE_CSV_FILE2, tab_name='Person', schemas=SAMPLE_CSV_FILE2_SCHEMAS) + csv_content = load_items(SAMPLE_CSV_FILE2, tab_name='Person', override_schemas=SAMPLE_CSV_FILE2_SCHEMAS) - json_content = load_items(SAMPLE_JSON_FILE2, tab_name="Person", schemas=SAMPLE_CSV_FILE2_SCHEMAS) + json_content = load_items(SAMPLE_JSON_FILE2, tab_name="Person", override_schemas=SAMPLE_CSV_FILE2_SCHEMAS) assert csv_content == json_content def test_sample_items_json_vs_yaml(): - tabs_data_from_json = load_items(SAMPLE_JSON_TABS_FILE) - tabs_data_from_yaml = load_items(SAMPLE_YAML_TABS_FILE) - assert tabs_data_from_json == tabs_data_from_yaml + with SchemaManager.fresh_schema_manager_context_for_testing(): + + # with mock.patch.object(validation_utils_module, "get_schema") as 
mock_get_schema: + # mock_get_schema.return_value = {} # no schema checking + with no_schemas(): + + tabs_data_from_json = load_items(SAMPLE_JSON_TABS_FILE) + tabs_data_from_yaml = load_items(SAMPLE_YAML_TABS_FILE) + assert tabs_data_from_json == tabs_data_from_yaml @using_fresh_ff_state_for_testing() @@ -421,7 +436,7 @@ def test_schema_autoload_mixin_caching(portal_env): assert schema_manager.portal_env == 'data' # it should have defaulted even if we didn't supply it - assert SchemaManager.SCHEMA_CACHE == {} + assert schema_manager.SCHEMA_CACHE == {} sample_schema_name = 'foo' sample_schema = {'mock_schema_for': 'foo'} @@ -431,7 +446,7 @@ def test_schema_autoload_mixin_caching(portal_env): assert schema_manager.fetch_schema(sample_schema_name) == sample_schema schema_cache_with_sample_schema = {sample_schema_name: sample_schema} - assert SchemaManager.SCHEMA_CACHE == schema_cache_with_sample_schema + assert schema_manager.SCHEMA_CACHE == schema_cache_with_sample_schema @using_fresh_ff_state_for_testing() @@ -639,7 +654,9 @@ def test_table_checker(): with printed_output() as printed: with pytest.raises(Exception) as exc: - checker = TableChecker(SAMPLE_WORKBOOK_WITH_UNMATCHED_UUID_REFS, portal_env=mock_ff_env) + checker = TableChecker(SAMPLE_WORKBOOK_WITH_UNMATCHED_UUID_REFS, + flattened=True, + portal_env=mock_ff_env) checker.check_tabs() assert str(exc.value) == "There were 2 problems while compiling hints." assert printed.lines == [ @@ -648,8 +665,12 @@ def test_table_checker(): f" {SAMPLE_INSTITUTION_UUID!r}") ] - checker = TableChecker(SAMPLE_WORKBOOK_WITH_MATCHED_UUID_REFS, portal_env=mock_ff_env) + checker = TableChecker(SAMPLE_WORKBOOK_WITH_MATCHED_UUID_REFS, + flattened=True, + portal_env=mock_ff_env) checker.check_tabs() - checker = TableChecker(SAMPLE_WORKBOOK_WITH_NAME_REFS, portal_env=mock_ff_env) + checker = TableChecker(SAMPLE_WORKBOOK_WITH_NAME_REFS, + flattened=True, + portal_env=mock_ff_env) checker.check_tabs() diff --git a/test/test_validation_utils.py b/test/test_validation_utils.py index ce1730d1e..d0039a957 100644 --- a/test/test_validation_utils.py +++ b/test/test_validation_utils.py @@ -4,13 +4,68 @@ import re from dcicutils.bundle_utils import inflate -from dcicutils.misc_utils import AbstractVirtualApp, NamedObject, json_file_contents, to_snake_case -from dcicutils.qa_utils import MockResponse +from dcicutils.misc_utils import AbstractVirtualApp, NamedObject, json_file_contents, to_snake_case, to_camel_case +from dcicutils.qa_utils import MockResponse, printed_output from dcicutils.validation_utils import SchemaManager, validate_data_against_schemas, summary_of_data_validation_errors from .conftest_settings import TEST_DIR from .helpers_for_bundles import SAMPLE_WORKBOOK_WITH_NAME_REFS +def test_schema_manager_simple(): + + print() # start on a fresh line + + with printed_output() as printed: + + schema_manager_1 = SchemaManager() + + assert printed.lines == [ + "The portal_env was not explicitly supplied. Schemas will come from portal_env='data'." 
+ ] + + assert schema_manager_1.SCHEMA_CACHE == {} + + # Test that schema-lookup works, since that's kinda what these are about + user_schema = schema_manager_1.fetch_schema('user') + assert isinstance(user_schema, dict) + assert user_schema.get('title') == 'User' + + assert schema_manager_1.override_schemas == {} + + +@pytest.mark.parametrize('schema_id', ['user', 'User']) +def test_schema_manager_with_schemas(schema_id): + + print() # start on a fresh line + + with printed_output() as printed: + + schemas = {schema_id: {}} + snake_id = to_snake_case(schema_id) + camel_id = to_camel_case(schema_id) + + # Just to make sure to_snake_case and to_camel_case aren't doing something weird + assert schema_id == snake_id or schema_id == camel_id + + schema_manager_2 = SchemaManager(override_schemas=schemas) + + # whether 'User' or 'user' was an input, it will be canonicalized to snake case + assert schema_manager_2.override_schemas == {snake_id: {}} + + assert printed.lines == [ + "The portal_env was not explicitly supplied. Schemas will come from portal_env='data'." + ] + + assert schema_manager_2.fetch_schema(snake_id) == {} + assert schema_manager_2.fetch_schema(camel_id) == {} + + # even after using a camel case id, only the snake_id will be in the table + assert schema_manager_2.override_schemas == {snake_id: {}} + + # this would only get updated if we fetched something remotely + assert schema_manager_2.SCHEMA_CACHE == {} + + def test_schema_manager_identifying_value(): with pytest.raises(ValueError) as exc: @@ -54,14 +109,14 @@ def get(cls, path_url): good_workbook = inflate(SAMPLE_WORKBOOK_WITH_NAME_REFS) - assert validate_data_against_schemas(good_workbook, portal_vapp) is None + assert validate_data_against_schemas(good_workbook, portal_vapp=portal_vapp) is None bogus_workbook = copy.deepcopy(good_workbook) # modified immediately below user_items = bogus_workbook['User'] user_item0 = user_items[0] user_item0['bogus'] = 'item' - assert validate_data_against_schemas(bogus_workbook, portal_vapp) == { + assert validate_data_against_schemas(bogus_workbook, portal_vapp=portal_vapp) == { 'errors': [ { 'extraneous_properties': ['bogus'], From 263ce0a5e09d1fa7fc5f7d04523c2bfcaa5405c4 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 25 Oct 2023 06:03:41 -0400 Subject: [PATCH 087/101] PEP8 --- dcicutils/bundle_utils.py | 4 ++-- dcicutils/validation_utils.py | 25 ++++++++++++++++--------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index fce23f71b..22c014131 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -557,8 +557,8 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional # but for production use maybe should not be? 
-kmp 25-Oct-2023 validate: bool = False, **kwargs): - annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping, prefer_number=False, - **kwargs) + annotated_data = TableSetManager.load_annotated(filename=filename, tab_name=tab_name, escaping=escaping, + prefer_number=False, **kwargs) tabbed_rows = annotated_data['content'] flattened = annotated_data['flattened'] if flattened: diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py index c4da5a856..9b42069a0 100644 --- a/dcicutils/validation_utils.py +++ b/dcicutils/validation_utils.py @@ -67,7 +67,7 @@ def fetch_schema(self, schema_name: str): self.SCHEMA_CACHE[schema_name] = schema return schema - # Should not be needed given SCHEMA_CACHE is an instance variable. + # Should not be needed, given that SCHEMA_CACHE is an instance variable. # # @classmethod # def clear_schema_cache(cls): @@ -238,9 +238,9 @@ def extract_single_quoted_strings(message: str) -> List[str]: def summary_of_data_validation_errors(data_validation_errors: Dict, # These next three items are available from a portal's SubmissionFolio - data_file_name: str, - s3_data_file_location: str, - s3_details_location: str) -> List[str]: + data_file_name: Optional[str] = None, + s3_data_file_location: Optional[str] = None, + s3_details_location: Optional[str] = None) -> List[str]: """ Summarize the given data validation errors into a simple short list of English phrases; this will end up going into the additional_properties of the IngestionSubmission object @@ -269,14 +269,21 @@ def summary_of_data_validation_errors(data_validation_errors: Dict, if error.get("exception"): exception_count += 1 - return [ - f"Ingestion data validation error summary:", - f"Data file: {data_file_name}", - f"Data file in S3: {s3_data_file_location}", + result = [ + f"Ingestion data validation error summary:" + ] + if data_file_name: + result.append(f"Data file: {data_file_name}") + if s3_data_file_location: + result.append(f"Data file in S3: {s3_data_file_location}") + result = result + [ f"Items unidentified: {unidentified_count}", f"Items missing properties: {missing_properties_count}", f"Items with extraneous properties: {extraneous_properties_count}", f"Other errors: {unclassified_error_count}", f"Exceptions: {exception_count}", - f"Details: {s3_details_location}" ] + if s3_details_location: + result.append(f"Details: {s3_details_location}") + + return result From 60b5ef13aa901b104d2272591e167fdfcaa202fd Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 25 Oct 2023 10:16:36 -0400 Subject: [PATCH 088/101] Correct a testing problem (hopefully). 
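The tests being decorated construct SchemaManager() with no explicit portal_env or portal_vapp,
so the default is resolved from the ambient EnvUtils state (the assertions expect
portal_env='data' to be printed).  Presumably the fresh-state decorator is what keeps that
ambient state predictable when other tests have already run, i.e. the pattern is:

    from .helpers import using_fresh_ff_state_for_testing
    from dcicutils.validation_utils import SchemaManager

    @using_fresh_ff_state_for_testing()
    def test_schema_manager_simple():
        schema_manager = SchemaManager()   # prints the "portal_env was not explicitly supplied" note
        ...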
--- test/test_validation_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_validation_utils.py b/test/test_validation_utils.py index d0039a957..40878a222 100644 --- a/test/test_validation_utils.py +++ b/test/test_validation_utils.py @@ -4,6 +4,7 @@ import re from dcicutils.bundle_utils import inflate +from .helpers import using_fresh_ff_state_for_testing from dcicutils.misc_utils import AbstractVirtualApp, NamedObject, json_file_contents, to_snake_case, to_camel_case from dcicutils.qa_utils import MockResponse, printed_output from dcicutils.validation_utils import SchemaManager, validate_data_against_schemas, summary_of_data_validation_errors @@ -11,6 +12,7 @@ from .helpers_for_bundles import SAMPLE_WORKBOOK_WITH_NAME_REFS +@using_fresh_ff_state_for_testing() def test_schema_manager_simple(): print() # start on a fresh line @@ -33,6 +35,7 @@ def test_schema_manager_simple(): assert schema_manager_1.override_schemas == {} +@using_fresh_ff_state_for_testing() @pytest.mark.parametrize('schema_id', ['user', 'User']) def test_schema_manager_with_schemas(schema_id): @@ -66,6 +69,7 @@ def test_schema_manager_with_schemas(schema_id): assert schema_manager_2.SCHEMA_CACHE == {} +@using_fresh_ff_state_for_testing() def test_schema_manager_identifying_value(): with pytest.raises(ValueError) as exc: From d9fb9f686f950bf9f2dfdbed944602f243b25dfb Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 25 Oct 2023 13:59:50 -0400 Subject: [PATCH 089/101] Stub in support for checking non-flattened files. --- dcicutils/bundle_utils.py | 104 +++++++++++++++++++++++++++++++++----- pyproject.toml | 2 +- test/test_bundle_utils.py | 64 ++++++++++++++++++++++- 3 files changed, 154 insertions(+), 16 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 22c014131..2914e157d 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -1,11 +1,11 @@ import copy -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union from .common import AnyJsonData from .env_utils import EnvUtils, public_env_name from .ff_utils import get_metadata from .lang_utils import there_are -from .misc_utils import AbstractVirtualApp, ignored, PRINT, to_camel_case +from .misc_utils import AbstractVirtualApp, ignored, ignorable, PRINT, to_camel_case from .sheet_utils import ( LoadTableError, prefer_number, TabbedJsonSchemas, Header, Headers, TabbedHeaders, ParsedHeader, ParsedHeaders, TabbedParsedHeaders, SheetCellValue, TabbedSheetData, @@ -75,7 +75,9 @@ class NumHint(TypeHint): PREFERENCE_MAP = {'number': 'num', 'integer': 'int', 'float': 'float'} - def __init__(self, declared_type): + def __init__(self, declared_type: Optional[str] = None): + if declared_type is None: + declared_type = 'num' self.preferred_type = self.PREFERENCE_MAP.get(declared_type) def apply_hint(self, value): @@ -128,7 +130,51 @@ def apply_hint(self, value): return value -OptionalTypeHints = List[Optional[TypeHint]] +class OptionalTypeHints: + + def __init__(self, positional_hints: Optional[List[Optional[TypeHint]]] = None, + positional_breadcrumbs: Optional[List[Union[List, Tuple]]] = None): + self.other_hints: Dict[Any, TypeHint] = {} + self.positional_hints: List[Optional[TypeHint]] = [] if positional_hints is None else positional_hints + if positional_breadcrumbs and positional_hints: + n = len(positional_breadcrumbs) + if n != len(positional_hints): + raise Exception("positional_hints and positional_breadcrumbs must have the same length.") + for 
i in range(n): + # for convenience, we accept this as a list or tuple, but it must be a tuple to be a key + breadcrumbs = tuple(positional_breadcrumbs[i]) + if not isinstance(breadcrumbs, tuple): + raise Exception(f"Each of the positional breadcrumbs must be a tuple: {breadcrumbs}") + hint = positional_hints[i] + self.other_hints[breadcrumbs] = hint + + def __getitem__(self, key: Any) -> Optional[TypeHint]: + """ + For enumerated positional information, we consult our initial type vector. + For other situations, we do a general lookup of the hint in our lookup table. + """ + if isinstance(key, int): + hints = self.positional_hints + if key < 0: + raise ValueError(f"Negative hint positions are not allowed: {key}") + elif key >= len(hints): + return None + else: + return hints[key] + elif isinstance(key, tuple): # a parsed header (or schema breadcrumbs) + return self.other_hints.get(key) + else: + raise ValueError(f"Key of unexpected type for OptionalTypeHints: {key}") + + def __setitem__(self, key: Any, value: TypeHint): + if isinstance(key, int): + raise ValueError(f"Cannot assign OptionalTypeHints by position after initial creation: {key!r}") + elif key in self.other_hints: + raise ValueError(f"Attempt to redefine OptionalTypeHint key {key!r}.") + elif isinstance(key, tuple): + self.other_hints[key] = value + else: + raise ValueError(f"Attempt to set an OptionalTypeHints key to other than a breadcrumbs tuple: {key!r}") class AbstractStructureManager(AbstractTableSetManager): @@ -384,12 +430,12 @@ def __init__(self, tabbed_sheet_data: TabbedSheetData, *, flattened: bool, apply_heuristics: bool = False): self.flattened = flattened - if not flattened: - # TODO: Need to implement something that depends on this flattened attribute. - # Also, it's possible that we can default this once we see if the new strategy is general-purpose, - # rather than it being a required argument. But for now let's require it be passed. - # -kmp 25-Oct-2023 - raise ValueError("Only flattened=True is supported by TableChecker for now.") + # if not flattened: + # # TODO: Need to implement something that depends on this flattened attribute. + # # Also, it's possible that we can default this once we see if the new strategy is general-purpose, + # # rather than it being a required argument. But for now let's require it be passed. + # # -kmp 25-Oct-2023 + # raise ValueError("Only flattened=True is supported by TableChecker for now.") if portal_env is None and portal_vapp is None: portal_env = public_env_name(EnvUtils.PRD_ENV_NAME) @@ -494,6 +540,37 @@ def check_tab(self, tab_name: str): def check_row(self, row: Dict, *, tab_name: str, row_number: int, prototype: Dict, parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints): + if self.flattened: + return self.check_flattened_row(row=row, tab_name=tab_name, row_number=row_number, prototype=prototype, + parsed_headers=parsed_headers, type_hints=type_hints) + else: + return self.check_inflated_row(row=row, tab_name=tab_name, row_number=row_number, prototype=prototype, + parsed_headers=parsed_headers, type_hints=type_hints) + + def check_inflated_row(self, row: Dict, *, tab_name: str, row_number: int, prototype: Dict, + parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints): + ignorable(self, tab_name, row_number, prototype, parsed_headers, type_hints) # + # TODO: Make this work... 
+ # def traverse(item, *, subschema, breadcrumbs): + # if isinstance(item, list): + # # check schema here to make sure it's supposed to be a list before proceeding + # for i, elem in enumerate(item): + # traverse(item, subschema=..., breadcrumbs=(*breadcrumbs, i)) + # elif isinstance(item, dict): + # # check schema here to make sure it's supposed to be a dict before proceeding + # for k, v in item.items(): + # traverse(v, subschema=..., breadcrumbs=(*breadcrumbs, k)) + # else: + # # look up hint. if there's not a hint for these breadcrumbs, make one + # # apply the hint for side-effect, to get an error if we have a bad value + # pass + # schema = self.schemas[tab_name] + # if schema: + # traverse(row, subschema=schema, breadcrumbs=()) # for side-effect + return row + + def check_flattened_row(self, row: Dict, *, tab_name: str, row_number: int, prototype: Dict, + parsed_headers: ParsedHeaders, type_hints: OptionalTypeHints): patch_item = copy.deepcopy(prototype) for column_number, column_value in enumerate(row.values()): parsed_value = ItemTools.parse_item_value(column_value, apply_heuristics=self.apply_heuristics) @@ -532,8 +609,9 @@ def compile_type_hints(self, tab_name: str) -> OptionalTypeHints: for required_header in self._schema_required_headers(schema): if required_header not in parsed_headers: self.note_problem("Missing required header") - type_hints = [ItemTools.find_type_hint(parsed_header, schema, context=self) if schema else None - for parsed_header in parsed_headers] + positional_type_hints = [ItemTools.find_type_hint(parsed_header, schema, context=self) if schema else None + for parsed_header in parsed_headers] + type_hints = OptionalTypeHints(positional_type_hints, positional_breadcrumbs=parsed_headers) return type_hints @classmethod @@ -577,7 +655,7 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional error_summary = summary_of_data_validation_errors(problems) if error_summary: for item in error_summary: - print(item) + PRINT(item) raise Exception("Validation problems were seen.") # TODO: Maybe connect validation here. Although another option is to just call validation separately # once this is successfully loaded. Needs thought. However, David's validation_utils can do diff --git a/pyproject.toml b/pyproject.toml index 0d661cd71..6a75f6c01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.0.0.1-alpha.14" # to become "8.1.0" +version = "8.0.0.1-alpha.15" # to become "8.1.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index 8b5777404..a95aee1ef 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -14,8 +14,8 @@ # High-level interfaces load_table_structures, load_items, # Low-level implementation - SchemaManager, ItemTools, TableChecker, - BoolHint, + SchemaManager, ItemTools, TableChecker, OptionalTypeHints, + BoolHint, NumHint, # Probably we should test NumHint, TypeHint, EnumHint, RefHint, etc. as well. 
-kmp 23-Oct-2023 ) from dcicutils.common import AnyJsonData @@ -49,6 +49,66 @@ ) +def test_optional_type_hints(): + + x = OptionalTypeHints() + assert x.positional_hints == [] + assert x.other_hints == {} + assert x[0] is None + assert x[100] is None + with pytest.raises(ValueError) as exc: + print(x[-1]) + assert str(exc.value) == "Negative hint positions are not allowed: -1" + + bh = BoolHint() + nh = NumHint() + ih = NumHint(declared_type='int') + + x = OptionalTypeHints([bh, nh]) + assert x.positional_hints == [bh, nh] + assert x.other_hints == {} + assert x[0] is bh + assert x[1] is nh + assert x[2] is None + + x = OptionalTypeHints([bh, nh], positional_breadcrumbs=[('foo', 'x'), ('foo', 'y')]) + assert x.positional_hints == [bh, nh] + assert x.other_hints == { + ('foo', 'x'): bh, + ('foo', 'y'): nh, + } + assert x[0] is bh + assert x[1] is nh + assert x[2] is None + assert x[('something',)] is None + assert x[('foo', 'x')] is bh + assert x[('foo', 'y')] is nh + assert x[('foo', 'z')] is None + + with pytest.raises(ValueError) as exc: + x[2] = bh + assert str(exc.value) == "Cannot assign OptionalTypeHints by position after initial creation: 2" + assert x.positional_hints == [bh, nh] + + with pytest.raises(ValueError) as exc: + x['something'] = bh + assert str(exc.value) == "Attempt to set an OptionalTypeHints key to other than a breadcrumbs tuple: 'something'" + assert x.positional_hints == [bh, nh] + + x[('something',)] = ih + assert x.positional_hints == [bh, nh] + assert x.other_hints == { + ('foo', 'x'): bh, + ('foo', 'y'): nh, + ('something',): ih, + } + assert x[('something',)] == ih + + with pytest.raises(ValueError) as exc: + x[('something',)] = ih + assert str(exc.value) == "Attempt to redefine OptionalTypeHint key ('something',)." + + def test_item_tools_parse_sheet_header(): assert ItemTools.parse_sheet_header('.a') == ['a'] assert ItemTools.parse_sheet_header('a') == ['a'] From 748cde83aefe91380d8133a6e44905d373f8725f Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 25 Oct 2023 15:37:46 -0400 Subject: [PATCH 090/101] Refactor to make an extra entry point for type hinting. 
--- dcicutils/bundle_utils.py | 47 ++++++++++++++++++++++----------------- test/test_bundle_utils.py | 18 +++++++-------- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 2914e157d..2c6304e8b 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -316,9 +316,25 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any cls.set_path_value(datum[key], more_path, value) @classmethod - def find_type_hint(cls, parsed_header: Optional[ParsedHeader], schema: Any, - context: Optional[TypeHintContext] = None): + def find_type_hint_for_subschema(cls, subschema: Any, context: Optional[TypeHintContext] = None): + if subschema is not None: + t = subschema.get('type') + if t == 'string': + enum = subschema.get('enum') + if enum: + mapping = {e.lower(): e for e in enum} + return EnumHint(mapping) + link_to = subschema.get('linkTo') + if link_to and context.schema_exists(link_to): + return RefHint(schema_name=link_to, context=context) + elif t in ('integer', 'float', 'number'): + return NumHint(declared_type=t) + elif t == 'boolean': + return BoolHint() + @classmethod + def find_type_hint_for_parsed_header(cls, parsed_header: Optional[ParsedHeader], schema: Any, + context: Optional[TypeHintContext] = None): def finder(subheader, subschema): if not parsed_header: return None @@ -326,28 +342,15 @@ def finder(subheader, subschema): [key1, *other_headers] = subheader if isinstance(key1, str) and isinstance(subschema, dict): if subschema.get('type') == 'object': - def1 = subschema.get('properties', {}).get(key1) + subsubschema = subschema.get('properties', {}).get(key1) if not other_headers: - if def1 is not None: - t = def1.get('type') - if t == 'string': - enum = def1.get('enum') - if enum: - mapping = {e.lower(): e for e in enum} - return EnumHint(mapping) - link_to = def1.get('linkTo') - if link_to and context.schema_exists(link_to): - return RefHint(schema_name=link_to, context=context) - elif t in ('integer', 'float', 'number'): - return NumHint(declared_type=t) - elif t == 'boolean': - return BoolHint() - else: - pass # fall through to asking super() + hint = cls.find_type_hint_for_subschema(subsubschema, context=context) + if hint: + return hint else: pass # fall through to asking super() else: - return finder(subheader=other_headers, subschema=def1) + return finder(subheader=other_headers, subschema=subsubschema) return finder(subheader=parsed_header, subschema=schema) @@ -609,7 +612,9 @@ def compile_type_hints(self, tab_name: str) -> OptionalTypeHints: for required_header in self._schema_required_headers(schema): if required_header not in parsed_headers: self.note_problem("Missing required header") - positional_type_hints = [ItemTools.find_type_hint(parsed_header, schema, context=self) if schema else None + positional_type_hints = [(ItemTools.find_type_hint_for_parsed_header(parsed_header, schema, context=self) + if schema + else None) for parsed_header in parsed_headers] type_hints = OptionalTypeHints(positional_type_hints, positional_breadcrumbs=parsed_headers) return type_hints diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index a95aee1ef..b5d837339 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -219,13 +219,13 @@ def test_item_tools_set_path_value(): def test_item_tools_find_type_hint(): - assert ItemTools.find_type_hint(None, 'anything') is None + assert ItemTools.find_type_hint_for_parsed_header(None, 'anything') is None - assert 
ItemTools.find_type_hint(['foo', 'bar'], None) is None - assert ItemTools.find_type_hint(['foo', 'bar'], "something") is None - assert ItemTools.find_type_hint(['foo', 'bar'], {}) is None + assert ItemTools.find_type_hint_for_parsed_header(['foo', 'bar'], None) is None + assert ItemTools.find_type_hint_for_parsed_header(['foo', 'bar'], "something") is None + assert ItemTools.find_type_hint_for_parsed_header(['foo', 'bar'], {}) is None - actual = ItemTools.find_type_hint(['foo', 'bar'], {"type": "object"}) + actual = ItemTools.find_type_hint_for_parsed_header(['foo', 'bar'], {"type": "object"}) assert actual is None schema = { @@ -236,10 +236,10 @@ def test_item_tools_find_type_hint(): } } } - actual = ItemTools.find_type_hint(['foo', 'bar'], schema) + actual = ItemTools.find_type_hint_for_parsed_header(['foo', 'bar'], schema) assert actual is None - actual = ItemTools.find_type_hint(['foo'], schema) + actual = ItemTools.find_type_hint_for_parsed_header(['foo'], schema) assert isinstance(actual, BoolHint) schema = { @@ -255,10 +255,10 @@ def test_item_tools_find_type_hint(): } } } - actual = ItemTools.find_type_hint(['foo', 'bar'], schema) + actual = ItemTools.find_type_hint_for_parsed_header(['foo', 'bar'], schema) assert isinstance(actual, BoolHint) - actual = ItemTools.find_type_hint(['foo'], schema) + actual = ItemTools.find_type_hint_for_parsed_header(['foo'], schema) assert actual is None From 61b81556c2830ca93138dd5bbc5a9d2359715df2 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Mon, 30 Oct 2023 03:08:23 -0400 Subject: [PATCH 091/101] Support non-string elements of the sequences given to lang_utils.conjoined_list and .disjoined_list. --- CHANGELOG.rst | 3 +++ dcicutils/lang_utils.py | 5 +++++ pyproject.toml | 2 +- test/test_lang_utils.py | 6 ++++++ 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 67fb24567..248887343 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -28,6 +28,9 @@ Change Log * Add "while" to "which" and "that" as clause handlers in the string pluralizer (e.g., so that "error while parsing x" pluralizes as "errors while parsing x") + * ``conjoin_list`` and ``disjoin_list`` now call ``str`` on their sequence elements so that things like + ``conjoined_list([2, 3, 4])`` are possible. + * In ``misc_utils.py``, miscellaneous new functionality: * New class ``AbstractVirtualApp`` that is either an actual VirtualApp or can be used to make mocks diff --git a/dcicutils/lang_utils.py b/dcicutils/lang_utils.py index de020f265..a87a7c2e6 100644 --- a/dcicutils/lang_utils.py +++ b/dcicutils/lang_utils.py @@ -521,6 +521,10 @@ def disjoined_list(cls, items, conjunction: str = 'or', comma: Union[bool, str] return cls.conjoined_list(items, conjunction=conjunction, comma=comma, oxford_comma=oxford_comma, whitespace=whitespace, nothing=nothing) + @classmethod + def _item_strings(cls, items): + return [str(x) for x in (sorted(items) if isinstance(items, set) else items)] + @classmethod def conjoined_list(cls, items, conjunction: str = 'and', comma: Union[bool, str] = ",", oxford_comma: Union[bool, str] = False, whitespace: str = " ", @@ -554,6 +558,7 @@ def conjoined_list(cls, items, conjunction: str = 'and', comma: Union[bool, str] :param nothing: a string to use if there are no items, to avoid an error being raised. """ + items = cls._item_strings(items) assert isinstance(conjunction, str), "The 'conjunction' argument must a string or boolean." 
conj = conjunction + whitespace diff --git a/pyproject.toml b/pyproject.toml index 6a75f6c01..542ba3e88 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.0.0.1-alpha.15" # to become "8.1.0" +version = "8.0.0.1-alpha.16" # to become "8.1.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_lang_utils.py b/test/test_lang_utils.py index 7d7694361..bd480b887 100644 --- a/test/test_lang_utils.py +++ b/test/test_lang_utils.py @@ -438,6 +438,9 @@ def test_conjoined_list(): # For a set, we use the elements sorted. assert conjoined_list({'apple', 'lemon', 'grape'}) == 'apple, grape and lemon' + # We allow non-numbers, which will be converted to strings with str. + assert conjoined_list([2,3,4]) == "2, 3 and 4" + # For dictionary, we use the keys in the order they occur. assert conjoined_list({'apple': 'delicious', 'lemon': 'meyer', 'grape': 'seedless'}) == 'apple, lemon and grape' @@ -460,6 +463,9 @@ def test_disjoined_list(): # For dictionary, we use the keys in the order they occur. assert disjoined_list({'apple': 'delicious', 'lemon': 'meyer', 'grape': 'seedless'}) == 'apple, lemon or grape' + # We allow non-numbers, which will be converted to strings with str. + assert disjoined_list([2,3,4]) == "2, 3 or 4" + def test_there_are(): From d9108532cbc879652dc325983e4d6f929f775f6d Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 31 Oct 2023 13:11:54 -0400 Subject: [PATCH 092/101] PEP8 --- test/test_lang_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_lang_utils.py b/test/test_lang_utils.py index bd480b887..a4c41deb0 100644 --- a/test/test_lang_utils.py +++ b/test/test_lang_utils.py @@ -439,7 +439,7 @@ def test_conjoined_list(): assert conjoined_list({'apple', 'lemon', 'grape'}) == 'apple, grape and lemon' # We allow non-numbers, which will be converted to strings with str. - assert conjoined_list([2,3,4]) == "2, 3 and 4" + assert conjoined_list([2, 3, 4]) == "2, 3 and 4" # For dictionary, we use the keys in the order they occur. assert conjoined_list({'apple': 'delicious', 'lemon': 'meyer', 'grape': 'seedless'}) == 'apple, lemon and grape' @@ -464,7 +464,7 @@ def test_disjoined_list(): assert disjoined_list({'apple': 'delicious', 'lemon': 'meyer', 'grape': 'seedless'}) == 'apple, lemon or grape' # We allow non-numbers, which will be converted to strings with str. - assert disjoined_list([2,3,4]) == "2, 3 or 4" + assert disjoined_list([2, 3, 4]) == "2, 3 or 4" def test_there_are(): From dfc93e3610c9cae9244586c9ebff22d7d7dfd828 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 31 Oct 2023 13:26:26 -0400 Subject: [PATCH 093/101] De-beta as 8.1.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 542ba3e88..091a071a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.0.0.1-alpha.16" # to become "8.1.0" +version = "8.1.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From e9505e0277997f31c611ac27ea3c0f3d6cd1d6f8 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 2 Nov 2023 10:27:05 -0400 Subject: [PATCH 094/101] Misc changes related to SMaHT ingestion. 
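In rough outline (the vapp, item id and filename below are illustrative):

    from dcicutils.ff_utils import get_metadata
    from dcicutils.bundle_utils import load_items

    portal_vapp = ...   # a VirtualApp (or AbstractVirtualApp) for the target portal

    item = get_metadata('some-item-uuid', vapp=portal_vapp)    # routed through the vapp,
                                                               # following a 30x redirect if any
    items, problems = load_items('bundle.xlsx', portal_vapp=portal_vapp,
                                 validate=True)                # now returns data plus any problems

SchemaManager.get_identifying_properties(schema) also treats an 'identifier' property as
identifying even when the schema's identifyingProperties list omits it.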
--- CHANGELOG.rst | 8 ++++++++ dcicutils/bundle_utils.py | 4 +++- dcicutils/ff_utils.py | 14 +++++++++++++- dcicutils/validation_utils.py | 16 +++++++++++++--- pyproject.toml | 2 +- 5 files changed, 38 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 248887343..c50a6d38a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,14 @@ dcicutils Change Log ---------- +8.2.0 +===== +* 2023-11-02 +* Added ``SchemaManager.get_identifying_properties`` in ``bundle_utils`` + which implicitly adds ``identifier`` to ``identifyingProperties``. +* Added support for ``portal_vapp`` to to `ff_utils.get_metadata``. + + 8.1.0 ===== diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 2c6304e8b..462e75387 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -522,7 +522,8 @@ def validate_ref(self, item_type, item_ref): return True try: # TODO: This probably needs a cache - info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}") + info = get_metadata(f"/{to_camel_case(item_type)}/{item_ref}", + ff_env=self.portal_env, vapp=self.portal_vapp) # Basically return True if there's a value at all, # but still check it's not an error message that didn't get raised. return isinstance(info, dict) and 'uuid' in info @@ -657,6 +658,7 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional if validate: problems = validate_data_against_schemas(checked_items, portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas) + return checked_items, problems error_summary = summary_of_data_validation_errors(problems) if error_summary: for item in error_summary: diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index 6f011bea4..626551d52 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -277,7 +277,8 @@ def _sls(val): return val.lstrip('/') -def get_metadata(obj_id, key=None, ff_env=None, check_queue=False, add_on=''): +def get_metadata(obj_id, key=None, ff_env=None, check_queue=False, add_on='', vapp: Optional[VirtualApp] = None): + """ Function to get metadata for a given obj_id (uuid or @id, most likely). Either takes a dictionary form authentication (MUST include 'server') @@ -290,6 +291,13 @@ def get_metadata(obj_id, key=None, ff_env=None, check_queue=False, add_on=''): "frame=object&force_md5" *REQUIRES ff_env if check_queue is used* """ + if vapp: + url = f"/{obj_id}?{add_on}" + response = vapp.get(url) + if response and response.status_code in [301, 302, 303, 307, 308]: + response = response.follow() + return get_response_json(response) + auth = get_authentication_with_server(key, ff_env) if check_queue and stuff_in_queues(ff_env, check_secondary=False): add_on += '&datastore=database' @@ -989,6 +997,10 @@ def get_schema(name, key=None, ff_env: Optional[str] = None, portal_env: Optiona portal_env = resolve_portal_env(ff_env=ff_env, portal_env=portal_env, portal_vapp=portal_vapp) base_url = f"profiles/{to_camel_case(name)}.json" add_on = 'frame=raw' + + schema = get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on, vapp=portal_vapp) # xyzzy: is this okay? 
test + return schema + if portal_vapp: full_url = f"/{base_url}?{add_on}" res = portal_vapp.get(full_url) diff --git a/dcicutils/validation_utils.py b/dcicutils/validation_utils.py index 9b42069a0..9e80146a0 100644 --- a/dcicutils/validation_utils.py +++ b/dcicutils/validation_utils.py @@ -77,7 +77,7 @@ def fetch_schema(self, schema_name: str): def identifying_properties(self, schema: Optional[JsonSchema] = None, schema_name: Optional[str] = None, among: Optional[List[str]] = None): schema = schema if schema is not None else self.fetch_schema(schema_name) - possible_identifying_properties = set(schema.get("identifyingProperties") or []) | {'uuid'} + possible_identifying_properties = set(self.get_identifying_properties(schema)) | {'uuid'} identifying_properties = sorted(possible_identifying_properties if among is None else (prop @@ -97,6 +97,16 @@ def identifying_value(cls, data_item: Dict[str, AnyJsonData], identifying_proper f' {disjoined_list([repr(x) for x in identifying_properties])}' f' in {json.dumps(data_item)}.') + @staticmethod + def get_identifying_properties(schema: dict) -> list: + if not schema: + return [] + identifying_properties = schema.get("identifyingProperties", []) + # Implicitly add "identifier" to "identifyingProperties", if it exists. + if "identifier" not in identifying_properties and "identifier" in schema.get("properties", {}): + identifying_properties.append("identifier") + return identifying_properties + def validate_data_against_schemas(data: TabbedSheetData, *, portal_env: Optional[str] = None, @@ -196,7 +206,7 @@ def validate_data_item_against_schemas(data_item: AnyJsonData, data_type: str, """ errors = [] - identifying_properties = schema.get("identifyingProperties", []) + identifying_properties = SchemaManager.get_identifying_properties(schema) identifying_value = SchemaManager.identifying_value(data_item, identifying_properties) if not identifying_value: errors.append({ @@ -264,7 +274,7 @@ def summary_of_data_validation_errors(data_validation_errors: Dict, missing_properties_count += 1 if error.get("extraneous_properties"): extraneous_properties_count += 1 - if error.get("unclassified_error_count"): + if error.get("unclassified_error"): unclassified_error_count += 1 if error.get("exception"): exception_count += 1 diff --git a/pyproject.toml b/pyproject.toml index 091a071a3..4094305b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.1.0" +version = "8.1.0.1b1" # TODO: To become 8.2.0 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 1a96dd7ebfb2bb6a2d48e66ce98b51964a7033a4 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 2 Nov 2023 10:36:51 -0400 Subject: [PATCH 095/101] typp --- dcicutils/ff_utils.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index 626551d52..e18ded6fd 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -997,18 +997,7 @@ def get_schema(name, key=None, ff_env: Optional[str] = None, portal_env: Optiona portal_env = resolve_portal_env(ff_env=ff_env, portal_env=portal_env, portal_vapp=portal_vapp) base_url = f"profiles/{to_camel_case(name)}.json" add_on = 'frame=raw' - - schema = get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on, vapp=portal_vapp) # xyzzy: is this okay? 
test - return schema - - if portal_vapp: - full_url = f"/{base_url}?{add_on}" - res = portal_vapp.get(full_url) - return get_response_json(res) - else: - schema = get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on) - return schema - + return get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on, vapp=portal_vapp) def get_schemas(key=None, ff_env: Optional[str] = None, *, allow_abstract: bool = True, require_id: bool = False, portal_env: Optional[str] = None, portal_vapp: Optional[VirtualApp] = None) -> Dict[str, Dict]: From 7e27c3f1ec7f6a6ec8517c6110116d33cfb5060f Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 2 Nov 2023 10:40:34 -0400 Subject: [PATCH 096/101] removed dead code --- dcicutils/bundle_utils.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 462e75387..7d7de0069 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -654,19 +654,8 @@ def load_items(filename: str, tab_name: Optional[str] = None, escaping: Optional # No fancy checking for things like .json, etc. for now. Only check things that came from # spreadsheet-like data, where structural datatypes are forced into strings. checked_items = tabbed_rows - if validate: problems = validate_data_against_schemas(checked_items, portal_env=portal_env, portal_vapp=portal_vapp, override_schemas=override_schemas) return checked_items, problems - error_summary = summary_of_data_validation_errors(problems) - if error_summary: - for item in error_summary: - PRINT(item) - raise Exception("Validation problems were seen.") - # TODO: Maybe connect validation here. Although another option is to just call validation separately - # once this is successfully loaded. Needs thought. However, David's validation_utils can do - # the validation if we decide to do it, it would just need to be connected up. 
- # -kmp 23-Oct-2023 - raise NotImplementedError("Need to implement validation.") return checked_items From 8a4adb361080ba1881a9b85456d596124318c2cb Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 2 Nov 2023 12:07:19 -0400 Subject: [PATCH 097/101] flake8 fixes --- dcicutils/bundle_utils.py | 9 +++++---- dcicutils/ff_utils.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/dcicutils/bundle_utils.py b/dcicutils/bundle_utils.py index 7d7de0069..fb76c74a5 100644 --- a/dcicutils/bundle_utils.py +++ b/dcicutils/bundle_utils.py @@ -4,14 +4,13 @@ from .common import AnyJsonData from .env_utils import EnvUtils, public_env_name from .ff_utils import get_metadata -from .lang_utils import there_are from .misc_utils import AbstractVirtualApp, ignored, ignorable, PRINT, to_camel_case from .sheet_utils import ( LoadTableError, prefer_number, TabbedJsonSchemas, Header, Headers, TabbedHeaders, ParsedHeader, ParsedHeaders, TabbedParsedHeaders, SheetCellValue, TabbedSheetData, TableSetManagerRegistry, AbstractTableSetManager, InsertsManager, TableSetManager, load_table_set, ) -from .validation_utils import SchemaManager, validate_data_against_schemas, summary_of_data_validation_errors +from .validation_utils import SchemaManager, validate_data_against_schemas PatchPrototype = Dict @@ -40,7 +39,8 @@ def __str__(self): class ValidationProblem(Exception): - pass + def __init__(self, problems: Optional[dict] = None): + self.problems = problems class TypeHint: @@ -506,7 +506,8 @@ def raise_any_pending_problems(self): if problems: for problem in problems: PRINT(f"Problem: {problem}") - raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False)) + raise ValidationProblem(problems) + # raise Exception(there_are(problems, kind='problem while compiling hints', tense='past', show=False)) def check_tabs(self): result = {tab_name: self.check_tab(tab_name) diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index e18ded6fd..4a4abf428 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -278,7 +278,6 @@ def _sls(val): def get_metadata(obj_id, key=None, ff_env=None, check_queue=False, add_on='', vapp: Optional[VirtualApp] = None): - """ Function to get metadata for a given obj_id (uuid or @id, most likely). 
Either takes a dictionary form authentication (MUST include 'server') @@ -999,6 +998,7 @@ def get_schema(name, key=None, ff_env: Optional[str] = None, portal_env: Optiona add_on = 'frame=raw' return get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on, vapp=portal_vapp) + def get_schemas(key=None, ff_env: Optional[str] = None, *, allow_abstract: bool = True, require_id: bool = False, portal_env: Optional[str] = None, portal_vapp: Optional[VirtualApp] = None) -> Dict[str, Dict]: """ From cbc91bd6f68b6e566fb7bb26a795cc3d6858db6b Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 2 Nov 2023 13:21:06 -0400 Subject: [PATCH 098/101] Test fixups --- test/test_bundle_utils.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index b5d837339..8025fe4db 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -14,7 +14,7 @@ # High-level interfaces load_table_structures, load_items, # Low-level implementation - SchemaManager, ItemTools, TableChecker, OptionalTypeHints, + SchemaManager, ItemTools, TableChecker, OptionalTypeHints, ValidationProblem, BoolHint, NumHint, # Probably we should test NumHint, TypeHint, EnumHint, RefHint, etc. as well. -kmp 23-Oct-2023 ) @@ -635,10 +635,7 @@ def get(self, path_url): old_count = portal_vapp.call_count with mock.patch.object(ff_utils_module, "get_authentication_with_server", mock_not_called("get_authentication_with_server")): - with mock.patch.object(ff_utils_module, "get_metadata", - mock_not_called("get_metadata")): - actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, - tab_name='ExperimentSeq', portal_vapp=portal_vapp) + actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, tab_name='ExperimentSeq', portal_vapp=portal_vapp) assert portal_vapp.call_count == old_count + 1 assert actual_items == expected_items @@ -718,12 +715,13 @@ def test_table_checker(): flattened=True, portal_env=mock_ff_env) checker.check_tabs() - assert str(exc.value) == "There were 2 problems while compiling hints." 
- assert printed.lines == [ - f"Problem: User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}", - (f"Problem: User[0].user_institution: Unable to validate Institution reference:" - f" {SAMPLE_INSTITUTION_UUID!r}") + expected_problems = [ + f"User[0].project: Unable to validate Project reference: {SAMPLE_PROJECT_UUID!r}", + f"User[0].user_institution: Unable to validate Institution reference: {SAMPLE_INSTITUTION_UUID!r}" ] + expected_problem_lines = [f"Problem: {problem}" for problem in expected_problems] + assert exc.value.problems == expected_problems + assert printed.lines == expected_problem_lines checker = TableChecker(SAMPLE_WORKBOOK_WITH_MATCHED_UUID_REFS, flattened=True, From 2e8dc6edd5f1ea1226d6f3c58544cd71d5e4e0c1 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 2 Nov 2023 13:22:56 -0400 Subject: [PATCH 099/101] flake8 fixes --- test/test_bundle_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_bundle_utils.py b/test/test_bundle_utils.py index 8025fe4db..6e1da6685 100644 --- a/test/test_bundle_utils.py +++ b/test/test_bundle_utils.py @@ -14,7 +14,7 @@ # High-level interfaces load_table_structures, load_items, # Low-level implementation - SchemaManager, ItemTools, TableChecker, OptionalTypeHints, ValidationProblem, + SchemaManager, ItemTools, TableChecker, OptionalTypeHints, BoolHint, NumHint, # Probably we should test NumHint, TypeHint, EnumHint, RefHint, etc. as well. -kmp 23-Oct-2023 ) @@ -635,7 +635,8 @@ def get(self, path_url): old_count = portal_vapp.call_count with mock.patch.object(ff_utils_module, "get_authentication_with_server", mock_not_called("get_authentication_with_server")): - actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, tab_name='ExperimentSeq', portal_vapp=portal_vapp) + actual_items = load_items(SAMPLE_ITEMS_FOR_REAL_SCHEMAS_FILE, + tab_name='ExperimentSeq', portal_vapp=portal_vapp) assert portal_vapp.call_count == old_count + 1 assert actual_items == expected_items From e0f55a3f4ad95054dda00efc1c7efbcb405f59f4 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 2 Nov 2023 13:59:27 -0400 Subject: [PATCH 100/101] Updates for test fixes --- dcicutils/ff_utils.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index 4a4abf428..e2c6ac089 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -996,7 +996,19 @@ def get_schema(name, key=None, ff_env: Optional[str] = None, portal_env: Optiona portal_env = resolve_portal_env(ff_env=ff_env, portal_env=portal_env, portal_vapp=portal_vapp) base_url = f"profiles/{to_camel_case(name)}.json" add_on = 'frame=raw' - return get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on, vapp=portal_vapp) + + # TODO + # Now that get_metadata supported portal_vapp we can do: + # return get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on, vapp=portal_vapp) + # however this breaks test_ff_utils.test_get_schema_with_vapp and no time to fix. 2023-11-02. 
+ + if portal_vapp: + full_url = f"/{base_url}?{add_on}" + res = portal_vapp.get(full_url) + return get_response_json(res) + else: + schema = get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on) + return schema def get_schemas(key=None, ff_env: Optional[str] = None, *, allow_abstract: bool = True, require_id: bool = False, From 7c6908524c8f27f04d8867bdd3efd5541106b3de Mon Sep 17 00:00:00 2001 From: David Michaels Date: Thu, 2 Nov 2023 14:04:30 -0400 Subject: [PATCH 101/101] update to 8.2.0; pr approved; ready to merge to master --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4094305b6..5c3863c3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.1.0.1b1" # TODO: To become 8.2.0 +version = "8.2.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT"
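
Not part of the patch series itself: a minimal sketch of how calling code might consume the structured ``problems`` attribute that ``ValidationProblem`` gains in PATCH 097, rather than parsing the exception message as the old tests did. It assumes ``load_items`` surfaces the ``ValidationProblem`` raised during reference checking and accepts a ``portal_env`` keyword as suggested by the hunk in PATCH 096; the workbook filename and environment name are hypothetical placeholders.

    from dcicutils.bundle_utils import ValidationProblem, load_items

    def load_or_report(filename: str):
        # "my_workbook.xlsx" and the "data" portal_env below are hypothetical.
        try:
            return load_items(filename, portal_env="data")
        except ValidationProblem as exc:
            # PATCH 097 puts the pending problems on the exception itself; the
            # reworked test_table_checker in PATCH 098 asserts on exc.value.problems
            # instead of matching the exception message text.
            for problem in exc.problems or []:
                print(f"Problem: {problem}")
            raise

    items = load_or_report("my_workbook.xlsx")

The design choice reflected in PATCH 097 is that callers get the same "Problem: ..." lines that are printed during hint compilation, but as data they can inspect or log, so downstream tooling does not have to re-parse a summary string.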