diff --git a/README.md b/README.md index 870f602..6351d9b 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ import numpy as np import literature R = sparse.load_npz("data/thrm_vertex_matrix.npz") -mats = np.array(open("data/thrm_mats.txt", "r").read().splitlines) +mats = np.array(open("data/thrm_mats.txt", "r").read().splitlines()) props = ["thermoelectric"] ``` We have also included a file that includes publication year of the papers that we considered in our vertex matrix, which can be used to limit our focus to @@ -77,6 +77,11 @@ The sequences generated from the random walk process could then be used to train ``` import utils +with open("rw_seqs.txt", "w") as file: + for i in range(100): + rw_seqs = h.random_walk(length, size, start_inds=prop_ind, alpha=2, rand_seed=i)[0][0] # non-uniform sampling (alpha=2) + file.write(rw_seqs+'\n') + seqs = open("rw_seqs.txt").read().splitlines() # reading the sequences seqs_noauthors = utils.remove_authors_from_RW(seqs) # removing the author nodes open("rw_seqs_noauthors.txt", "w").write("\n".join(seqs_noauthors)+"\n") # saving the pruned sequences @@ -86,6 +91,8 @@ Now, we are ready to build our word embedding model. 
The default parameters are ``` seqs_noauthor_path = "rw_seqs_noauthors.txt" +import embedding + embed = embedding.dww2v(seqs_noauthor_path, workers=20) # initiating deepwalk model with a different value for parameter workers embed.build_model() embed.train() diff --git a/environment_acc.yaml b/environment_acc.yaml new file mode 100644 index 0000000..b426910 --- /dev/null +++ b/environment_acc.yaml @@ -0,0 +1,67 @@ +name: acc +channels: + - r + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - ca-certificates=2023.08.22=h06a4308_0 + - ld_impl_linux-64=2.38=h1181459_1 + - libffi=3.4.4=h6a678d5_0 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libstdcxx-ng=11.2.0=h1234567_1 + - ncurses=6.4=h6a678d5_0 + - openssl=3.0.12=h7f8727e_0 + - pip=23.3=py38h06a4308_0 + - python=3.8.18=h955ad1f_0 + - readline=8.2=h5eee18b_0 + - setuptools=68.0.0=py38h06a4308_0 + - sqlite=3.41.2=h5eee18b_0 + - tk=8.6.12=h1ccaba5_0 + - wheel=0.41.2=py38h06a4308_0 + - xz=5.4.2=h5eee18b_0 + - zlib=1.2.13=h5eee18b_0 + - pip: + - asttokens==2.4.1 + - backcall==0.2.0 + - comm==0.2.0 + - debugpy==1.8.0 + - decorator==5.1.1 + - executing==2.0.1 + - gensim==3.8.0 + - importlib-metadata==6.8.0 + - ipykernel==6.26.0 + - ipython==8.12.3 + - jedi==0.19.1 + - joblib==1.1.0 + - jupyter-client==8.6.0 + - jupyter-core==5.5.0 + - matplotlib-inline==0.1.6 + - nest-asyncio==1.5.8 + - networkx==2.7.1 + - numpy==1.24.4 + - packaging==23.2 + - parso==0.8.3 + - pexpect==4.8.0 + - pickleshare==0.7.5 + - platformdirs==3.11.0 + - prompt-toolkit==3.0.39 + - psutil==5.9.6 + - ptyprocess==0.7.0 + - pure-eval==0.2.2 + - pygments==2.16.1 + - pymysql==0.9.3 + - python-dateutil==2.8.2 + - pyzmq==25.1.1 + - regex==2022.3.2 + - scipy==1.7.3 + - six==1.16.0 + - smart-open==6.4.0 + - stack-data==0.6.3 + - tornado==6.3.3 + - tqdm==4.63.0 + - traitlets==5.13.0 + - typing-extensions==4.8.0 + - wcwidth==0.2.9 + - zipp==3.17.0 diff --git a/rw_seqs.txt b/rw_seqs.txt new file mode 100644 
index 0000000..2a5eba3 --- /dev/null +++ b/rw_seqs.txt @@ -0,0 +1,100 @@ +thermoelectric CeGePd CeGeNi Ce2Ge5Ni3 Ce3Ge4Ni4 CeGeNi CeGePd CeGePt a_123861 GePdPr GePdSm GeNdPd GePdPr Ge4Pd4Yb3 PdSiY LaPtSi GeLaPt EuGePt CeCu2Hg2K IrLaSi +thermoelectric a_879291 a_178403 a_111179 a_683402 a_872458 a_1199330 a_872458 a_1137276 a_683402 a_651505 Al33As100Ga67 a_451705 Al33As100Ga67 a_29234 AlAsGa AsFe2 GaGeMn a_812449 GaGeMn +thermoelectric a_147396 a_1172334 a_266769 a_892159 a_770923 +thermoelectric a_886390 a_885543 CuIr2Se4 a_250894 Nd2O3 Al2O3 a_510612 Al2O3 O2Zr a_214510 CsOV a_598578 CsOV a_248545 CsOV a_1391968 CsOV a_702306 CsOV +thermoelectric Ba2Ca2Cu3HgO8 Ba2Ca4Cu5HgO12 a_1357073 a_1161962 a_1706109 a_1161962 BaO2 CuNaO2 C50Ca239H300O580 CuNaO2 Ca239Cu300O580 CuNaO2 a_266618 a_38821 AlB2 Ag297Al253Pr50 a_419816 AlNiZr NiSiTi +thermoelectric La7Mn10O30Sr3 AlLaO3 a_306599 a_188619 a_306599 O3SrTi a_419892 a_1528441 a_746437 a_284483 a_1706398 a_284483 a_1527848 Se2Ti a_151900 a_1041516 CdSb TeZn a_440098 +thermoelectric a_888738 a_265149 Ca3La7Mn10O30 a_265149 CuO2 Ca2Cl2CuO2 a_248865 a_1366385 a_248865 a_1366385 a_267536 a_1366385 a_248865 a_282525 a_248865 a_248866 a_248865 Ca2Cl2CuO2 Cl2CuO2Sr2 +thermoelectric Bi2Te3 Bi2Se3 BiSb3Te6 a_376053 a_766791 a_1226992 a_478144 a_1226992 a_682058 a_478144 a_1226992 a_682058 a_478144 a_885821 a_682058 FeNi4 a_769245 a_1300116 a_209378 +thermoelectric CoSe4Ti2 a_1706181 a_284483 FeTe20Ti10 a_746437 FeTe20Ti10 a_1206056 FeTe20Ti10 a_1206056 a_746437 a_1195788 a_746437 a_887261 a_887263 CuO H3N a_248122 a_1233096 a_1017575 +thermoelectric Sb2Te3 a_283091 a_388569 a_388570 Sb2Te3 Te3V2 a_43652 a_422943 a_617422 a_1176430 AsGa GaP GaN a_639806 GaN AlN a_249535 InN a_1184093 +thermoelectric a_867583 a_1699829 a_867583 thermoelectric a_176969 NaNi4O12P3 KMn4O12P3 KNi4O12P3 a_420631 KNi4O12P3 a_176969 KMn4O12P3 a_176969 CrLi a_176969 LaNiO3 a_77172 a_171721 a_288386 +thermoelectric a_6613 O9P2V2 O7P2 Mg109O700P200 
Na182Ni109O700P200 a_282814 O5P2 O5V2 a_55533 a_55532 O3Yb2 a_1222259 a_126954 CuO Cu2O ClCu BrCu CdS a_636492 +thermoelectric a_892182 thermoelectric Bi4ITe7V a_63607 a_251047 thermoelectric FeHgSe a_1706687 FeHgSe a_1037259 a_1706687 a_1037259 HgSe GaSb InSb a_230838 RuTe2 a_1159304 RuTe2 +thermoelectric a_2585 a_1109950 Ba33La67Mn100O300 a_1284922 Eu7La60Mn100O300Sr33 a_523572 a_1623679 a_1623677 a_523572 a_1623677 a_523570 a_523569 a_453650 a_886065 a_886283 a_453650 a_692048 a_1199279 a_453650 +thermoelectric B2CNi2Y a_125798 a_250115 a_125798 a_303511 a_870353 a_24695 a_399523 a_660928 a_1204142 a_24695 MgO a_34404 MgO a_97849 MgO a_740925 MgO a_624080 +thermoelectric a_250178 CuO2 a_1287039 a_1305509 a_1287039 a_1305509 a_1710318 a_1305509 a_1162022 a_1305509 a_1710317 a_1305509 a_1710318 Cr2O3 O2Ti H3N O2Si a_1323129 O2Si +thermoelectric AsGa a_203756 a_624825 a_1111570 AsGa a_427639 a_636815 a_496313 AsGaIn AsGa a_639695 AsGa a_315911 AsGa a_666696 a_658323 a_666696 a_658323 AsGaInP +thermoelectric a_1028155 a_181213 Cu2GdO8RuSr2 a_1205756 a_216140 a_7494 a_892288 a_584000 a_420536 a_581838 CrSbSe3 a_581838 a_420536 a_8548 a_581838 CrSbSe3 a_7494 a_459676 a_656753 +thermoelectric CuFe2O4 Cu11Fe18GeO40 CuFe2O4 Cr2CuO4 a_1594253 a_1594254 a_1213514 a_1608138 B2O3 B2Zr B2Ta Al3Ti Al61Mn14Ti25 Al57Mn14Ti29 Al61Mn14Ti25 a_1211440 Al59Mn14Ti27 a_1190176 Al57Mn14Ti29 +thermoelectric a_746030 a_24484 a_63744 thermoelectric a_442563 a_442560 a_75350 a_442560 FeSi SiV3 Cr3Si Cr5Si3 Ni3Si2 Cr3Si Al42H63Nb83 Al21Nb79 Cr3Si a_54334 a_54332 +thermoelectric a_1328986 thermoelectric a_57183 OPZn a_41320 CoNaO4P a_57183 a_14010 a_419700 a_26407 thermoelectric a_172466 a_1160687 a_1578961 a_1160687 a_1578960 a_98739 a_419910 CsO3P +thermoelectric a_419986 a_419985 a_419986 a_419989 a_497194 a_497196 a_229071 BCl3 B3C CH4 a_76568 CuNiZn a_13586 a_1744 a_12439 CuNiZn a_5786 a_1217026 a_1579010 +thermoelectric a_693950 a_1213612 a_492755 thermoelectric a_465126 a_181584 
La7Mn10O30Sr3 a_7493 a_1136430 a_419687 CuLa2O4 KMnO4 a_745961 a_572945 ClK Cl2Mg a_44908 Cl2Mg a_334178 +thermoelectric CePd2Si2 a_427546 Cu2Si2Yb a_427546 CePd2Si2 a_812327 a_427546 thermoelectric a_662332 Si2Ti a_3138 a_1186228 a_3138 a_1217882 a_3138 Si2Ti Fe2O3 a_507334 N2O8U +thermoelectric a_1681096 a_1169611 a_1706375 a_497254 C2H4NO a_170990 C2H4NO a_1365291 C2H4NO C2F3HNO a_1365292 C2H4NO C2F3HNO CH3 a_44191 CH3 Na2O4S a_772281 Na2O4S +thermoelectric a_1207004 a_265149 a_1207004 a_888738 Ca3La7Mn10O30 a_45551 Ca4Cu20Pb35Sr40Y16 CuI C2H3N a_53084 O2Ti O3PbTiZr NTi a_210508 NTi a_1179811 NTi N4Si3 a_893867 +thermoelectric a_143073 a_1170170 a_79380 a_248907 a_48177 a_747536 a_747564 a_464830 F6P AsF6 F6P a_54918 F6P a_1173041 F6P AsF6 F6P AsF6 a_64621 +thermoelectric a_1136655 a_389112 FeGdO3 a_248638 La2MgO6Rh La2O6RhZn La2MgO6Rh a_248638 LaNi a_109957 a_884917 a_369611 a_365676 a_365546 a_359155 a_359153 a_359155 a_359039 a_466478 +thermoelectric B2Ti O2Zr O3W In2O3 InO3P InO4P a_12889 InO4P a_536826 AsGa a_678949 a_871116 a_1176348 a_473328 a_374459 a_1180583 a_467515 a_374459 a_1180583 +thermoelectric a_1116484 CrTe AsNi AsGa a_1517090 AsGa SeZn a_518626 OZn Cr2O3 CrN NTi a_216232 a_216231 a_216232 a_216231 a_216232 a_216231 a_551585 +thermoelectric a_358296 a_1575372 CSi a_6751 a_1623453 BeO Li4O4Si a_260799 Li4O4Si a_682181 S2Si a_3625 a_1355035 AlAsGa AsGa InP H2O BO4 BO3 +thermoelectric FeZr thermoelectric a_1146695 a_673344 a_1146695 a_1150427 a_1146695 a_1150427 a_1146695 thermoelectric a_315645 Ho3La11Mn20O60Sr6 a_504991 Ho3La11Mn20O60Sr6 a_504991 a_638542 a_1158199 a_872590 a_144895 +thermoelectric Co67Fe33 thermoelectric I3Sb BiI3 BiI6 IRb BiI3 a_440253 a_440284 BrK a_817406 ClK H2O4S ClH a_485771 ClH a_388987 a_388986 B4C +thermoelectric a_800860 a_1208738 a_326 Al3La10Ni47 a_25837 CoMn6Ni11TiV2Zr9 a_25837 a_2615 a_2614 a_2615 a_1224151 a_25837 a_2613 a_2614 a_25837 CoMn6Ni11TiV2Zr9 a_25837 a_2615 a_2613 +thermoelectric CO3Sr CO2 CH3NO2 
a_110193 CH3NO2 CO2 CeN3O9 C3CeO3 CeN3O9 a_357980 CeCl3 AlCl3 a_2425 a_1702688 a_2425 AlCl3 a_1139411 AlCl3 Al3Ti +thermoelectric a_32740 a_264880 a_694884 a_1574239 OPb a_376081 a_376085 OPb S3Sb2 a_190986 S3Sb2 a_10076 a_10077 Te3Tl2 a_10077 FeS2 a_249051 FeS2 MnO2 +thermoelectric O5V2 a_71098 O5V2 a_746133 a_4094 Cr11O516V200 a_266203 a_18514 O5V2 a_1211213 a_1211212 O5V2 a_273326 a_271781 a_273323 CZr CTi NiTi a_1026991 +thermoelectric PbS a_637006 PbS a_496858 PbS a_203757 a_428655 a_72462 a_265483 a_1294437 a_265483 a_203757 PbS a_1171968 PbS a_152073 O2Si N4Si3 a_84532 +thermoelectric CePdSb AsCePd CePdSb CeRhSn a_249862 LaNiSn a_812269 CeNiSn a_1197912 CeNiSn CeNi CeNiSn CeGaPt a_390618 CeGaPt NiSiTi GeIrTb a_314269 Os2Si2Tb +thermoelectric a_452776 a_249886 Al5CeNi2 a_249886 Al5CeNi2 a_886058 a_705055 a_1528265 a_249886 thermoelectric a_419990 a_419986 thermoelectric a_496703 a_1209160 O2Si Bi2O9SrTa2 a_1137160 Ba80BiNd39O600Ta120Ti80 +thermoelectric a_1153854 a_1226791 a_1153854 thermoelectric Bi2Se3 BiSb3Te6 Sb2Te3 Sb2Se3 a_211504 a_211508 a_303139 a_479067 a_211508 As2S3 a_1174214 OPb Bi2O3 a_339726 a_1109856 +thermoelectric a_896645 a_896646 a_1212110 a_1212109 a_896646 a_1212109 a_896646 a_896645 a_1212109 a_896646 a_1212110 a_896646 a_1212109 a_1212110 a_1212109 a_896645 a_1212110 a_896646 thermoelectric +thermoelectric Fe2Si5 FeSi2 a_497049 FeSi2 a_512059 a_871974 a_50641 LiNbO3 a_239325 a_229786 a_239325 LiNbO3 a_514711 LiNbO3 a_285571 a_481052 GaN a_1706858 a_496993 +thermoelectric B2Ti a_894649 AlSiTi SiTi Al3Ti Al2O3 a_307712 a_307711 a_1311372 Al3Nb AlNi3 a_1224997 AlNi3 a_1190254 a_903101 AlNi3 H2O Ce2O3 H2O +thermoelectric a_420564 a_681555 a_681556 a_453650 a_681556 a_523766 a_19497 a_1299957 a_1109950 a_453650 a_142746 a_886065 a_888385 a_479171 a_241061 a_440733 a_479171 a_1707886 a_888385 +thermoelectric B6Sm a_68237 AlLaO3 a_636818 GaN GaInN GaN a_189502 GaN a_51800 a_285883 CSi Al2O3 a_229260 a_241904 a_229260 NiO a_1607623 NiO 
+thermoelectric PbTe a_1220820 PbTe O32Te PbTe a_287484 PbSe a_1706730 Ge20Pb133Sn247Te400 SnTe a_650985 SnTe GeTe a_1223602 Ge2Sb2Te5 a_495506 Ag8In14Sb55Te23 Ge2Sb2Te5 a_249473 +thermoelectric C39Ti50 C13Ti50 C49Ti100 C39Ti50 a_675001 a_2986 a_250820 a_205933 a_871922 a_2986 a_871921 a_2986 Ga3U a_888175 a_378251 a_888175 Ga3U Bi2U a_885525 +thermoelectric ClH a_258208 a_1391917 CO2 a_896642 a_184787 CSi C4H12Si CSi a_876242 a_212325 a_876242 CSi a_213214 CH4 a_674906 CH4 O3W BaO3Sn +thermoelectric a_147558 thermoelectric C2H a_747537 O4RuSr2 O4Ru O4RuSr2 a_872203 a_1026614 a_1026616 a_1026741 a_1026614 a_1026616 a_1026614 a_872203 a_872204 a_872203 O4RuSr2 a_1026616 +thermoelectric a_1305777 CSi a_899536 CSi a_1577927 CSi a_874070 CSi a_1218976 CSi a_1329009 CSi a_23255 a_58432 Al2O3 B2Ti Cr2O3 O3Y2 CeO2 +thermoelectric a_248786 CuO2 a_1305554 CuO2 a_248787 a_248788 a_534317 a_248788 a_248789 a_160911 a_1200187 NbSe3 S3Ta a_1200109 a_374328 K3Mo10O30 a_1117625 K3Mo10O30 S3Ta +thermoelectric a_1198542 thermoelectric a_147558 thermoelectric a_1710043 thermoelectric FeSi2 Mg2Si a_1619831 a_898402 a_265260 Al2O3 a_1078069 Al2O3 a_266953 Al2O3 O2Sn a_76168 a_512653 +thermoelectric a_755504 a_530772 a_184006 a_38708 a_530772 a_661783 a_770218 a_661783 a_214365 MgNi MgNi2 MgNi a_8234 Mg67Ni33 a_8234 Mg2Ni a_1223098 MgNi2 a_571680 +thermoelectric a_456282 AsGa GaP InP a_378857 InP AsIn AsGa a_1180621 AlAsGa a_1174495 a_495822 AsGa a_104007 AsIn HgTeZn AsIn AsGa AlAsGa +thermoelectric Cd2GeO4 a_304789 a_5162 a_304789 Cd2O4Pb Cd2GeO4 a_682193 Cd2GeO4 AgO3Sb Cd2O4Pb Cd2GeO4 a_682194 Cd2GeO4 AgO3Sb Cd2GeO4 Cd2O4Pb a_13407 a_304789 a_5162 +thermoelectric a_94016 a_442563 a_872601 a_54826 a_47628 a_46159 a_161187 CePt2Sn2 a_442135 Al6Fe6Tb Al6ErFe6 a_300880 Al6ErFe6 Al6Fe6Tb Al6ErFe6 Al6Fe6Tb Al6ErFe6 a_812459 a_390486 +thermoelectric a_146530 a_4835 a_146530 thermoelectric GaS8V4 GaMo4S8 GaS8V4 GaMo4S8 a_36730 a_775853 a_144672 a_485134 a_235508 AsGa a_541118 GeSi a_584970 
AlAsGa Al3As10Ga7 +thermoelectric a_1222619 a_274147 a_1210204 Al2O3 AlCrNi3 NiO Al2O3 a_81198 a_389652 N4Si3 a_210001 N4Si3 a_481461 a_300385 a_17485 a_510441 GeSi3 H4Si H3Si +thermoelectric a_770069 a_192491 a_457842 a_796968 a_457842 a_449731 a_457842 a_449731 a_457842 a_449731 a_457842 a_796968 a_457842 a_770069 a_1706863 thermoelectric HgSe AsGa AlAsGa +thermoelectric a_147558 a_746054 a_444852 a_147558 a_24484 thermoelectric a_682776 a_160994 a_2576 F6LiP a_889410 F6LiP a_747039 F6LiP CoLiO2 a_1702731 a_889667 a_170381 LiNbO3 +thermoelectric a_27981 a_1117069 a_27981 a_1117069 a_393123 a_1227451 a_1630932 a_1227451 a_1630932 a_1227451 a_1630932 a_1227451 a_1528660 a_393123 a_27981 a_1117069 a_393123 a_27981 a_1227451 +thermoelectric In2O3 a_662447 In2O3 a_889204 In2O a_8322 In2O ClNa a_660912 a_660911 ClNa ClK a_327189 a_1466257 a_327189 a_1466257 a_327189 ClK BrK +thermoelectric SbU Be13U O4RuSr2 a_477976 a_1316873 O4RuSr2 a_1026015 O4RuSr2 a_419936 O4RuSr2 a_1606487 O4RuSr2 Pt3U CeRu2Si2 a_1029056 a_705055 a_886058 AlCePd a_1318244 +thermoelectric a_496383 I3Sb Bi40Se3Te57 I3Sb Bi40Se3Te57 I3Sb a_212665 a_212664 NTi a_15212 a_15211 O2Si AsNP a_755433 a_755432 a_755431 a_1217570 a_12211 a_755434 +thermoelectric a_249886 a_267075 a_872057 Al21Fe109La10 a_872057 Fe13Nd6Sn Fe13Pr6 Fe13Pr6Sn a_441237 a_1223290 a_157034 a_443837 a_709422 a_242644 a_1198422 a_888798 a_37858 O2Ti NTi +thermoelectric a_328351 Al21Fe4 a_328351 a_892195 a_1149190 a_892195 a_328351 a_892195 a_1149190 a_892195 a_328351 Al7Mn2 a_892195 a_328351 a_892195 Al7Mn2 a_892195 a_328351 a_892195 +thermoelectric a_287202 +thermoelectric a_693448 a_693182 a_159248 a_693448 SeTl InSe2Tl SeTl PSe SeZn a_521515 a_636549 SeZn a_9167 SeZn a_1190151 SeZn CdSeZn GaInN GaN +thermoelectric GeV a_1466757 a_1466751 a_829147 a_1466744 a_1466758 a_1466679 a_1515994 a_1507728 BP98 a_1477487 a_1481788 a_1481787 a_1477511 a_1481768 GeV a_1515489 GeV a_1483850 +thermoelectric a_1706109 thermoelectric a_79380 
a_152391 a_57676 a_79380 a_249737 a_540250 a_13189 a_10712 Ba2NaNb5O15 a_1706654 a_849305 Ba2NaNb5O15 KO5PTi KTi K100O500P100Sn7Ti93 a_767945 a_442702 +thermoelectric a_420332 a_266801 a_1291255 a_420332 a_266801 a_1137087 a_266801 thermoelectric a_1201464 a_1200790 a_1201462 a_1098750 thermoelectric a_214777 +thermoelectric a_747120 a_195678 a_514788 a_681730 a_195678 a_681730 a_514788 a_195678 thermoelectric Sb2Se3 a_265272 a_265273 a_1150361 a_265272 As3S7 a_265273 AsSe a_265271 a_265269 +thermoelectric a_373477 thermoelectric a_286527 a_1206627 Ba2CuHgO4 Ba2Cu3O7Y a_768929 Ba2Cu3O7Y a_549782 a_149892 a_149888 a_287555 a_149892 a_872590 BCErNi B2CNi2Yb BCNiYb B2CNi2Yb a_506529 +thermoelectric a_902703 a_161724 Li4Mn5O12 Li2O a_271300 Li4O4Si a_271310 a_271311 a_271310 Li4O4Si CSi a_1574763 CSi a_1623473 CSi a_213838 a_891907 a_807485 a_213838 +thermoelectric a_205799 S2Ti LiS2Ti S2Ti MoSe2 S2W O5P2 a_678332 a_98130 O5P2 a_546764 a_1706827 a_1606449 O5P2 a_428610 Al2O3 O42S a_218576 O42S +thermoelectric O2P a_495622 CuFe2O4 thermoelectric GeSi a_12770 GeSi a_1322610 GeSi a_1176495 GeSi a_513650 GaN Cl3Ga H2O AlC3H9 a_8448 O2Si CoF3K +thermoelectric S2Ti a_2988 ILi LiNO3 LiMn2O4 a_867578 LiMn2O4 a_10424 MgO AlN a_8444 a_126149 a_495566 a_126149 a_211876 NTi NTi2 a_769530 a_210059 +thermoelectric B6P a_1110770 a_419962 a_381171 a_419962 thermoelectric a_1577508 thermoelectric a_1215358 Cd9Te10Zn a_1678830 a_636697 a_189663 a_189661 CdTe a_497132 CdTe a_85576 a_440170 +thermoelectric a_271982 a_271980 a_272178 a_1196359 a_271982 a_1196359 a_272178 a_484245 a_271980 a_272178 a_271982 a_271980 a_1154338 a_272288 a_1154338 a_272178 a_271984 a_271982 a_272359 +thermoelectric GaMo4Se4Te4 GaS8V4 GaMo4Se4Te4 GaS8V4 a_36730 a_161836 IrO4Sr2 a_1064 O3Rh2 a_1063 a_81728 O3Rh2 MgO4Rh2 O3Rh2 CH4 BCl3 GaInN GaN a_674106 +thermoelectric a_169319 a_636977 a_203172 K3Mo10O30 a_363256 K3Mo10O30 a_420105 S3Ta a_249918 a_1366493 a_249918 a_1366493 a_249919 S3Ta a_1170053 a_747547 
a_172604 AlInN a_1215420 +thermoelectric CuNd2O4 a_812198 SeZn a_1706138 a_439833 SeZn Cd3Se20Zn17 CdSeZn SSeZn a_456282 AsGa InP a_481796 a_481798 a_83017 CdSeZn a_1706345 a_440461 CdSeZn +thermoelectric a_657376 a_694014 a_315094 a_693975 a_312544 a_204711 a_1213628 a_204711 a_1213628 a_694014 a_204711 a_694014 a_492749 a_204711 a_693975 a_312544 a_204711 a_1213611 a_204711 +thermoelectric a_419944 a_419947 a_303665 a_303666 a_303665 a_419946 a_419944 a_419946 thermoelectric a_1588759 thermoelectric a_636633 GeSi a_1218343 a_3125 a_1193492 a_1145847 GeSi CSi499 +thermoelectric CdCr2S4 Cr2MnS4 CdCr2S4 Cr2FeS4 Cr4CuFeS8 a_264484 a_419770 a_54497 F4K2Ni a_400081 AlCaO4Y a_1706095 AlO6 a_539834 a_480409 AlO6 O4Si CmO a_1136424 +thermoelectric a_643 thermoelectric a_1165338 a_473333 a_473338 a_880233 a_473338 a_47831 a_473338 a_1186319 a_473336 a_456580 AsGa Al7As20Ga13 a_439342 Al8As25Ga17 a_439341 a_812264 AsGa +thermoelectric CaCu2Fe16O27 a_865274 thermoelectric a_452776 LaPdSn CePdSn CeNiSn CeRhSb a_453070 a_161187 CeNiSn a_451827 a_43580 CePdSn a_886694 a_161187 InPt2U a_161187 a_265843 +thermoelectric Fe2O3 Al3Nb AlNb2 a_1311372 AlNb2 a_241237 FeNbP FePTi Fe2O3 a_423419 a_1329750 a_8379 a_8378 a_524381 a_524380 a_524381 a_524379 a_8379 a_1329750 +thermoelectric a_1207076 a_1207074 a_99078 a_248956 a_99078 Mo3Te4 Mo3Se4 a_388490 a_624278 a_266650 a_141327 a_540431 a_141327 a_519727 a_154585 a_149868 a_682126 a_540430 a_682127 +thermoelectric a_892182 thermoelectric a_286691 FeNbO4 a_286691 FeNbO4 a_277961 a_267153 a_267155 CeO2 Gd2O3 O3Y2 a_16359 GdMnO3 MnO3Y Cr2O3 C2Cr3 MoS2 Mo25S28 +thermoelectric a_161724 a_51395 Ga8Ge15Sr4 a_161724 FeZn13 a_1138509 a_1676910 FeZn13 a_161724 Fe2MgO4 a_250296 a_7115 a_388665 Cr3La4MgO12 a_7115 a_388665 CrHO2 CoHO2 a_1193559 +thermoelectric a_1706289 a_266224 thermoelectric a_205096 O4RuSr2 Pt3U a_812329 a_1026208 Ru2Si2U Pd2U Ru2U Ru2Si2U B6Sm a_210701 +thermoelectric a_523333 thermoelectric a_287202 MgO a_103968 KNbO3 Al2O3 
CoFeNi a_1199353 CoFeNi a_1623 GaInP InP a_16907 AlAsIn AsGaIn AlAsGa AsGa a_1145981 +thermoelectric Bi8Ca5Pb2Sr10 a_1064122 a_232081 a_1206757 Bi8Ca5Pb2Sr10 a_1064122 Bi8Ca5Pb2Sr10 thermoelectric AlAsGa AsGa a_637460 AlGaInP a_172646 AlGaInP AsGa a_458393 a_458392 a_458393 a_156018 +thermoelectric a_5715 BaCuOY a_1206245 BaCuOY BaCuNdO O3SrTi CeO2 Bi2O3 BaO CO2 H4NO4S a_89872 H4NO4S a_109909 H4NO4S ClH O2Si a_1218077 O2Si +thermoelectric a_1139968 thermoelectric a_329323 Ba2Cu3O7Y a_885895 a_1308582 a_1139729 a_885895 a_1308582 Ba2Cu3O7Y a_1294125 Ba2Cu3O7Y a_483836 a_1366611 a_885725 a_528747 Bi2CaCu2O8Sr2 a_955649 a_1623640 +thermoelectric a_1620797 thermoelectric a_867583 Ni3S2 NiO4S a_635630 Cl22Ni a_635630 a_635629 a_635630 NiO4S AgNO3 a_867211 a_230639 a_1331294 AgNO3 a_541188 a_515279 CdHgTe +thermoelectric O9Pr5 O89Pr50 a_497709 a_249419 a_249418 a_695313 a_249418 a_695313 a_1217705 a_249419 a_497709 a_695313 a_249418 O4TiZr O3Y2 O2Zr a_1322572 O2Zr Fe2O3 +thermoelectric a_1207004 a_265149 thermoelectric a_147558 a_746030 a_1216878 a_746030 a_746053 a_1216878 thermoelectric Bi4O12Ti3 O6Ti a_298496 a_140521 a_298496 a_132219 a_17750 a_21625 KO5PTi diff --git a/rw_seqs_noauthors.txt b/rw_seqs_noauthors.txt new file mode 100644 index 0000000..5bb76da --- /dev/null +++ b/rw_seqs_noauthors.txt @@ -0,0 +1,92 @@ +thermoelectric CeGePd CeGeNi Ce2Ge5Ni3 Ce3Ge4Ni4 CeGeNi CeGePd CeGePt GePdPr GePdSm GeNdPd GePdPr Ge4Pd4Yb3 PdSiY LaPtSi GeLaPt EuGePt CeCu2Hg2K IrLaSi +thermoelectric Al33As100Ga67 Al33As100Ga67 AlAsGa AsFe2 GaGeMn GaGeMn +thermoelectric CuIr2Se4 Nd2O3 Al2O3 Al2O3 O2Zr CsOV CsOV CsOV CsOV CsOV +thermoelectric Ba2Ca2Cu3HgO8 Ba2Ca4Cu5HgO12 BaO2 CuNaO2 C50Ca239H300O580 CuNaO2 Ca239Cu300O580 CuNaO2 AlB2 Ag297Al253Pr50 AlNiZr NiSiTi +thermoelectric La7Mn10O30Sr3 AlLaO3 O3SrTi Se2Ti CdSb TeZn +thermoelectric Ca3La7Mn10O30 CuO2 Ca2Cl2CuO2 Ca2Cl2CuO2 Cl2CuO2Sr2 +thermoelectric Bi2Te3 Bi2Se3 BiSb3Te6 FeNi4 +thermoelectric CoSe4Ti2 FeTe20Ti10 FeTe20Ti10 
FeTe20Ti10 CuO H3N +thermoelectric Sb2Te3 Sb2Te3 Te3V2 AsGa GaP GaN GaN AlN InN +thermoelectric thermoelectric NaNi4O12P3 KMn4O12P3 KNi4O12P3 KNi4O12P3 KMn4O12P3 CrLi LaNiO3 +thermoelectric O9P2V2 O7P2 Mg109O700P200 Na182Ni109O700P200 O5P2 O5V2 O3Yb2 CuO Cu2O ClCu BrCu CdS +thermoelectric thermoelectric Bi4ITe7V thermoelectric FeHgSe FeHgSe HgSe GaSb InSb RuTe2 RuTe2 +thermoelectric Ba33La67Mn100O300 Eu7La60Mn100O300Sr33 +thermoelectric B2CNi2Y MgO MgO MgO MgO +thermoelectric CuO2 Cr2O3 O2Ti H3N O2Si O2Si +thermoelectric AsGa AsGa AsGaIn AsGa AsGa AsGa AsGaInP +thermoelectric Cu2GdO8RuSr2 CrSbSe3 CrSbSe3 +thermoelectric CuFe2O4 Cu11Fe18GeO40 CuFe2O4 Cr2CuO4 B2O3 B2Zr B2Ta Al3Ti Al61Mn14Ti25 Al57Mn14Ti29 Al61Mn14Ti25 Al59Mn14Ti27 Al57Mn14Ti29 +thermoelectric thermoelectric FeSi SiV3 Cr3Si Cr5Si3 Ni3Si2 Cr3Si Al42H63Nb83 Al21Nb79 Cr3Si +thermoelectric thermoelectric OPZn CoNaO4P thermoelectric CsO3P +thermoelectric BCl3 B3C CH4 CuNiZn CuNiZn +thermoelectric thermoelectric La7Mn10O30Sr3 CuLa2O4 KMnO4 ClK Cl2Mg Cl2Mg +thermoelectric CePd2Si2 Cu2Si2Yb CePd2Si2 thermoelectric Si2Ti Si2Ti Fe2O3 N2O8U +thermoelectric C2H4NO C2H4NO C2H4NO C2F3HNO C2H4NO C2F3HNO CH3 CH3 Na2O4S Na2O4S +thermoelectric Ca3La7Mn10O30 Ca4Cu20Pb35Sr40Y16 CuI C2H3N O2Ti O3PbTiZr NTi NTi NTi N4Si3 +thermoelectric F6P AsF6 F6P F6P F6P AsF6 F6P AsF6 +thermoelectric FeGdO3 La2MgO6Rh La2O6RhZn La2MgO6Rh LaNi +thermoelectric B2Ti O2Zr O3W In2O3 InO3P InO4P InO4P AsGa +thermoelectric CrTe AsNi AsGa AsGa SeZn OZn Cr2O3 CrN NTi +thermoelectric CSi BeO Li4O4Si Li4O4Si S2Si AlAsGa AsGa InP H2O BO4 BO3 +thermoelectric FeZr thermoelectric thermoelectric Ho3La11Mn20O60Sr6 Ho3La11Mn20O60Sr6 +thermoelectric Co67Fe33 thermoelectric I3Sb BiI3 BiI6 IRb BiI3 BrK ClK H2O4S ClH ClH B4C +thermoelectric Al3La10Ni47 CoMn6Ni11TiV2Zr9 CoMn6Ni11TiV2Zr9 +thermoelectric CO3Sr CO2 CH3NO2 CH3NO2 CO2 CeN3O9 C3CeO3 CeN3O9 CeCl3 AlCl3 AlCl3 AlCl3 Al3Ti +thermoelectric OPb OPb S3Sb2 S3Sb2 Te3Tl2 FeS2 FeS2 MnO2 +thermoelectric O5V2 
O5V2 Cr11O516V200 O5V2 O5V2 CZr CTi NiTi +thermoelectric PbS PbS PbS PbS PbS O2Si N4Si3 +thermoelectric CePdSb AsCePd CePdSb CeRhSn LaNiSn CeNiSn CeNiSn CeNi CeNiSn CeGaPt CeGaPt NiSiTi GeIrTb Os2Si2Tb +thermoelectric Al5CeNi2 Al5CeNi2 thermoelectric thermoelectric O2Si Bi2O9SrTa2 Ba80BiNd39O600Ta120Ti80 +thermoelectric thermoelectric Bi2Se3 BiSb3Te6 Sb2Te3 Sb2Se3 As2S3 OPb Bi2O3 +thermoelectric Fe2Si5 FeSi2 FeSi2 LiNbO3 LiNbO3 LiNbO3 GaN +thermoelectric B2Ti AlSiTi SiTi Al3Ti Al2O3 Al3Nb AlNi3 AlNi3 AlNi3 H2O Ce2O3 H2O +thermoelectric B6Sm AlLaO3 GaN GaInN GaN GaN CSi Al2O3 NiO NiO +thermoelectric PbTe PbTe O32Te PbTe PbSe Ge20Pb133Sn247Te400 SnTe SnTe GeTe Ge2Sb2Te5 Ag8In14Sb55Te23 Ge2Sb2Te5 +thermoelectric C39Ti50 C13Ti50 C49Ti100 C39Ti50 Ga3U Ga3U Bi2U +thermoelectric ClH CO2 CSi C4H12Si CSi CSi CH4 CH4 O3W BaO3Sn +thermoelectric thermoelectric C2H O4RuSr2 O4Ru O4RuSr2 O4RuSr2 +thermoelectric CSi CSi CSi CSi CSi CSi Al2O3 B2Ti Cr2O3 O3Y2 CeO2 +thermoelectric CuO2 CuO2 NbSe3 S3Ta K3Mo10O30 K3Mo10O30 S3Ta +thermoelectric thermoelectric thermoelectric thermoelectric FeSi2 Mg2Si Al2O3 Al2O3 Al2O3 O2Sn +thermoelectric MgNi MgNi2 MgNi Mg67Ni33 Mg2Ni MgNi2 +thermoelectric AsGa GaP InP InP AsIn AsGa AlAsGa AsGa AsIn HgTeZn AsIn AsGa AlAsGa +thermoelectric Cd2GeO4 Cd2O4Pb Cd2GeO4 Cd2GeO4 AgO3Sb Cd2O4Pb Cd2GeO4 Cd2GeO4 AgO3Sb Cd2GeO4 Cd2O4Pb +thermoelectric CePt2Sn2 Al6Fe6Tb Al6ErFe6 Al6ErFe6 Al6Fe6Tb Al6ErFe6 Al6Fe6Tb Al6ErFe6 +thermoelectric thermoelectric GaS8V4 GaMo4S8 GaS8V4 GaMo4S8 AsGa GeSi AlAsGa Al3As10Ga7 +thermoelectric Al2O3 AlCrNi3 NiO Al2O3 N4Si3 N4Si3 GeSi3 H4Si H3Si +thermoelectric thermoelectric HgSe AsGa AlAsGa +thermoelectric thermoelectric F6LiP F6LiP F6LiP CoLiO2 LiNbO3 +thermoelectric In2O3 In2O3 In2O In2O ClNa ClNa ClK ClK BrK +thermoelectric SbU Be13U O4RuSr2 O4RuSr2 O4RuSr2 O4RuSr2 O4RuSr2 Pt3U CeRu2Si2 AlCePd +thermoelectric I3Sb Bi40Se3Te57 I3Sb Bi40Se3Te57 I3Sb NTi O2Si AsNP +thermoelectric Al21Fe109La10 Fe13Nd6Sn Fe13Pr6 Fe13Pr6Sn O2Ti NTi 
+thermoelectric Al21Fe4 Al7Mn2 Al7Mn2 +thermoelectric SeTl InSe2Tl SeTl PSe SeZn SeZn SeZn SeZn CdSeZn GaInN GaN +thermoelectric GeV BP98 GeV GeV +thermoelectric thermoelectric Ba2NaNb5O15 Ba2NaNb5O15 KO5PTi KTi K100O500P100Sn7Ti93 +thermoelectric thermoelectric Sb2Se3 As3S7 AsSe +thermoelectric thermoelectric Ba2CuHgO4 Ba2Cu3O7Y Ba2Cu3O7Y BCErNi B2CNi2Yb BCNiYb B2CNi2Yb +thermoelectric Li4Mn5O12 Li2O Li4O4Si Li4O4Si CSi CSi CSi +thermoelectric S2Ti LiS2Ti S2Ti MoSe2 S2W O5P2 O5P2 O5P2 Al2O3 O42S O42S +thermoelectric O2P CuFe2O4 thermoelectric GeSi GeSi GeSi GeSi GaN Cl3Ga H2O AlC3H9 O2Si CoF3K +thermoelectric S2Ti ILi LiNO3 LiMn2O4 LiMn2O4 MgO AlN NTi NTi2 +thermoelectric B6P thermoelectric thermoelectric Cd9Te10Zn CdTe CdTe +thermoelectric GaMo4Se4Te4 GaS8V4 GaMo4Se4Te4 GaS8V4 IrO4Sr2 O3Rh2 O3Rh2 MgO4Rh2 O3Rh2 CH4 BCl3 GaInN GaN +thermoelectric K3Mo10O30 K3Mo10O30 S3Ta S3Ta AlInN +thermoelectric CuNd2O4 SeZn SeZn Cd3Se20Zn17 CdSeZn SSeZn AsGa InP CdSeZn CdSeZn +thermoelectric thermoelectric thermoelectric GeSi GeSi CSi499 +thermoelectric CdCr2S4 Cr2MnS4 CdCr2S4 Cr2FeS4 Cr4CuFeS8 F4K2Ni AlCaO4Y AlO6 AlO6 O4Si CmO +thermoelectric thermoelectric AsGa Al7As20Ga13 Al8As25Ga17 AsGa +thermoelectric CaCu2Fe16O27 thermoelectric LaPdSn CePdSn CeNiSn CeRhSb CeNiSn CePdSn InPt2U +thermoelectric Fe2O3 Al3Nb AlNb2 AlNb2 FeNbP FePTi Fe2O3 +thermoelectric Mo3Te4 Mo3Se4 +thermoelectric thermoelectric FeNbO4 FeNbO4 CeO2 Gd2O3 O3Y2 GdMnO3 MnO3Y Cr2O3 C2Cr3 MoS2 Mo25S28 +thermoelectric Ga8Ge15Sr4 FeZn13 FeZn13 Fe2MgO4 Cr3La4MgO12 CrHO2 CoHO2 +thermoelectric thermoelectric O4RuSr2 Pt3U Ru2Si2U Pd2U Ru2U Ru2Si2U B6Sm +thermoelectric thermoelectric MgO KNbO3 Al2O3 CoFeNi CoFeNi GaInP InP AlAsIn AsGaIn AlAsGa AsGa +thermoelectric Bi8Ca5Pb2Sr10 Bi8Ca5Pb2Sr10 Bi8Ca5Pb2Sr10 thermoelectric AlAsGa AsGa AlGaInP AlGaInP AsGa +thermoelectric BaCuOY BaCuOY BaCuNdO O3SrTi CeO2 Bi2O3 BaO CO2 H4NO4S H4NO4S H4NO4S ClH O2Si O2Si +thermoelectric thermoelectric Ba2Cu3O7Y Ba2Cu3O7Y Ba2Cu3O7Y 
Bi2CaCu2O8Sr2 +thermoelectric thermoelectric Ni3S2 NiO4S Cl22Ni NiO4S AgNO3 AgNO3 CdHgTe +thermoelectric O9Pr5 O89Pr50 O4TiZr O3Y2 O2Zr O2Zr Fe2O3 +thermoelectric thermoelectric thermoelectric Bi4O12Ti3 O6Ti KO5PTi diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 0000000..ae50c0d --- /dev/null +++ b/test.ipynb @@ -0,0 +1,465 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/linglong/data/linglong/.conda/envs/acc/lib/python3.8/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.4\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n" + ] + } + ], + "source": [ + "from scipy import sparse\n", + "import numpy as np\n", + "import literature\n", + "\n", + "R = sparse.load_npz(\"data/thrm_vertex_matrix.npz\")\n", + "mats = np.array(open(\"data/thrm_mats.txt\", \"r\").read().splitlines())\n", + "props = [\"thermoelectric\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "yrs = np.loadtxt('data/thrm_years.txt')\n", + "R = R[(yrs>=1996)*(yrs<=2000),:]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "h = literature.hypergraph(R, mats, props)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "length = 20 # length of the walk\n", + "size = 1 # number of the walk\n", + "prop_ind = R.shape[1]-1 # column index of the property as the starting node " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(['thermoelectric CeGeNi a_886058 a_885358 Al10Ce10NiPd9 a_886058 a_885610 thermoelectric a_425231 a_524526 thermoelectric a_1710042 thermoelectric a_815535 
KO5PTi K2O Na2O GeO2 a_99281 GeO2'],\n", + " ['50739 50739 50737 50729 50729 52319 50739 9151 9151 9151 83553 83553 18658 18633 49949 19121 36848 646 646'])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "h.random_walk(length, size, start_inds=prop_ind, rand_seed=0) # uniform sampling\n", + "\n", + "# resulting in the following output: \n", + "# (the first array is the sequence of selected nodes; the second array is the selected papers along the walk):\n", + "# ---------------------\n", + "# (['thermoelectric a_1244326 a_1084770 a_1085357 CoCrFeMnNi a_281555 a_1076970 CSi a_10764 Al2O3\n", + "# K2O a_1672448 CaF2 a_460834 BaF2 a_638548 a_1287239 a_955446 a_955445 a_955447'],\n", + "# ['962469 1191497 746280 1191497 1421491 734403 1115449 132804 46832 1194889 1400463 1400463 23\n", + "# 2314 232314 894012 1035899 1035899 615755 1075096'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"rw_seqs.txt\", \"w\") as file:\n", + " for i in range(100):\n", + " rw_seqs = h.random_walk(length, size, start_inds=prop_ind, alpha=2, rand_seed=i)[0][0] # non-uniform sampling (alpha=2)\n", + " file.write(rw_seqs+'\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6759" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import utils \n", + "seqs = open(\"rw_seqs.txt\").read().splitlines() # reading the sequences\n", + "seqs_noauthors = utils.remove_authors_from_RW(seqs) # removing the author nodes\n", + "open(\"rw_seqs_noauthors.txt\", \"w\").write(\"\\n\".join(seqs_noauthors)+\"\\n\") # saving the pruned sequences" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-11-07 15:15:20,950 : 
INFO : Parsing lines (sentences) in: rw_seqs_noauthors.txt: \n", + "2023-11-07 15:15:20,951 : INFO : Parameters for parsing phrases are as follows:\n", + "2023-11-07 15:15:20,951 : INFO : \tdepth: 2\n", + "2023-11-07 15:15:20,952 : INFO : \tphrase_min_count: 10\n", + "2023-11-07 15:15:20,952 : INFO : \tphrase_threshold: 15\n", + "2023-11-07 15:15:20,953 : INFO : collecting all words and their counts\n", + "2023-11-07 15:15:20,953 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n", + "2023-11-07 15:15:20,955 : INFO : collected 1015 word types from a corpus of 842 words (unigram + bigrams) and 92 sentences\n", + "2023-11-07 15:15:20,956 : INFO : using 1015 counts as vocab in Phrases<0 vocab, min_count=10, threshold=15, max_vocab_size=40000000>\n", + "2023-11-07 15:15:20,956 : INFO : source_vocab length 1015\n", + "2023-11-07 15:15:20,962 : INFO : Phraser built with 0 phrasegrams\n", + "0it [00:00, ?it/s]\n", + "2023-11-07 15:15:20,967 : INFO : collecting all words and their counts\n", + "2023-11-07 15:15:20,968 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types\n", + "2023-11-07 15:15:20,972 : INFO : collected 1015 word types from a corpus of 842 words (unigram + bigrams) and 92 sentences\n", + "2023-11-07 15:15:20,972 : INFO : using 1015 counts as vocab in Phrases<0 vocab, min_count=10, threshold=15, max_vocab_size=40000000>\n", + "2023-11-07 15:15:20,973 : INFO : source_vocab length 1015\n", + "2023-11-07 15:15:20,983 : INFO : Phraser built with 0 phrasegrams\n", + "0it [00:00, ?it/s]\n", + "2023-11-07 15:15:20,987 : INFO : collecting all words and their counts\n", + "2023-11-07 15:15:20,988 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2023-11-07 15:15:20,993 : INFO : collected 380 word types from a corpus of 842 raw words and 92 sentences\n", + "2023-11-07 15:15:20,994 : INFO : Loading a fresh vocabulary\n", + "2023-11-07 15:15:20,995 : INFO : effective_min_count=5 retains 20 unique 
words (5% of original 380, drops 360)\n", + "2023-11-07 15:15:20,995 : INFO : effective_min_count=5 leaves 281 word corpus (33% of original 842, drops 561)\n", + "2023-11-07 15:15:20,996 : INFO : deleting the raw counts dictionary of 380 items\n", + "2023-11-07 15:15:20,997 : INFO : sample=0.0001 downsamples 20 most-common words\n", + "2023-11-07 15:15:20,997 : INFO : downsampling leaves estimated 11 word corpus (4.0% of prior 281)\n", + "2023-11-07 15:15:20,998 : INFO : constructing a huffman tree from 20 words\n", + "2023-11-07 15:15:20,999 : INFO : built huffman tree with maximum node depth 6\n", + "2023-11-07 15:15:21,000 : INFO : estimated required memory for 20 words and 200 dimensions: 62000 bytes\n", + "2023-11-07 15:15:21,000 : INFO : resetting layer weights\n", + "2023-11-07 15:15:21,006 : INFO : training model with 20 workers on 20 vocabulary and 200 features, using sg=1 hs=1 sample=0.0001 negative=15 window=8\n", + "2023-11-07 15:15:21,007 : INFO : training on a 0 raw words (0 effective words) took 0.0s, 0 effective words/s\n", + "2023-11-07 15:15:21,008 : WARNING : under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n", + "2023-11-07 15:15:21,009 : INFO : Training the model using the following parameters:\n", + "2023-11-07 15:15:21,009 : INFO : \tphrase_min_count: 10\n", + "2023-11-07 15:15:21,010 : INFO : \tsize: 200\n", + "2023-11-07 15:15:21,010 : INFO : \twindow: 8\n", + "2023-11-07 15:15:21,011 : INFO : \tmin_count: 5\n", + "2023-11-07 15:15:21,011 : INFO : \tsg: True\n", + "2023-11-07 15:15:21,012 : INFO : \ths: True\n", + "2023-11-07 15:15:21,013 : INFO : \tworkers: 20\n", + "2023-11-07 15:15:21,016 : INFO : \tnegative: 15\n", + "2023-11-07 15:15:21,017 : INFO : \tstart_alpha: 0.001\n", + "2023-11-07 15:15:21,017 : INFO : \tend_alpha: 0.0001\n", + "2023-11-07 15:15:21,018 : INFO : \tsubsample: 0.0001\n", + "2023-11-07 15:15:21,018 : INFO : \tbatch: 5000\n", + "2023-11-07 15:15:21,019 : INFO : \tepochs: 
5\n", + "2023-11-07 15:15:21,020 : INFO : The model will be saved in None\n", + "2023-11-07 15:15:21,020 : INFO : training model with 20 workers on 20 vocabulary and 200 features, using sg=1 hs=1 sample=0.0001 negative=15 window=8\n", + "2023-11-07 15:15:21,035 : INFO : worker thread finished; awaiting finish of 19 more threads\n", + "2023-11-07 15:15:21,036 : INFO : worker thread finished; awaiting finish of 18 more threads\n", + "2023-11-07 15:15:21,037 : INFO : worker thread finished; awaiting finish of 17 more threads\n", + "2023-11-07 15:15:21,038 : INFO : worker thread finished; awaiting finish of 16 more threads\n", + "2023-11-07 15:15:21,038 : INFO : worker thread finished; awaiting finish of 15 more threads\n", + "2023-11-07 15:15:21,039 : INFO : worker thread finished; awaiting finish of 14 more threads\n", + "2023-11-07 15:15:21,040 : INFO : worker thread finished; awaiting finish of 13 more threads\n", + "2023-11-07 15:15:21,041 : INFO : worker thread finished; awaiting finish of 12 more threads\n", + "2023-11-07 15:15:21,042 : INFO : worker thread finished; awaiting finish of 11 more threads\n", + "2023-11-07 15:15:21,043 : INFO : worker thread finished; awaiting finish of 10 more threads\n", + "2023-11-07 15:15:21,043 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2023-11-07 15:15:21,044 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2023-11-07 15:15:21,045 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2023-11-07 15:15:21,046 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2023-11-07 15:15:21,047 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2023-11-07 15:15:21,047 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2023-11-07 15:15:21,048 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2023-11-07 15:15:21,048 : INFO : worker thread finished; awaiting finish of 2 more 
threads\n", + "2023-11-07 15:15:21,049 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2023-11-07 15:15:21,049 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2023-11-07 15:15:21,050 : INFO : EPOCH - 1 : training on 842 raw words (12 effective words) took 0.0s, 549 effective words/s\n", + "2023-11-07 15:15:21,050 : INFO : 1 Epoch(s) done. Loss: 0.0, LR: 0.001\n", + "2023-11-07 15:15:21,068 : INFO : worker thread finished; awaiting finish of 19 more threads\n", + "2023-11-07 15:15:21,069 : INFO : worker thread finished; awaiting finish of 18 more threads\n", + "2023-11-07 15:15:21,070 : INFO : worker thread finished; awaiting finish of 17 more threads\n", + "2023-11-07 15:15:21,071 : INFO : worker thread finished; awaiting finish of 16 more threads\n", + "2023-11-07 15:15:21,072 : INFO : worker thread finished; awaiting finish of 15 more threads\n", + "2023-11-07 15:15:21,072 : INFO : worker thread finished; awaiting finish of 14 more threads\n", + "2023-11-07 15:15:21,073 : INFO : worker thread finished; awaiting finish of 13 more threads\n", + "2023-11-07 15:15:21,074 : INFO : worker thread finished; awaiting finish of 12 more threads\n", + "2023-11-07 15:15:21,075 : INFO : worker thread finished; awaiting finish of 11 more threads\n", + "2023-11-07 15:15:21,075 : INFO : worker thread finished; awaiting finish of 10 more threads\n", + "2023-11-07 15:15:21,076 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2023-11-07 15:15:21,077 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2023-11-07 15:15:21,077 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2023-11-07 15:15:21,078 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2023-11-07 15:15:21,079 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2023-11-07 15:15:21,079 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + 
"2023-11-07 15:15:21,080 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2023-11-07 15:15:21,081 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2023-11-07 15:15:21,081 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2023-11-07 15:15:21,082 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2023-11-07 15:15:21,082 : INFO : EPOCH - 2 : training on 842 raw words (16 effective words) took 0.0s, 657 effective words/s\n", + "2023-11-07 15:15:21,083 : INFO : 2 Epoch(s) done. Loss: 0.0, LR: 0.001\n", + "2023-11-07 15:15:21,104 : INFO : worker thread finished; awaiting finish of 19 more threads\n", + "2023-11-07 15:15:21,105 : INFO : worker thread finished; awaiting finish of 18 more threads\n", + "2023-11-07 15:15:21,106 : INFO : worker thread finished; awaiting finish of 17 more threads\n", + "2023-11-07 15:15:21,107 : INFO : worker thread finished; awaiting finish of 16 more threads\n", + "2023-11-07 15:15:21,108 : INFO : worker thread finished; awaiting finish of 15 more threads\n", + "2023-11-07 15:15:21,109 : INFO : worker thread finished; awaiting finish of 14 more threads\n", + "2023-11-07 15:15:21,110 : INFO : worker thread finished; awaiting finish of 13 more threads\n", + "2023-11-07 15:15:21,111 : INFO : worker thread finished; awaiting finish of 12 more threads\n", + "2023-11-07 15:15:21,111 : INFO : worker thread finished; awaiting finish of 11 more threads\n", + "2023-11-07 15:15:21,112 : INFO : worker thread finished; awaiting finish of 10 more threads\n", + "2023-11-07 15:15:21,113 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2023-11-07 15:15:21,114 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2023-11-07 15:15:21,115 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2023-11-07 15:15:21,116 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2023-11-07 
15:15:21,116 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2023-11-07 15:15:21,117 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2023-11-07 15:15:21,117 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2023-11-07 15:15:21,118 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2023-11-07 15:15:21,118 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2023-11-07 15:15:21,119 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2023-11-07 15:15:21,119 : INFO : EPOCH - 3 : training on 842 raw words (13 effective words) took 0.0s, 484 effective words/s\n", + "2023-11-07 15:15:21,120 : INFO : 3 Epoch(s) done. Loss: 58.624534606933594, LR: 0.001\n", + "2023-11-07 15:15:21,137 : INFO : worker thread finished; awaiting finish of 19 more threads\n", + "2023-11-07 15:15:21,138 : INFO : worker thread finished; awaiting finish of 18 more threads\n", + "2023-11-07 15:15:21,139 : INFO : worker thread finished; awaiting finish of 17 more threads\n", + "2023-11-07 15:15:21,140 : INFO : worker thread finished; awaiting finish of 16 more threads\n", + "2023-11-07 15:15:21,140 : INFO : worker thread finished; awaiting finish of 15 more threads\n", + "2023-11-07 15:15:21,141 : INFO : worker thread finished; awaiting finish of 14 more threads\n", + "2023-11-07 15:15:21,142 : INFO : worker thread finished; awaiting finish of 13 more threads\n", + "2023-11-07 15:15:21,143 : INFO : worker thread finished; awaiting finish of 12 more threads\n", + "2023-11-07 15:15:21,143 : INFO : worker thread finished; awaiting finish of 11 more threads\n", + "2023-11-07 15:15:21,144 : INFO : worker thread finished; awaiting finish of 10 more threads\n", + "2023-11-07 15:15:21,145 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2023-11-07 15:15:21,146 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2023-11-07 
15:15:21,146 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2023-11-07 15:15:21,147 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2023-11-07 15:15:21,147 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2023-11-07 15:15:21,148 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2023-11-07 15:15:21,148 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2023-11-07 15:15:21,149 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2023-11-07 15:15:21,149 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2023-11-07 15:15:21,150 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2023-11-07 15:15:21,150 : INFO : EPOCH - 4 : training on 842 raw words (11 effective words) took 0.0s, 478 effective words/s\n", + "2023-11-07 15:15:21,151 : INFO : 4 Epoch(s) done. Loss: 29.71660614013672, LR: 0.001\n", + "2023-11-07 15:15:21,172 : INFO : worker thread finished; awaiting finish of 19 more threads\n", + "2023-11-07 15:15:21,175 : INFO : worker thread finished; awaiting finish of 18 more threads\n", + "2023-11-07 15:15:21,176 : INFO : worker thread finished; awaiting finish of 17 more threads\n", + "2023-11-07 15:15:21,177 : INFO : worker thread finished; awaiting finish of 16 more threads\n", + "2023-11-07 15:15:21,179 : INFO : worker thread finished; awaiting finish of 15 more threads\n", + "2023-11-07 15:15:21,179 : INFO : worker thread finished; awaiting finish of 14 more threads\n", + "2023-11-07 15:15:21,180 : INFO : worker thread finished; awaiting finish of 13 more threads\n", + "2023-11-07 15:15:21,181 : INFO : worker thread finished; awaiting finish of 12 more threads\n", + "2023-11-07 15:15:21,181 : INFO : worker thread finished; awaiting finish of 11 more threads\n", + "2023-11-07 15:15:21,182 : INFO : worker thread finished; awaiting finish of 10 more threads\n", + "2023-11-07 
15:15:21,182 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2023-11-07 15:15:21,183 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2023-11-07 15:15:21,183 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2023-11-07 15:15:21,184 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2023-11-07 15:15:21,184 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2023-11-07 15:15:21,184 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2023-11-07 15:15:21,185 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2023-11-07 15:15:21,185 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2023-11-07 15:15:21,186 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2023-11-07 15:15:21,186 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2023-11-07 15:15:21,187 : INFO : EPOCH - 5 : training on 842 raw words (12 effective words) took 0.0s, 460 effective words/s\n", + "2023-11-07 15:15:21,187 : INFO : 5 Epoch(s) done. 
Loss: 55.20796203613281, LR: 0.001\n", + "2023-11-07 15:15:21,190 : INFO : training on a 4210 raw words (64 effective words) took 0.2s, 379 effective words/s\n", + "2023-11-07 15:15:21,190 : WARNING : under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay\n" + ] + } + ], + "source": [ + "seqs_noauthor_path = \"rw_seqs_noauthors.txt\"\n", + "\n", + "import embedding\n", + "embed = embedding.dww2v(seqs_noauthor_path, workers=20) # initiating deepwalk model with a different value for parameter workers\n", + "embed.build_model()\n", + "embed.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "sims,_,reordered_mats = embed.similarities(['thermoelectric'], mats, return_nan=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "full_R = sparse.load_npz(\"data/thrm_vertex_matrix.npz\")\n", + "subgraph_R = full_R[yrs<=2000]\n", + "studied_mats = mats[np.asarray(np.sum(subgraph_R[:,h.nA:-1].multiply(subgraph_R[:,-1]), axis=0)>0)[0,:]]\n", + "candidate_mats = mats[~np.isin(mats,studied_mats)]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['F6S', 'H2O', 'O2Si', ..., 'Si12Zr13', 'Si29Zr21', 'AgGe4SbTe6'],\n", + " dtype='