diff --git a/docs/desc-0000-qp-photo-z_approximation/acknowledgments.tex b/docs/desc-0000-qp-photo-z_approximation/acknowledgments.tex index ff2992ea..b9ff95c0 100644 --- a/docs/desc-0000-qp-photo-z_approximation/acknowledgments.tex +++ b/docs/desc-0000-qp-photo-z_approximation/acknowledgments.tex @@ -1,3 +1,3 @@ -% -This is the text imported from \code{acknowledgments.tex}, and will be replaced by some standard LSST DESC boilerplate at some point. -% +% +%This is the text imported from \code{acknowledgments.tex}, and will be replaced by some standard LSST DESC boilerplate at some point. +% diff --git a/docs/desc-0000-qp-photo-z_approximation/authors.csv b/docs/desc-0000-qp-photo-z_approximation/authors.csv index 0e2feed8..5966aab1 100644 --- a/docs/desc-0000-qp-photo-z_approximation/authors.csv +++ b/docs/desc-0000-qp-photo-z_approximation/authors.csv @@ -31,7 +31,7 @@ Lastname,Firstname,Authorname,AuthorType,Affiliation,Contribution,Email Malz,Alex,A.I.~Malz,Contact,"Center for Cosmology and Particle Physics, New York University, 726 Broadway, New York, 10003","Initiated project, led development work.",aimalz@nyu.edu Marshall,Phil,P.J.~Marshall,Contributor,"SLAC National Accelerator Laboratory, Menlo Park, CA 94025, USA","Advised on statistics, and project design and management.",dr.phil.marshall@gmail.com -Schmidt,Samuel,S.J.~Schmidt,Contributor,"Dept. of Physics, University of California, One Shields Ave., Davis, CA, 95616","Provided the lower quality data mock catalog.",sschmidt@physics.usdavis.edu -Graham,Melissa,M.~Graham,Contributor,"University of Washington","Provided the higher quality data mock catalog.",mlg3k@uw.edu -DeRose,Joe,J.~DeRose,Contributor,"Stanford University","Contributed to the production of the lower quality data mock catalog.",jderose@stanford.edu -Wechsler,Risa,R.~Wechsler,Contributor,"Stanford University","Contributed to the production of the lower quality data mock catalog.",rwechsler@stanford.edu +Schmidt,Samuel,S.J.~Schmidt,Contributor,"Dept. of Physics, University of California, One Shields Ave., Davis, CA, 95616","Produced the PDFs for the fainter mock catalog.",sschmidt@physics.ucdavis.edu +Graham,Melissa,M.L.~Graham,Contributor,"Department of Astronomy, University of Washington, Box 351580 U.W., Seattle, WA 98195-1580","Produced the photometry and PDFs for the brighter mock catalog.",mlg3k@uw.edu +DeRose,Joe,J.~DeRose,Contributor,"Stanford University","Produced the photometry for the fainter mock catalog.",jderose@stanford.edu +Wechsler,Risa,R.~Wechsler,Contributor,"Stanford University","Produced the photometry for the fainter mock catalog.",rwechsler@stanford.edu diff --git a/docs/desc-0000-qp-photo-z_approximation/authors.tex b/docs/desc-0000-qp-photo-z_approximation/authors.tex index b86cf629..ee4961f5 100644 --- a/docs/desc-0000-qp-photo-z_approximation/authors.tex +++ b/docs/desc-0000-qp-photo-z_approximation/authors.tex @@ -7,8 +7,8 @@ \affiliation{SLAC National Accelerator Laboratory, Menlo Park, CA 94025, USA} \author{S.J.~Schmidt} \affiliation{Dept.
of Physics, University of California, One Shields Ave., Davis, CA, 95616} -\author{M.~Graham} -\affiliation{University of Washington} +\author{M.L.~Graham} +\affiliation{Department of Astronomy, University of Washington, Box 351580 U.W., Seattle, WA 98195-1580} \author{J.~DeRose} \affiliation{Stanford University} \author{R.~Wechsler} diff --git a/docs/desc-0000-qp-photo-z_approximation/contributions.tex b/docs/desc-0000-qp-photo-z_approximation/contributions.tex index a0e01b03..6150a71a 100644 --- a/docs/desc-0000-qp-photo-z_approximation/contributions.tex +++ b/docs/desc-0000-qp-photo-z_approximation/contributions.tex @@ -1,7 +1,7 @@ Author contributions are listed below. \\ A.I.~Malz: Initiated project, led development work. \\ P.J.~Marshall: Advised on statistics, and project design and management. \\ -S.J.~Schmidt: Provided the Optical dataset. \\ -M.~Graham: Provided the Optical+IR dataset. \\ -J.~DeRose: Contributed to the production of the Optical dataset. \\ -R.~Wechsler: Contributed to the production of the Optical dataset. \\ +S.J.~Schmidt: Produced the PDFs for the fainter mock catalog. \\ +M.L.~Graham: Produced the photometry and PDFs for the brighter mock catalog. \\ +J.~DeRose: Produced the photometry for the fainter mock catalog. \\ +R.~Wechsler: Produced the photometry for the fainter mock catalog. \\ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/demo_pz.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/demo_pz.pdf new file mode 100644 index 00000000..4bf7a231 Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/demo_pz.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/demo_pz.png b/docs/desc-0000-qp-photo-z_approximation/figures/demo_pz.png deleted file mode 100644 index d04ba035..00000000 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/demo_pz.png and /dev/null differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/graham_kld.png b/docs/desc-0000-qp-photo-z_approximation/figures/graham_kld.png deleted file mode 100644 index d8f4a948..00000000 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/graham_kld.png and /dev/null differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/graham_moments.png b/docs/desc-0000-qp-photo-z_approximation/figures/graham_moments.png deleted file mode 100644 index 0b85ecb1..00000000 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/graham_moments.png and /dev/null differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_err.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_err.pdf new file mode 100644 index 00000000..51a2c0e5 Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_err.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_kld.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_kld.pdf new file mode 100644 index 00000000..1450d10a Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/graham_nz_kld.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_err.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_err.pdf new file mode 100644 index 00000000..6567952e Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_err.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_kld.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_kld.pdf new file mode 100644 index 
00000000..92560de7 Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/graham_pz_kld.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/graham_pzs.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/graham_pzs.pdf new file mode 100644 index 00000000..1fccaf5b Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/graham_pzs.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/graham_pzs.png b/docs/desc-0000-qp-photo-z_approximation/figures/graham_pzs.png deleted file mode 100644 index 982d1bd3..00000000 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/graham_pzs.png and /dev/null differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/graham_stacked.png b/docs/desc-0000-qp-photo-z_approximation/figures/graham_stacked.png deleted file mode 100644 index dbd0966f..00000000 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/graham_stacked.png and /dev/null differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/individual.png b/docs/desc-0000-qp-photo-z_approximation/figures/individual.png deleted file mode 100644 index ad493fa9..00000000 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/individual.png and /dev/null differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/individual_kld.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/individual_kld.pdf new file mode 100644 index 00000000..9b403af1 Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/individual_kld.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/kld_precision.png b/docs/desc-0000-qp-photo-z_approximation/figures/kld_precision.png deleted file mode 100644 index 8d47ba36..00000000 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/kld_precision.png and /dev/null differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/kld_tension.png b/docs/desc-0000-qp-photo-z_approximation/figures/kld_tension.png deleted file mode 100644 index ab85f45c..00000000 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/kld_tension.png and /dev/null differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/precision.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/precision.pdf new file mode 100644 index 00000000..a4fc208f Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/precision.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_kld.png b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_kld.png deleted file mode 100644 index b2613294..00000000 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_kld.png and /dev/null differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_moments.png b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_moments.png deleted file mode 100644 index cd32090e..00000000 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_moments.png and /dev/null differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_err.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_err.pdf new file mode 100644 index 00000000..4bc01f7b Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_err.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_kld.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_kld.pdf new file mode 100644 index 00000000..17f42967 
Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_nz_kld.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_err.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_err.pdf new file mode 100644 index 00000000..b7abcdc3 Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_err.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_kld.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_kld.pdf new file mode 100644 index 00000000..382fdeb7 Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pz_kld.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pzs.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pzs.pdf new file mode 100644 index 00000000..fec99d00 Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/schmidt_pzs.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/stacked.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/stacked.pdf new file mode 100644 index 00000000..3ccae76a Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/stacked.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/stacked.png b/docs/desc-0000-qp-photo-z_approximation/figures/stacked.png deleted file mode 100644 index e5e57379..00000000 Binary files a/docs/desc-0000-qp-photo-z_approximation/figures/stacked.png and /dev/null differ diff --git a/docs/desc-0000-qp-photo-z_approximation/figures/tension.pdf b/docs/desc-0000-qp-photo-z_approximation/figures/tension.pdf new file mode 100644 index 00000000..0cb50501 Binary files /dev/null and b/docs/desc-0000-qp-photo-z_approximation/figures/tension.pdf differ diff --git a/docs/desc-0000-qp-photo-z_approximation/main.bib b/docs/desc-0000-qp-photo-z_approximation/main.bib index 1a96a0d8..ab19775d 100644 --- a/docs/desc-0000-qp-photo-z_approximation/main.bib +++ b/docs/desc-0000-qp-photo-z_approximation/main.bib @@ -14,43 +14,25 @@ @article{carrasco_kind_sparse_2014 pages = {3550--3561}, } -@article{dark_energy_survey_collaboration_redshift_2016, - title = {Redshift distributions of galaxies in the {Dark} {Energy} {Survey} {Science} {Verification} shear catalogue and implications for weak lensing}, - volume = {94}, - url = {https://link.aps.org/doi/10.1103/PhysRevD.94.042005}, - doi = {10.1103/PhysRevD.94.042005}, - number = {4}, - urldate = {2017-07-19}, - journal = {Phys. Rev. D}, - author = {{Dark Energy Survey Collaboration} and Bonnett, C. and Troxel, M. A. and Hartley, W. and Amara, A. and Leistedt, B. and Becker, M. R. and Bernstein, G. M. and Bridle, S. L. and Bruderer, C. and Busha, M. T. and Carrasco Kind, M. and Childress, M. J. and Castander, F. J. and Chang, C. and Crocce, M. and Davis, T. M. and Eifler, T. F. and Frieman, J. and Gangkofner, C. and Gaztanaga, E. and Glazebrook, K. and Gruen, D. and Kacprzak, T. and King, A. and Kwan, J. and Lahav, O. and Lewis, G. and Lidman, C. and Lin, H. and MacCrann, N. and Miquel, R. and O’Neill, C. R. and Palmese, A. and Peiris, H. V. and Refregier, A. and Rozo, E. and Rykoff, E. S. and Sadeh, I. and Sánchez, C. and Sheldon, E. and Uddin, S. and Wechsler, R. H. and Zuntz, J. and Abbott, T. and Abdalla, F. B. and Allam, S. and Armstrong, R. and Banerji, M. and Bauer, A. H. and Benoit-Lévy, A. and Bertin, E. and Brooks, D. and Buckley-Geer, E. and Burke, D. L. and Capozzi, D. and Carnero Rosell, A. 
and Carretero, J. and Cunha, C. E. and D’Andrea, C. B. and da Costa, L. N. and DePoy, D. L. and Desai, S. and Diehl, H. T. and Dietrich, J. P. and Doel, P. and Fausti Neto, A. and Fernandez, E. and Flaugher, B. and Fosalba, P. and Gerdes, D. W. and Gruendl, R. A. and Honscheid, K. and Jain, B. and James, D. J. and Jarvis, M. and Kim, A. G. and Kuehn, K. and Kuropatkin, N. and Li, T. S. and Lima, M. and Maia, M. A. G. and March, M. and Marshall, J. L. and Martini, P. and Melchior, P. and Miller, C. J. and Neilsen, E. and Nichol, R. C. and Nord, B. and Ogando, R. and Plazas, A. A. and Reil, K. and Romer, A. K. and Roodman, A. and Sako, M. and Sanchez, E. and Santiago, B. and Smith, R. C. and Soares-Santos, M. and Sobreira, F. and Suchyta, E. and Swanson, M. E. C. and Tarle, G. and Thaler, J. and Thomas, D. and Vikram, V. and Walker, A. R.}, - month = aug, - year = {2016}, - pages = {042005}, -} - @article{krause_dark_2017, title = {Dark {Energy} {Survey} {Year} 1 {Results}: {Multi}-{Probe} {Methodology} and {Simulated} {Likelihood} {Analyses}}, shorttitle = {Dark {Energy} {Survey} {Year} 1 {Results}}, url = {http://arxiv.org/abs/1706.09359}, - urldate = {2017-07-24}, journal = {arXiv:1706.09359 [astro-ph]}, author = {Krause, E. and Eifler, T. F. and Zuntz, J. and Friedrich, O. and Troxel, M. A. and Dodelson, S. and Blazek, J. and Secco, L. F. and MacCrann, N. and Baxter, E. and Chang, C. and Chen, N. and Crocce, M. and DeRose, J. and Ferte, A. and Kokron, N. and Lacasa, F. and Miranda, V. and Omori, Y. and Porredon, A. and Rosenfeld, R. and Samuroff, S. and Wang, M. and Wechsler, R. H. and Abbott, T. M. C. and Abdalla, F. B. and Allam, S. and Annis, J. and Bechtol, K. and Benoit-Levy, A. and Bernstein, G. M. and Brooks, D. and Burke, D. L. and Capozzi, D. and Kind, M. Carrasco and Carretero, J. and D'Andrea, C. B. and da Costa, L. N. and Davis, C. and DePoy, D. L. and Desai, S. and Diehl, H. T. and Dietrich, J. P. and Evrard, A. E. and Flaugher, B. and Fosalba, P. and Frieman, J. and Garcia-Bellido, J. and Gaztanaga, E. and Giannantonio, T. and Gruen, D. and Gruendl, R. A. and Gschwend, J. and Gutierrez, G. and Honscheid, K. and James, D. J. and Jeltema, T. and Kuehn, K. and Kuhlmann, S. and Lahav, O. and Lima, M. and Maia, M. A. G. and March, M. and Marshall, J. L. and Martini, P. and Menanteau, F. and Miquel, R. and Nichol, R. C. and Plazas, A. A. and Romer, A. K. and Rykoff, E. S. and Sanchez, E. and Scarpine, V. and Schindler, R. and Schubnell, M. and Sevilla-Noarbe, I. and Smith, M. and Soares-Santos, M. and Sobreira, F. and Suchyta, E. and Swanson, M. E. C. and Tarle, G. and Tucker, D. L. and Vikram, V. and Walker, A. R. and Weller, J.}, month = jun, year = {2017}, note = {arXiv: 1706.09359}, - keywords = {Astrophysics - Cosmology and Nongalactic Astrophysics}, } @article{tanaka_photometric_2017, title = {Photometric {Redshifts} for {Hyper} {Suprime}-{Cam} {Subaru} {Strategic} {Program} {Data} {Release} 1}, url = {http://arxiv.org/abs/1704.05988}, - urldate = {2017-07-29}, journal = {arXiv:1704.05988 [astro-ph]}, author = {Tanaka, Masayuki and Coupon, Jean and Hsieh, Bau-Ching and Mineo, Sogo and Nishizawa, Atsushi J. 
and Speagle, Joshua and Furusawa, Hisanori and Miyazaki, Satoshi and Murayama, Hitoshi}, month = apr, year = {2017}, note = {arXiv: 1704.05988}, - keywords = {Astrophysics - Astrophysics of Galaxies}, } @article{benitez_bayesian_2000, @@ -61,9 +43,8 @@ @article{benitez_bayesian_2000 doi = {10.1086/308947}, language = {en}, number = {2}, - urldate = {2017-07-30}, journal = {ApJ}, - author = {Benítez, Narciso}, + author = {Ben{\'i}tez, Narciso}, year = {2000}, pages = {571}, } @@ -76,7 +57,6 @@ @article{sheldon_photometric_2012 doi = {10.1088/0067-0049/201/2/32}, language = {en}, number = {2}, - urldate = {2017-07-31}, journal = {ApJS}, author = {Sheldon, Erin S. and Cunha, Carlos E. and Mandelbaum, Rachel and Brinkmann, J. and Weaver, Benjamin A.}, year = {2012}, @@ -97,35 +77,6 @@ @article{myers_incorporating_2009 pages = {2279--2287}, } -@inproceedings{connolly_end--end_2014, - series = {Society of {Photo}-{Optical} {Instrumentation} {Engineers} ({SPIE}) {Conference} {Series}}, - title = {An end-to-end simulation framework for the {Large} {Synoptic} {Survey} {Telescope}}, - volume = {9150}, - url = {http://dx.doi.org/10.1117/12.2054953}, - doi = {10.1117/12.2054953}, - urldate = {2017-08-04}, - booktitle = {Modeling, {Systems} {Engineering}, and {Project} {Management} for {Astronomy} {VI}}, - author = {Connolly, Andrew J. and Angeli, George Z. and Chandrasekharan, Srinivasan and Claver, Charles F. and Cook, Kem and Ivezic, Zeljko and Jones, R. Lynne and Krughoff, K. Simon and Peng, En-Hsin and Peterson, John and Petry, Catherine and Rasmussen, Andrew P. and Ridgway, Stephen T. and Saha, Abhijit and Sembroski, Glenn and vanderPlas, Jacob and Yoachim, Peter}, - year = {2014}, - pages = {915014--915014--8} -} - -@article{springel_simulations_2005, - title = {Simulations of the formation, evolution and clustering of galaxies and quasars}, - volume = {435}, - copyright = {© 2005 Nature Publishing Group}, - issn = {0028-0836}, - url = {http://www.nature.com/nature/journal/v435/n7042/full/nature03597.html?foxtrotcallback=true}, - doi = {10.1038/nature03597}, - language = {en}, - number = {7042}, - journal = {Nature}, - author = {Springel, Volker and White, Simon D. M. and Jenkins, Adrian and Frenk, Carlos S. and Yoshida, Naoki and Gao, Liang and Navarro, Julio and Thacker, Robert and Croton, Darren and Helly, John and Peacock, John A. and Cole, Shaun and Thomas, Peter and Couchman, Hugh and Evrard, August and Colberg, Jörg and Pearce, Frazer}, - month = jun, - year = {2005}, - pages = {629--636} -} - @article{gonzalez-perez_how_2014, title = {How sensitive are predicted galaxy luminosities to the choice of stellar population synthesis model?}, volume = {439}, @@ -144,17 +95,15 @@ @article{ivezic_lsst:_2008 title = {{LSST}: from {Science} {Drivers} to {Reference} {Design} and {Anticipated} {Data} {Products}}, shorttitle = {{LSST}}, url = {http://arxiv.org/abs/0805.2366}, - urldate = {2017-08-04}, journal = {arXiv:0805.2366 [astro-ph]}, - author = {Ivezic, Z. and Tyson, J. A. and Abel, B. and Acosta, E. and Allsman, R. and AlSayyad, Y. and Anderson, S. F. and Andrew, J. and Angel, R. and Angeli, G. and Ansari, R. and Antilogus, P. and Arndt, K. T. and Astier, P. and Aubourg, E. and Axelrod, T. and Bard, D. J. and Barr, J. D. and Barrau, A. and Bartlett, J. G. and Bauman, B. J. and Beaumont, S. and Becker, A. C. and Becla, J. and Beldica, C. and Bellavia, S. and Blanc, G. and Blandford, R. D. and Bloom, J. S. and Bogart, J. and Borne, K. and Bosch, J. F. and Boutigny, D. and Brandt, W. N. 
and Brown, M. E. and Bullock, J. S. and Burchat, P. and Burke, D. L. and Cagnoli, G. and Calabrese, D. and Chandrasekharan, S. and Chesley, S. and Cheu, E. C. and Chiang, J. and Claver, C. F. and Connolly, A. J. and Cook, K. H. and Cooray, A. and Covey, K. R. and Cribbs, C. and Cui, W. and Cutri, R. and Daubard, G. and Daues, G. and Delgado, F. and Digel, S. and Doherty, P. and Dubois, R. and Dubois-Felsmann, G. P. and Durech, J. and Eracleous, M. and Ferguson, H. and Frank, J. and Freemon, M. and Gangler, E. and Gawiser, E. and Geary, J. C. and Gee, P. and Geha, M. and Gibson, R. R. and Gilmore, D. K. and Glanzman, T. and Goodenow, I. and Gressler, W. J. and Gris, P. and Guyonnet, A. and Hascall, P. A. and Haupt, J. and Hernandez, F. and Hogan, C. and Huang, D. and Huffer, M. E. and Innes, W. R. and Jacoby, S. H. and Jain, B. and Jee, J. and Jernigan, J. G. and Jevremovic, D. and Johns, K. and Jones, R. L. and Juramy-Gilles, C. and Juric, M. and Kahn, S. M. and Kalirai, J. S. and Kallivayalil, N. and Kalmbach, B. and Kantor, J. P. and Kasliwal, M. M. and Kessler, R. and Kirkby, D. and Knox, L. and Kotov, I. and Krabbendam, V. L. and Krughoff, S. and Kubanek, P. and Kuczewski, J. and Kulkarni, S. and Lambert, R. and Guillou, L. Le and Levine, D. and Liang, M. and Lim, K.-T. and Lintott, C. and Lupton, R. H. and Mahabal, A. and Marshall, P. and Marshall, S. and May, M. and McKercher, R. and Migliore, M. and Miller, M. and Mills, D. J. and Monet, D. G. and Moniez, M. and Neill, D. R. and Nief, J.-Y. and Nomerotski, A. and Nordby, M. and O'Connor, P. and Oliver, J. and Olivier, S. S. and Olsen, K. and Ortiz, S. and Owen, R. E. and Pain, R. and Peterson, J. R. and Petry, C. E. and Pierfederici, F. and Pietrowicz, S. and Pike, R. and Pinto, P. A. and Plante, R. and Plate, S. and Price, P. A. and Prouza, M. and Radeka, V. and Rajagopal, J. and Rasmussen, A. and Regnault, N. and Ridgway, S. T. and Ritz, S. and Rosing, W. and Roucelle, C. and Rumore, M. R. and Russo, S. and Saha, A. and Sassolas, B. and Schalk, T. L. and Schindler, R. H. and Schneider, D. P. and Schumacher, G. and Sebag, J. and Sembroski, G. H. and Seppala, L. G. and Shipsey, I. and Silvestri, N. and Smith, J. A. and Smith, R. C. and Strauss, M. A. and Stubbs, C. W. and Sweeney, D. and Szalay, A. and Takacs, P. and Thaler, J. J. and Van Berg, R. and Berk, D. Vanden and Vetter, K. and Virieux, F. and Xin, B. and Walkowicz, L. and Walter, C. W. and Wang, D. L. and Warner, M. and Willman, B. and Wittman, D. and Wolff, S. C. and Wood-Vasey, W. M. and Yoachim, P. and Zhan, H. and Collaboration, for the LSST}, + author = {Ivezi{\'c}, {\v Z}eljko and Tyson, J. A. and Abel, B. and Acosta, E. and Allsman, R. and AlSayyad, Y. and Anderson, S. F. and Andrew, J. and Angel, R. and Angeli, G. and Ansari, R. and Antilogus, P. and Arndt, K. T. and Astier, P. and Aubourg, E. and Axelrod, T. and Bard, D. J. and Barr, J. D. and Barrau, A. and Bartlett, J. G. and Bauman, B. J. and Beaumont, S. and Becker, A. C. and Becla, J. and Beldica, C. and Bellavia, S. and Blanc, G. and Blandford, R. D. and Bloom, J. S. and Bogart, J. and Borne, K. and Bosch, J. F. and Boutigny, D. and Brandt, W. N. and Brown, M. E. and Bullock, J. S. and Burchat, P. and Burke, D. L. and Cagnoli, G. and Calabrese, D. and Chandrasekharan, S. and Chesley, S. and Cheu, E. C. and Chiang, J. and Claver, C. F. and Connolly, A. J. and Cook, K. H. and Cooray, A. and Covey, K. R. and Cribbs, C. and Cui, W. and Cutri, R. and Daubard, G. and Daues, G. and Delgado, F. and Digel, S. 
and Doherty, P. and Dubois, R. and Dubois-Felsmann, G. P. and Durech, J. and Eracleous, M. and Ferguson, H. and Frank, J. and Freemon, M. and Gangler, E. and Gawiser, E. and Geary, J. C. and Gee, P. and Geha, M. and Gibson, R. R. and Gilmore, D. K. and Glanzman, T. and Goodenow, I. and Gressler, W. J. and Gris, P. and Guyonnet, A. and Hascall, P. A. and Haupt, J. and Hernandez, F. and Hogan, C. and Huang, D. and Huffer, M. E. and Innes, W. R. and Jacoby, S. H. and Jain, B. and Jee, J. and Jernigan, J. G. and Jevremovic, D. and Johns, K. and Jones, R. L. and Juramy-Gilles, C. and Juric, M. and Kahn, S. M. and Kalirai, J. S. and Kallivayalil, N. and Kalmbach, B. and Kantor, J. P. and Kasliwal, M. M. and Kessler, R. and Kirkby, D. and Knox, L. and Kotov, I. and Krabbendam, V. L. and Krughoff, S. and Kubanek, P. and Kuczewski, J. and Kulkarni, S. and Lambert, R. and Guillou, L. Le and Levine, D. and Liang, M. and Lim, K.-T. and Lintott, C. and Lupton, R. H. and Mahabal, A. and Marshall, P. and Marshall, S. and May, M. and McKercher, R. and Migliore, M. and Miller, M. and Mills, D. J. and Monet, D. G. and Moniez, M. and Neill, D. R. and Nief, J.-Y. and Nomerotski, A. and Nordby, M. and O'Connor, P. and Oliver, J. and Olivier, S. S. and Olsen, K. and Ortiz, S. and Owen, R. E. and Pain, R. and Peterson, J. R. and Petry, C. E. and Pierfederici, F. and Pietrowicz, S. and Pike, R. and Pinto, P. A. and Plante, R. and Plate, S. and Price, P. A. and Prouza, M. and Radeka, V. and Rajagopal, J. and Rasmussen, A. and Regnault, N. and Ridgway, S. T. and Ritz, S. and Rosing, W. and Roucelle, C. and Rumore, M. R. and Russo, S. and Saha, A. and Sassolas, B. and Schalk, T. L. and Schindler, R. H. and Schneider, D. P. and Schumacher, G. and Sebag, J. and Sembroski, G. H. and Seppala, L. G. and Shipsey, I. and Silvestri, N. and Smith, J. A. and Smith, R. C. and Strauss, M. A. and Stubbs, C. W. and Sweeney, D. and Szalay, A. and Takacs, P. and Thaler, J. J. and Van Berg, R. and Berk, D. Vanden and Vetter, K. and Virieux, F. and Xin, B. and Walkowicz, L. and Walter, C. W. and Wang, D. L. and Warner, M. and Willman, B. and Wittman, D. and Wolff, S. C. and Wood-Vasey, W. M. and Yoachim, P. and Zhan, H. and Collaboration, for the LSST}, month = may, year = {2008}, note = {arXiv: 0805.2366}, - keywords = {Astrophysics}, } @article{merson_lightcone_2013, - title = {Lightcone mock catalogues from semi-analytic models of galaxy formation – {I}. {Construction} and application to the {BzK} colour selection}, + title = {Lightcone mock catalogues from semi-analytic models of galaxy formation {\textendash} {I}. {Construction} and application to the {BzK} colour selection}, volume = {429}, issn = {0035-8711}, url = {https://academic.oup.com/mnras/article/429/1/556/1023267/Lightcone-mock-catalogues-from-semi-analytic}, @@ -170,14 +119,14 @@ @article{merson_lightcone_2013 @article{ilbert_accurate_2006, title = {Accurate photometric redshifts for the {CFHT} legacy survey calibrated using the {VIMOS} {VLT} deep survey}, volume = {457}, - copyright = {© ESO, 2006}, + copyright = {{\textcopyright} ESO, 2006}, issn = {0004-6361, 1432-0746}, url = {https://doi.org/10.1051/0004-6361:20065138}, doi = {10.1051/0004-6361:20065138}, language = {en}, number = {3}, journal = {A\&A}, - author = {Ilbert, O. and Arnouts, S. and McCracken, H. J. and Bolzonella, M. and Bertin, E. and Fèvre, O. Le and Mellier, Y. and Zamorani, G. and Pellò, R. and Iovino, A. and Tresse, L. and Brun, V. Le and Bottini, D. and Garilli, B. and Maccagni, D. 
and Picat, J. P. and Scaramella, R. and Scodeggio, M. and Vettolani, G. and Zanichelli, A. and Adami, C. and Bardelli, S. and Cappi, A. and Charlot, S. and Ciliegi, P. and Contini, T. and Cucciati, O. and Foucaud, S. and Franzetti, P. and Gavignaud, I. and Guzzo, L. and Marano, B. and Marinoni, C. and Mazure, A. and Meneux, B. and Merighi, R. and Paltani, S. and Pollo, A. and Pozzetti, L. and Radovich, M. and Zucca, E. and Bondi, M. and Bongiorno, A. and Busarello, G. and Torre, S. De La and Gregorini, L. and Lamareille, F. and Mathez, G. and Merluzzi, P. and Ripepi, V. and Rizzo, D. and Vergani, D.}, + author = {Ilbert, O. and Arnouts, S. and McCracken, H. J. and Bolzonella, M. and Bertin, E. and F{\`e}vre, O. Le and Mellier, Y. and Zamorani, G. and Pell{\`o}, R. and Iovino, A. and Tresse, L. and Brun, V. Le and Bottini, D. and Garilli, B. and Maccagni, D. and Picat, J. P. and Scaramella, R. and Scodeggio, M. and Vettolani, G. and Zanichelli, A. and Adami, C. and Bardelli, S. and Cappi, A. and Charlot, S. and Ciliegi, P. and Contini, T. and Cucciati, O. and Foucaud, S. and Franzetti, P. and Gavignaud, I. and Guzzo, L. and Marano, B. and Marinoni, C. and Mazure, A. and Meneux, B. and Merighi, R. and Paltani, S. and Pollo, A. and Pozzetti, L. and Radovich, M. and Zucca, E. and Bondi, M. and Bongiorno, A. and Busarello, G. and Torre, S. De La and Gregorini, L. and Lamareille, F. and Mathez, G. and Merluzzi, P. and Ripepi, V. and Rizzo, D. and Vergani, D.}, month = oct, year = {2006}, pages = {841--856}, @@ -187,9 +136,429 @@ @misc{juric_data_2017 title = {Data {Products} {Definition} {Document}}, shorttitle = {{LSE}-163}, url = {https://docushare.lsstcorp.org/docushare/dsweb/Get/LSE-163/}, - urldate = {2017-08-05}, journal = {LSST Corporation}, - author = {Juric, M. and Axelrod, T. and Becker, A. C. and Becla, J. and Bellm, E. and Bosch, J. F. and Ciardi, D. and Connolly, A. J. and Dubois-Felsmann, G. P. and Economou, F. and Freemon, M. and Gelman, M. and Graham, M. and Ivezic, Z. and Jenness, T. and Kantor, J. and Krughoff, K.S. and Lim, K.-T. and Lupton, R. H. and Mueller, F. and Nidever, D. and Patterson, M. and Petravick, D. and Shaw, D. and Slater, C. and Strauss, M. and Swinbank, J. and Tyson, J. A. and Wood-Vasey, M. and Wu, X.}, + author = {Juric, M. and Axelrod, T. and Becker, A. C. and Becla, J. and Bellm, E. and Bosch, J. F. and Ciardi, D. and Connolly, A. J. and Dubois-Felsmann, G. P. and Economou, F. and Freemon, M. and Gelman, M. and Graham, M. and Ivezi{\'c}, {\v Z}eljko and Jenness, T. and Kantor, J. and Krughoff, K.S. and Lim, K.-T. and Lupton, R. H. and Mueller, F. and Nidever, D. and Patterson, M. and Petravick, D. and Shaw, D. and Slater, C. and Strauss, M. and Swinbank, J. and Tyson, J. A. and Wood-Vasey, M. and Wu, X.}, + month = jul, + year = {2017}, +} + +@inproceedings{mckerns_building_2012, + title = {Building a {Framework} for {Predictive} {Science}}, + url = {http://arxiv.org/abs/1202.1056}, + booktitle = {Proceedings of the 10th {Python} in {Science} {Conference}}, + author = {McKerns, Michael M. and Strand, Leif and Sullivan, Tim and Fang, Alta and Aivazis, Michael A. 
G.}, + month = feb, + year = {2012}, + note = {arXiv: 1202.1056}, +} + +@misc{mckerns_pathos:_2010, + title = {pathos: a framework for heterogeneous computing}, + url = {http://trac.mystic.cacr.caltech.edu/project/pathos/wiki.html}, + author = {McKerns, Michael and Aivazis, Michael}, + year = {2010}, +} + +@article{graham_photometric_2017, + title = {Photometric {Redshifts} with the {LSST}: {Evaluating} {Survey} {Observing} {Strategies}}, + shorttitle = {Photometric {Redshifts} with the {LSST}}, + url = {http://arxiv.org/abs/1706.09507}, + journal = {arXiv:1706.09507 [astro-ph]}, + author = {Graham, Melissa L. and Connolly, Andrew J. and Ivezi{\'c}, {\v Z}eljko and Schmidt, Samuel J. and Jones, R. Lynne and Juri{\'c}, Mario and Daniel, Scott F. and Yoachim, Peter}, + month = jun, + year = {2017}, + note = {arXiv: 1706.09507}, +} + +@article{pizzocaro_results_2016, + title = {Results from {DROXO}: {IV}. {EXTraS} discovery of an {X}-ray flare from the {Class} {I} protostar candidate {ISO}-{Oph} 85}, + volume = {587}, + issn = {0004-6361, 1432-0746}, + shorttitle = {Results from {DROXO}}, + url = {http://www.aanda.org/10.1051/0004-6361/201526562}, + doi = {10.1051/0004-6361/201526562}, + journal = {Astronomy \& Astrophysics}, + author = {Pizzocaro, D. and Stelzer, B. and Paladini, R. and Tiengo, A. and Lisini, G. and Novara, G. and Vianello, G. and Belfiore, A. and Marelli, M. and Salvetti, D. and Pillitteri, I. and Sciortino, S. and D{\textquoteright}Agostino, D. and Haberl, F. and Watson, M. and Wilms, J. and Salvaterra, R. and De Luca, A.}, + month = mar, + year = {2016}, + pages = {A36}, +} + +@article{sun_star_2015, + title = {{THE} {STAR} {CLUSTER} {MASS}{\textendash}{GALACTOCENTRIC} {RADIUS} {RELATION}: {IMPLICATIONS} {FOR} {CLUSTER} {FORMATION}}, + volume = {816}, + issn = {1538-4357}, + shorttitle = {{THE} {STAR} {CLUSTER} {MASS}{\textendash}{GALACTOCENTRIC} {RADIUS} {RELATION}}, + url = {http://stacks.iop.org/0004-637X/816/i=1/a=9?key=crossref.0be3aa415b148a5dd4f07aa1471f52f1}, + doi = {10.3847/0004-637X/816/1/9}, + number = {1}, + journal = {The Astrophysical Journal}, + author = {Sun, Weijia and Grijs, Richard de and Fan, Zhou and Cameron, Ewan}, + month = dec, + year = {2015}, + pages = {9}, +} + +@article{de_vicente_dnf_2016, + title = {{DNF} {\textendash} {Galaxy} photometric redshift by {Directional} {Neighbourhood} {Fitting}}, + volume = {459}, + issn = {0035-8711}, + url = {https://academic.oup.com/mnras/article/459/3/3078/2595234/DNF-Galaxy-photometric-redshift-by-Directional}, + doi = {10.1093/mnras/stw857}, + number = {3}, + journal = {Mon Not R Astron Soc}, + author = {De Vicente, J. and S{\'a}nchez, E. and Sevilla-Noarbe, I.}, month = jul, + year = {2016}, + pages = {3078--3088}, +} + +@article{cavuoti_metaphor:_2017, + title = {{METAPHOR}: a machine-learning-based method for the probability density estimation of photometric redshifts}, + volume = {465}, + issn = {0035-8711}, + shorttitle = {{METAPHOR}}, + url = {https://academic.oup.com/mnras/article/465/2/1959/2525980/METAPHOR-a-machine-learning-based-method-for-the}, + doi = {10.1093/mnras/stw2930}, + number = {2}, + journal = {Mon Not R Astron Soc}, + author = {Cavuoti, S. and Amaro, V. and Brescia, M. and Vellucci, C. and Tortora, C. 
and Longo, G.}, + month = feb, + year = {2017}, + pages = {1959--1973}, +} + +@article{carrasco_kind_somz:_2014, + title = {{SOMz}: photometric redshift {PDFs} with self-organizing maps and random atlas}, + volume = {438}, + issn = {0035-8711}, + shorttitle = {{SOMz}}, + url = {https://academic.oup.com/mnras/article/438/4/3409/1108443/SOMz-photometric-redshift-PDFs-with-self}, + doi = {10.1093/mnras/stt2456}, + number = {4}, + journal = {Mon Not R Astron Soc}, + author = {Carrasco Kind, Matias and Brunner, Robert J.}, + month = mar, + year = {2014}, + pages = {3409--3421}, +} + +@article{mountrichas_measuring_2013, + title = {Measuring the dark matter halo mass of {X}-ray {AGN} at z \~{} 1 using photometric redshifts}, + volume = {430}, + issn = {0035-8711}, + url = {https://academic.oup.com/mnras/article/430/1/661/986786/Measuring-the-dark-matter-halo-mass-of-X-ray-AGN}, + doi = {10.1093/mnras/sts666}, + number = {1}, + journal = {Mon Not R Astron Soc}, + author = {Mountrichas, G. and Georgakakis, A. and Finoguenov, A. and Erfanianfar, G. and Cooper, M. C. and Coil, A. L. and Laird, E. S. and Nandra, K. and Newman, J. A.}, + month = mar, + year = {2013}, + pages = {661--675}, +} + +@article{pedregosa_scikit-learn:_2011, + title = {Scikit-learn: {Machine} learning in {Python}}, + volume = {12}, + shorttitle = {Scikit-learn}, + url = {http://www.jmlr.org/papers/v12/pedregosa11a.html}, + number = {Oct}, + journal = {Journal of Machine Learning Research}, + author = {Pedregosa, Fabian and Varoquaux, Ga{\"e}l and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and {others}}, + year = {2011}, + pages = {2825--2830}, +} + +@article{lsst_science_collaboration_lsst_2009, + title = {{LSST} {Science} {Book}, {Version} 2.0}, + url = {http://arxiv.org/abs/0912.0201}, + journal = {arXiv:0912.0201 [astro-ph]}, + author = {LSST Science Collaboration and Abell, Paul A. and Allison, Julius and Anderson, Scott F. and Andrew, John R. and Angel, J. Roger P. and Armus, Lee and Arnett, David and Asztalos, S. J. and Axelrod, Tim S. and Bailey, Stephen and Ballantyne, D. R. and Bankert, Justin R. and Barkhouse, Wayne A. and Barr, Jeffrey D. and Barrientos, L. Felipe and Barth, Aaron J. and Bartlett, James G. and Becker, Andrew C. and Becla, Jacek and Beers, Timothy C. and Bernstein, Joseph P. and Biswas, Rahul and Blanton, Michael R. and Bloom, Joshua S. and Bochanski, John J. and Boeshaar, Pat and Borne, Kirk D. and Bradac, Marusa and Brandt, W. N. and Bridge, Carrie R. and Brown, Michael E. and Brunner, Robert J. and Bullock, James S. and Burgasser, Adam J. and Burge, James H. and Burke, David L. and Cargile, Phillip A. and Chandrasekharan, Srinivasan and Chartas, George and Chesley, Steven R. and Chu, You-Hua and Cinabro, David and Claire, Mark W. and Claver, Charles F. and Clowe, Douglas and Connolly, A. J. and Cook, Kem H. and Cooke, Jeff and Cooray, Asantha and Covey, Kevin R. and Culliton, Christopher S. and de Jong, Roelof and de Vries, Willem H. and Debattista, Victor P. and Delgado, Francisco and Dell'Antonio, Ian P. and Dhital, Saurav and Di Stefano, Rosanne and Dickinson, Mark and Dilday, Benjamin and Djorgovski, S. G. and Dobler, Gregory and Donalek, Ciro and Dubois-Felsmann, Gregory and Durech, Josef and Eliasdottir, Ardis and Eracleous, Michael and Eyer, Laurent and Falco, Emilio E. and Fan, Xiaohui and Fassnacht, Christopher D. and Ferguson, Harry C. and Fernandez, Yanga R. 
and Fields, Brian D. and Finkbeiner, Douglas and Figueroa, Eduardo E. and Fox, Derek B. and Francke, Harold and Frank, James S. and Frieman, Josh and Fromenteau, Sebastien and Furqan, Muhammad and Galaz, Gaspar and Gal-Yam, A. and Garnavich, Peter and Gawiser, Eric and Geary, John and Gee, Perry and Gibson, Robert R. and Gilmore, Kirk and Grace, Emily A. and Green, Richard F. and Gressler, William J. and Grillmair, Carl J. and Habib, Salman and Haggerty, J. S. and Hamuy, Mario and Harris, Alan W. and Hawley, Suzanne L. and Heavens, Alan F. and Hebb, Leslie and Henry, Todd J. and Hileman, Edward and Hilton, Eric J. and Hoadley, Keri and Holberg, J. B. and Holman, Matt J. and Howell, Steve B. and Infante, Leopoldo and Ivezi{\'c}, {\v Z}eljko and Jacoby, Suzanne H. and Jain, Bhuvnesh and R and Jedicke and Jee, M. James and Jernigan, J. Garrett and Jha, Saurabh W. and Johnston, Kathryn V. and Jones, R. Lynne and Juric, Mario and Kaasalainen, Mikko and Styliani and Kafka and Kahn, Steven M. and Kaib, Nathan A. and Kalirai, Jason and Kantor, Jeff and Kasliwal, Mansi M. and Keeton, Charles R. and Kessler, Richard and Knezevic, Zoran and Kowalski, Adam and Krabbendam, Victor L. and Krughoff, K. Simon and Kulkarni, Shrinivas and Kuhlman, Stephen and Lacy, Mark and Lepine, Sebastien and Liang, Ming and Lien, Amy and Lira, Paulina and Long, Knox S. and Lorenz, Suzanne and Lotz, Jennifer M. and Lupton, R. H. and Lutz, Julie and Macri, Lucas M. and Mahabal, Ashish A. and Mandelbaum, Rachel and Marshall, Phil and May, Morgan and McGehee, Peregrine M. and Meadows, Brian T. and Meert, Alan and Milani, Andrea and Miller, Christopher J. and Miller, Michelle and Mills, David and Minniti, Dante and Monet, David and Mukadam, Anjum S. and Nakar, Ehud and Neill, Douglas R. and Newman, Jeffrey A. and Nikolaev, Sergei and Nordby, Martin and O'Connor, Paul and Oguri, Masamune and Oliver, John and Olivier, Scot S. and Olsen, Julia K. and Olsen, Knut and Olszewski, Edward W. and Oluseyi, Hakeem and Padilla, Nelson D. and Parker, Alex and Pepper, Joshua and Peterson, John R. and Petry, Catherine and Pinto, Philip A. and Pizagno, James L. and Popescu, Bogdan and Prsa, Andrej and Radcka, Veljko and Raddick, M. Jordan and Rasmussen, Andrew and Rau, Arne and Rho, Jeonghee and Rhoads, James E. and Richards, Gordon T. and Ridgway, Stephen T. and Robertson, Brant E. and Roskar, Rok and Saha, Abhijit and Sarajedini, Ata and Scannapieco, Evan and Schalk, Terry and Schindler, Rafe and Schmidt, Samuel and Schmidt, Sarah and Schneider, Donald P. and Schumacher, German and Scranton, Ryan and Sebag, Jacques and Seppala, Lynn G. and Shemmer, Ohad and Simon, Joshua D. and Sivertz, M. and Smith, Howard A. and Smith, J. Allyn and Smith, Nathan and Spitz, Anna H. and Stanford, Adam and Stassun, Keivan G. and Strader, Jay and Strauss, Michael A. and Stubbs, Christopher W. and Sweeney, Donald W. and Szalay, Alex and Szkody, Paula and Takada, Masahiro and Thorman, Paul and Trilling, David E. and Trimble, Virginia and Tyson, Anthony and Van Berg, Richard and Berk, Daniel Vanden and VanderPlas, Jake and Verde, Licia and Vrsnak, Bojan and Walkowicz, Lucianne M. and Wandelt, Benjamin D. and Wang, Sheng and Wang, Yun and Warner, Michael and Wechsler, Risa H. and West, Andrew A. and Wiecha, Oliver and Williams, Benjamin F. and Willman, Beth and Wittman, David and Wolff, Sidney C. and Wood-Vasey, W. 
Michael and Wozniak, Przemek and Young, Patrick and Zentner, Andrew and Zhan, Hu}, + month = dec, + year = {2009}, + note = {arXiv: 0912.0201}, +} + +@article{fevre_vimos_2005, + title = {The {VIMOS} {VLT} deep survey - {First} epoch {VVDS}-deep survey: 11 564 spectra with 17.5 <= {I}\$\_{\textbackslash}textit\{{\textbackslash}textbf\{{\textbackslash}small {AB}\}\}\$ <= 24, and the redshift distribution over 0 <= z <= 5}, + volume = {439}, + copyright = {{\textcopyright} ESO, 2005}, + issn = {0004-6361, 1432-0746}, + shorttitle = {The {VIMOS} {VLT} deep survey - {First} epoch {VVDS}-deep survey}, + url = {https://doi.org/10.1051/0004-6361:20041960}, + doi = {10.1051/0004-6361:20041960}, + language = {en}, + number = {3}, + journal = {A\&A}, + author = {F{\`e}vre, O. Le and Vettolani, G. and Garilli, B. and Tresse, L. and Bottini, D. and Brun, V. Le and Maccagni, D. and Picat, J. P. and Scaramella, R. and Scodeggio, M. and Zanichelli, A. and Adami, C. and Arnaboldi, M. and Arnouts, S. and Bardelli, S. and Bolzonella, M. and Cappi, A. and Charlot, S. and Ciliegi, P. and Contini, T. and Foucaud, S. and Franzetti, P. and Gavignaud, I. and Guzzo, L. and Ilbert, O. and Iovino, A. and McCracken, H. J. and Marano, B. and Marinoni, C. and Mathez, G. and Mazure, A. and Meneux, B. and Merighi, R. and Paltani, S. and Pell{\`o}, R. and Pollo, A. and Pozzetti, L. and Radovich, M. and Zamorani, G. and Zucca, E. and Bondi, M. and Bongiorno, A. and Busarello, G. and Lamareille, F. and Mellier, Y. and Merluzzi, P. and Ripepi, V. and Rizzo, D.}, + month = sep, + year = {2005}, + pages = {845--862}, +} + +@article{sadeh_annz2:_2016, + title = {{ANNz}2: {Photometric} {Redshift} and {Probability} {Distribution} {Function} {Estimation} using {Machine} {Learning}}, + volume = {128}, + issn = {1538-3873}, + shorttitle = {{ANNz}2}, + url = {http://stacks.iop.org/1538-3873/128/i=968/a=104502}, + doi = {10.1088/1538-3873/128/968/104502}, + language = {en}, + number = {968}, + journal = {PASP}, + author = {Sadeh, I. and Abdalla, F. B. and Lahav, O.}, + year = {2016}, + pages = {104502}, +} + +@article{laycock_x-ray_2017, + title = {The {X}-{Ray} {Binary} {Population} of the {Nearby} {Dwarf} {Starburst} {Galaxy} {IC} 10: {Variable} and {Transient} {X}-{Ray} {Sources}}, + volume = {836}, + issn = {0004-637X}, + shorttitle = {The {X}-{Ray} {Binary} {Population} of the {Nearby} {Dwarf} {Starburst} {Galaxy} {IC} 10}, + url = {http://stacks.iop.org/0004-637X/836/i=1/a=50}, + doi = {10.3847/1538-4357/836/1/50}, + language = {en}, + number = {1}, + journal = {ApJ}, + author = {Laycock, Silas and Cappallo, Rigel and Williams, Benjamin F. and Prestwich, Andrea and Binder, Breanna and Christodoulou, Dimitris M.}, year = {2017}, -} \ No newline at end of file + pages = {50}, +} + +@article{bilicki_photometric_2017, + title = {Photometric redshifts for the {Kilo}-{Degree} {Survey}. {Machine}-learning analysis with artificial neural networks}, + url = {http://arxiv.org/abs/1709.04205}, + journal = {arXiv:1709.04205 [astro-ph]}, + author = {Bilicki, M. and Hoekstra, H. and Amaro, V. and Blake, C. and Brown, M. J. I. and Cavuoti, S. and de Jong, J. T. A. and Hildebrandt, H. and Wolf, C. and Amon, A. and Brescia, M. and Brough, S. and Costa-Duarte, M. V. and Erben, T. and Glazebrook, K. and Grado, A. and Heymans, C. and Jarrett, T. and Joudaki, S. and Kuijken, K. and Longo, G. and Napolitano, N. and Parkinson, D. and Vellucci, C. and Kleijn, G. A. 
Verdoes and Wang, L.}, + month = sep, + year = {2017}, + note = {arXiv: 1709.04205}, +} + +@article{bailer-jones_gaia_2013, + title = {The {Gaia} astrophysical parameters inference system ({Apsis}) - {Pre}-launch description}, + volume = {559}, + copyright = {{\textcopyright} ESO, 2013}, + issn = {0004-6361, 1432-0746}, + url = {https://www.aanda.org/articles/aa/abs/2013/11/aa22344-13/aa22344-13.html}, + doi = {10.1051/0004-6361/201322344}, + language = {en}, + journal = {A\&A}, + author = {Bailer-Jones, C. a. L. and Andrae, R. and Arcay, B. and Astraatmadja, T. and Bellas-Velidis, I. and Berihuete, A. and Bijaoui, A. and Carri{\'o}n, C. and Dafonte, C. and Damerdji, Y. and Dapergolas, A. and Laverny, P. de and Delchambre, L. and Drazinos, P. and Drimmel, R. and Fr{\'e}mat, Y. and Fustes, D. and Garc{\'i}a-Torres, M. and Gu{\'e}d{\'e}, C. and Heiter, U. and Janotto, A.-M. and Karampelas, A. and Kim, D.-W. and Knude, J. and Kolka, I. and Kontizas, E. and Kontizas, M. and Korn, A. J. and Lanzafame, A. C. and Lebreton, Y. and Lindstr{\o}m, H. and Liu, C. and Livanou, E. and Lobel, A. and Manteiga, M. and Martayan, C. and Ordenovic, Ch and Pichon, B. and Recio-Blanco, A. and Rocca-Volmerange, B. and Sarro, L. M. and Smith, K. and Sordo, R. and Soubiran, C. and Surdej, J. and Th{\'e}venin, F. and Tsalmantza, P. and Vallenari, A. and Zorec, J.}, + year = {2013}, + pages = {A74}, +} + +@inproceedings{connolly_end--end_2014, + title = {An end-to-end simulation framework for the {Large} {Synoptic} {Survey} {Telescope}}, + volume = {9150}, + url = {https://www.spiedigitallibrary.org/conference-proceedings-of-spie/9150/915014/An-end-to-end-simulation-framework-for-the-Large-Synoptic/10.1117/12.2054953.short}, + doi = {10.1117/12.2054953}, + booktitle = {Proceedings {SPIE}}, + publisher = {International Society for Optics and Photonics}, + author = {Connolly, Andrew J. and Angeli, George Z. and Chandrasekharan, Srinivasan and Claver, Charles F. and Cook, Kem and Ivezi{\'c}, {\v Z}eljko and Jones, R. Lynne and Krughoff, K. Simon and Peng, En-Hsin and Peterson, John and Petry, Catherine and Rasmussen, Andrew P. and Ridgway, Stephen T. and Saha, Abhijit and Sembroski, Glenn and vanderPlas, Jacob and Yoachim, Peter}, + month = aug, + year = {2014}, + pages = {915014}, +} + +@article{springel_simulations_2005, + title = {Simulations of the formation, evolution and clustering of galaxies and quasars}, + volume = {435}, + copyright = {{\textcopyright} 2005 Nature Publishing Group}, + issn = {0028-0836}, + url = {https://www.nature.com/nature/journal/v435/n7042/full/nature03597.html}, + doi = {10.1038/nature03597}, + language = {en}, + number = {7042}, + journal = {Nature}, + author = {Springel, Volker and White, Simon D. M. and Jenkins, Adrian and Frenk, Carlos S. and Yoshida, Naoki and Gao, Liang and Navarro, Julio and Thacker, Robert and Croton, Darren and Helly, John and Peacock, John A. and Cole, Shaun and Thomas, Peter and Couchman, Hugh and Evrard, August and Colberg, J{\"o}rg and Pearce, Frazer}, + month = jun, + year = {2005}, + pages = {629--636}, +} + +@article{mak_projected_2017, + title = {Projected support points, with application to optimal {MCMC} reduction}, + url = {http://arxiv.org/abs/1708.06897}, + journal = {arXiv:1708.06897 [stat]}, + author = {Mak, Simon and Joseph, V. 
Roshan}, + month = aug, + year = {2017}, + note = {arXiv: 1708.06897}, +} + +@article{radovich_searching_2017, + title = {Searching for galaxy clusters in the {Kilo}-{Degree} {Survey}}, + volume = {598}, + copyright = {{\textcopyright} ESO, 2017}, + issn = {0004-6361, 1432-0746}, + url = {https://www.aanda.org/articles/aa/abs/2017/02/aa29353-16/aa29353-16.html}, + doi = {10.1051/0004-6361/201629353}, + language = {en}, + journal = {A\&A}, + author = {Radovich, M. and Puddu, E. and Bellagamba, F. and Roncarelli, M. and Moscardini, L. and Bardelli, S. and Grado, A. and Getman, F. and Maturi, M. and Huang, Z. and Napolitano, N. and McFarland, J. and Valentijn, E. and Bilicki, M.}, + month = feb, + year = {2017}, + pages = {A107}, +} + +@article{jong_third_2017, + title = {The third data release of the {Kilo}-{Degree} {Survey} and associated data products}, + volume = {604}, + url = {https://ui.adsabs.harvard.edu/#abs/2017A&A...604A.134D/abstract}, + doi = {10.1051/0004-6361/201730747}, + journal = {Astronomy and Astrophysics}, + author = {Jong, De and A, Jelte T. and Kleijn, Gijs A. Verdoes and Erben, Thomas and Hildebrandt, Hendrik and Kuijken, Konrad and Sikkema, Gert and Brescia, Massimo and Bilicki, Maciej and Napolitano, Nicola R. and Amaro, Valeria and Begeman, Kor G. and Boxhoorn, Danny R. and Buddelmeijer, Hugo and Cavuoti, Stefano and Getman, Fedor and Grado, Aniello and Helmich, Ewout and Huang, Zhuoyi and Irisarri, Nancy and La Barbera, Francesco and Longo, Giuseppe and McFarland, John P. and Nakajima, Reiko and Paolillo, Maurizio and Puddu, Emanuella and Radovich, Mario and Rifatto, Agatino and Tortora, Crescenzo and Valentijn, Edwin A. and Vellucci, Civita and Vriend, Willem-Jan and Amon, Alexandra and Blake, Chris and Choi, Ami and Conti, Ian Fenech and Gwyn, Stephen D. J. and Herbonnet, Ricardo and Heymans, Catherine and Hoekstra, Henk and Klaes, Dominik and Merten, Julian and Miller, Lance and Schneider, Peter and Viola, Massimo}, + month = aug, + year = {2017}, + pages = {A134}, +} + +@article{bonnett_redshift_2016, + title = {Redshift distributions of galaxies in the {Dark} {Energy} {Survey} {Science} {Verification} shear catalogue and implications for weak lensing}, + volume = {94}, + number = {4}, + journal = {Physical Review D}, + author = {Bonnett, Christopher and Troxel, M. A. and Hartley, William and Amara, Adam and Leistedt, Boris and Becker, Matthew R. and Bernstein, Gary M. and Bridle, Sarah Louise and Bruderer, Claudio and Busha, M. T.}, + year = {2016}, + pages = {042005}, +} + +@article{polsterer_uncertain_2016, + title = {Uncertain {Photometric} {Redshifts}}, + url = {http://arxiv.org/abs/1608.08016}, + journal = {arXiv:1608.08016 [astro-ph]}, + author = {Polsterer, Kai Lars and D'Isanto, Antonio and Gieseke, Fabian}, + month = aug, + year = {2016}, + note = {arXiv: 1608.08016}, +} + +@article{hildebrandt_kids-450:_2017, + title = {{KiDS}-450: cosmological parameter constraints from tomographic weak gravitational lensing}, + volume = {465}, + issn = {0035-8711}, + shorttitle = {{KiDS}-450}, + url = {http://adsabs.harvard.edu/abs/2017MNRAS.465.1454H}, + doi = {10.1093/mnras/stw2805}, + journal = {Monthly Notices of the Royal Astronomical Society}, + author = {Hildebrandt, H. and Viola, M. and Heymans, C. and Joudaki, S. and Kuijken, K. and Blake, C. and Erben, T. and Joachimi, B. and Klaes, D. and Miller, L. and Morrison, C. B. and Nakajima, R. and Verdoes Kleijn, G. and Amon, A. and Choi, A. and Covone, G. and de Jong, J. T. A. and Dvornik, A. 
and Fenech Conti, I. and Grado, A. and Harnois-D{\'e}raps, J. and Herbonnet, R. and Hoekstra, H. and K{\"o}hlinger, F. and McFarland, J. and Mead, A. and Merten, J. and Napolitano, N. and Peacock, J. A. and Radovich, M. and Schneider, P. and Simon, P. and Valentijn, E. A. and van den Busch, J. L. and van Uitert, E. and Van Waerbeke, L.}, + month = feb, + year = {2017}, + pages = {1454--1498}, +} + +@article{harnois-deraps_kids-450:_2017, + title = {{KiDS}-450: tomographic cross-correlation of galaxy shear with {Planck} lensing}, + volume = {471}, + issn = {0035-8711}, + shorttitle = {{KiDS}-450}, + url = {https://academic.oup.com/mnras/article/471/2/1619/3922863}, + doi = {10.1093/mnras/stx1675}, + number = {2}, + journal = {Mon Not R Astron Soc}, + author = {Harnois-D{\'e}raps, Joachim and Tr{\"o}ster, Tilman and Chisari, Nora Elisa and Heymans, Catherine and van Waerbeke, Ludovic and Asgari, Marika and Bilicki, Maciej and Choi, Ami and Erben, Thomas and Hildebrandt, Hendrik and Hoekstra, Henk and Joudaki, Shahab and Kuijken, Konrad and Merten, Julian and Miller, Lance and Robertson, Naomi and Schneider, Peter and Viola, Massimo}, + month = oct, + year = {2017}, + pages = {1619--1633}, +} + +@article{troster_cross-correlation_2017, + title = {Cross-correlation of weak lensing and gamma rays: implications for the nature of dark matter}, + volume = {467}, + issn = {0035-8711}, + shorttitle = {Cross-correlation of weak lensing and gamma rays}, + url = {https://academic.oup.com/mnras/article/467/3/2706/2982886}, + doi = {10.1093/mnras/stx365}, + number = {3}, + journal = {Mon Not R Astron Soc}, + author = {Tr{\"o}ster, Tilman and Camera, Stefano and Fornasa, Mattia and Regis, Marco and van Waerbeke, Ludovic and Harnois-D{\'e}raps, Joachim and Ando, Shin'ichiro and Bilicki, Maciej and Erben, Thomas and Fornengo, Nicolao and Heymans, Catherine and Hildebrandt, Hendrik and Hoekstra, Henk and Kuijken, Konrad and Viola, Massimo}, + month = jun, + year = {2017}, + pages = {2706--2722}, +} + +@article{clampitt_galaxygalaxy_2017, + title = {Galaxy{\textendash}galaxy lensing in the {Dark} {Energy} {Survey} {Science} {Verification} data}, + volume = {465}, + issn = {0035-8711}, + url = {https://academic.oup.com/mnras/article/465/4/4204/2556144}, + doi = {10.1093/mnras/stw2988}, + number = {4}, + journal = {Mon Not R Astron Soc}, + author = {Clampitt, J. and S{\'a}nchez, C. and Kwan, J. and Krause, E. and MacCrann, N. and Park, Y. and Troxel, M. A. and Jain, B. and Rozo, E. and Rykoff, E. S. and Wechsler, R. H. and Blazek, J. and Bonnett, C. and Crocce, M. and Fang, Y. and Gaztanaga, E. and Gruen, D. and Jarvis, M. and Miquel, R. and Prat, J. and Ross, A. J. and Sheldon, E. and Zuntz, J. and Abbott, T. M. C. and Abdalla, F. B. and Armstrong, R. and Becker, M. R. and Benoit-L{\'e}vy, A. and Bernstein, G. M. and Bertin, E. and Brooks, D. and Burke, D. L. and Carnero Rosell, A. and Carrasco Kind, M. and Cunha, C. E. and D'Andrea, C. B. and Costa, Da and N, L. and Desai, S. and Diehl, H. T. and Dietrich, J. P. and Doel, P. and Estrada, J. and Evrard, A. E. and Fausti Neto, A. and Flaugher, B. and Fosalba, P. and Frieman, J. and Gruendl, R. A. and Honscheid, K. and James, D. J. and Kuehn, K. and Kuropatkin, N. and Lahav, O. and Lima, M. and March, M. and Marshall, J. L. and Martini, P. and Melchior, P. and Mohr, J. J. and Nichol, R. C. and Nord, B. and Plazas, A. A. and Romer, A. K. and Sanchez, E. and Scarpine, V. and Schubnell, M. and Sevilla-Noarbe, I. and Smith, R. C. and Soares-Santos, M. 
and Sobreira, F. and Suchyta, E. and Swanson, M. E. C. and Tarle, G. and Thomas, D. and Vikram, V. and Walker, A. R.}, + month = mar, + year = {2017}, + pages = {4204--4218}, +} + +@article{choi_cfhtlens_2016, + title = {{CFHTLenS} and {RCSLenS}: testing photometric redshift distributions using angular cross-correlations with spectroscopic galaxy surveys}, + volume = {463}, + issn = {0035-8711}, + shorttitle = {{CFHTLenS} and {RCSLenS}}, + url = {https://academic.oup.com/mnras/article/463/4/3737/2646329}, + doi = {10.1093/mnras/stw2241}, + number = {4}, + journal = {Mon Not R Astron Soc}, + author = {Choi, A. and Heymans, C. and Blake, C. and Hildebrandt, H. and Duncan, C. a. J. and Erben, T. and Nakajima, R. and Van Waerbeke, L. and Viola, M.}, + month = dec, + year = {2016}, + pages = {3737--3754}, +} + +@article{applegate_weighing_2014, + title = {Weighing the {Giants} {\textendash} {III}. {Methods} and measurements of accurate galaxy cluster weak-lensing masses}, + volume = {439}, + issn = {0035-8711}, + url = {https://academic.oup.com/mnras/article/439/1/48/963939}, + doi = {10.1093/mnras/stt2129}, + number = {1}, + journal = {Mon Not R Astron Soc}, + author = {Applegate, Douglas E. and von der Linden, Anja and Kelly, Patrick L. and Allen, Mark T. and Allen, Steven W. and Burchat, Patricia R. and Burke, David L. and Ebeling, Harald and Mantz, Adam and Morris, R. Glenn}, + month = mar, + year = {2014}, + pages = {48--72}, +} + +@article{polsterer_dealing_2016, + title = {Dealing with {Uncertain} {Multimodal} {Photometric} {Redshift} {Estimations}}, + volume = {12}, + issn = {1743-9213, 1743-9221}, + url = {https://www.cambridge.org/core/journals/proceedings-of-the-international-astronomical-union/article/dealing-with-uncertain-multimodal-photometric-redshift-estimations/28C34F4C86E013E752580331784A965A}, + doi = {10.1017/S1743921316013089}, + number = {S325}, + journal = {Proceedings of the International Astronomical Union}, + author = {Polsterer, Kai L.}, + month = oct, + year = {2016}, + pages = {156--165}, +} + +@article{amaro_metaphor:_2016, + title = {{METAPHOR}: {Probability} density estimation for machine learning based photometric redshifts}, + volume = {12}, + issn = {1743-9213, 1743-9221}, + shorttitle = {{METAPHOR}}, + url = {https://www.cambridge.org/core/journals/proceedings-of-the-international-astronomical-union/article/metaphor-probability-density-estimation-for-machine-learning-based-photometric-redshifts/2E414C2A511237966DF4D235C0363679}, + doi = {10.1017/S1743921317002186}, + number = {S325}, + journal = {Proceedings of the International Astronomical Union}, + author = {Amaro, V. and Cavuoti, S. and Brescia, M. and Vellucci, C. and Tortora, C. and Longo, G.}, + month = oct, + year = {2016}, + pages = {197--200}, +} + +@article{hoyle_dark_2017, + title = {Dark {Energy} {Survey} {Year} 1 {Results}: {Redshift} distributions of the weak lensing source galaxies}, + shorttitle = {Dark {Energy} {Survey} {Year} 1 {Results}}, + url = {http://arxiv.org/abs/1708.01532}, + journal = {arXiv:1708.01532 [astro-ph]}, + author = {Hoyle, B. and Gruen, D. and Bernstein, G. M. and Rau, M. M. and De Vicente, J. and Hartley, W. G. and Gaztanaga, E. and DeRose, J. and Troxel, M. A. and Davis, C. and Alarcon, A. and MacCrann, N. and Prat, J. and S{\'a}nchez, C. and Sheldon, E. and Wechsler, R. H. and Asorey, J. and Becker, M. R. and Bonnett, C. and Rosell, A. Carnero and Carollo, D. and Kind, M. Carrasco and Castander, F. J. and Cawthon, R. and Chang, C. and Childress, M. and Davis, T. M. 
and Drlica-Wagner, A. and Gatti, M. and Glazebrook, K. and Gschwend, J. and Hinton, S. R. and Hoormann, J. K. and Kim, A. G. and King, A. and Kuehn, K. and Lewis, G. and Lidman, C. and Lin, H. and Macaulay, E. and Maia, M. A. G. and Martini, P. and Mudd, D. and M{\"o}ller, A. and Nichol, R. C. and Ogando, R. L. C. and Rollins, R. P. and Roodman, A. and Ross, A. J. and Rozo, E. and Rykoff, E. S. and Samuroff, S. and Sevilla-Noarbe, I. and Sharp, R. and Sommer, N. E. and Tucker, B. E. and Uddin, S. A. and Varga, T. N. and Vielzeuf, P. and Yuan, F. and Zhang, B. and Abbott, T. M. C. and Abdalla, F. B. and Allam, S. and Annis, J. and Bechtol, K. and Benoit-L{\'e}vy, A. and Bertin, E. and Brooks, D. and Buckley-Geer, E. and Burke, D. L. and Busha, M. T. and Capozzi, D. and Carretero, J. and Crocce, M. and D'Andrea, C. B. and da Costa, L. N. and DePoy, D. L. and Desai, S. and Diehl, H. T. and Doel, P. and Eifler, T. F. and Estrada, J. and Evrard, A. E. and Fernandez, E. and Flaugher, B. and Fosalba, P. and Frieman, J. and Garc{\'i}a-Bellido, J. and Gerdes, D. W. and Giannantonio, T. and Goldstein, D. A. and Gruendl, R. A. and Gutierrez, G. and Honscheid, K. and James, D. J. and Jarvis, M. and Jeltema, T. and Johnson, M. W. G. and Johnson, M. D. and Kirk, D. and Krause, E. and Kuhlmann, S. and Kuropatkin, N. and Lahav, O. and Li, T. S. and Lima, M. and March, M. and Marshall, J. L. and Melchior, P. and Menanteau, F. and Miquel, R. and Nord, B. and O'Neill, C. R. and Plazas, A. A. and Romer, A. K. and Sako, M. and Sanchez, E. and Santiago, B. and Scarpine, V. and Schindler, R. and Schubnell, M. and Smith, M. and Smith, R. C. and Soares-Santos, M. and Sobreira, F. and Suchyta, E. and Swanson, M. E. C. and Tarle, G. and Thomas, D. and Tucker, D. L. and Vikram, V. and Walker, A. R. and Weller, J. and Wester, W. and Wolf, R. C. and Yanny, B. and Zuntz, J.}, + month = aug, + year = {2017}, + note = {arXiv: 1708.01532}, +} diff --git a/docs/desc-0000-qp-photo-z_approximation/main.tex b/docs/desc-0000-qp-photo-z_approximation/main.tex index 070fb8e4..51c4a4a6 100644 --- a/docs/desc-0000-qp-photo-z_approximation/main.tex +++ b/docs/desc-0000-qp-photo-z_approximation/main.tex @@ -13,9 +13,13 @@ \newcommand{\textul}{\underline} -\newcommand{\qp}{\texttt{qp}\xspace} -\newcommand{\pz}{photo-$z$ PDF\xspace} -\newcommand{\Pz}{Photo-$z$ PDF\xspace} +\newcommand{\qp}{\texttt{qp}} +\newcommand{\pz}{photo-$z$ PDF} +\newcommand{\Pz}{Photo-$z$ PDF} +\newcommand{\mgdata}{bright\xspace} +\newcommand{\Mgdata}{Bright\xspace} +\newcommand{\ssdata}{faint\xspace} +\newcommand{\Ssdata}{Faint\xspace} \begin{document} @@ -25,237 +29,272 @@ \begin{abstract} -Upcoming and ongoing galaxy surveys will produce redshift probability -distribution functions (PDFs) in addition to traditional photometric redshift -(photo-$z$) point estimates. +Modern galaxy surveys produce redshift probability distribution functions +(PDFs) in addition to traditional photometric redshift (photo-$z$) point +estimates. However, the storage of \pz s may present a challenge with increasingly large catalogs, as we face a trade-off between the accuracy of subsequent science -measurements and the storage cost. -This paper presents \qp, a Python package facilitating manipulation of -approximations of 1-dimensional PDFs, as suitable for \pz s. +measurements and the limitation of finite storage resources. +This paper presents \qp, a Python package for manipulating parametrizations of +1-dimensional PDFs, as suitable for \pz\ compression. 
We use \qp\ to investigate the performance of three simple PDF storage formats on two realistic mock datasets, representative of upcoming surveys with -different data qualities, as a function of the number of stored parameters per -\pz, using metrics of both individual \pz s and an estimator of the overall -redshift distribution function. +different data qualities. +Based on metrics of individual \pz s and an estimator of the overall redshift +distribution function, as an example of a science use case, as a function of +the number of stored parameters per \pz, we make recommendations for best +practices in choosing \pz\ approximation schemes. \end{abstract} -\dockeys{methods: data analysis, catalogs, surveys} +\dockeys{methods: statistical, methods: miscellaneous, astronomical databases: +miscellaneous, catalogs, galaxies: distances and redshifts} \maketitlepost - - - \section{Introduction} \label{sec:intro} -Ongoing and upcoming wide-field imaging surveys such as the Large Synoptic -Survey Telescope (LSST) will observe billions of galaxies; studies of cosmology -and galaxy evolution with these data will rely on the method of photometric -redshift (photo-$z$) estimation. +Upcoming wide-field imaging surveys such as the Large Synoptic Survey Telescope +(LSST)\footnote{\url{https://www.lsst.org/}}\citep{ivezic_lsst:_2008} will +observe tens of billions of galaxies photometrically, without follow-up +spectroscopy. +Over the past decade, the Kilo-Degree +Survey\footnote{\url{http://kids.strw.leidenuniv.nl/}}, Hyper Suprime-Cam +Subaru Strategic Program\footnote{\url{http://hsc.mtk.nao.ac.jp/ssp/}}, and +Dark Energy Survey\footnote{\url{https://www.darkenergysurvey.org/}} have paved +the way for LSST via similar survey strategies on tens of millions of galaxies. +Studies of precision cosmology and galaxy evolution with the anticipated data +will thus rely almost exclusively on the method of photometric redshift +(photo-$z$) estimation. Photo-$z$s are subject to a number of systematic errors, some caused by the estimation procedures and others intrinsic to the data itself. -The photo-$z$ community has come to favor methods that provide a redshift -probability distribution function (PDF) that includes information about the -potential for such systematic errors for each galaxy in the survey. - -Given the tremendous size of the surveys in question, storage of these -probability distributions involves making difficult decisions. -Each survey seeks to create a catalog of \pz s balancing accuracy against -storage cost. -For example, the \pz\ catalog that LSST will release will be limited to 200 -floating point numbers per galaxy, with plans to store \pz s derived by -multiple methods. \citep{juric_data_2017} -The problem of \pz\ approximation for large surveys was first addressed in -\citet{carrasco_kind_sparse_2014} in the context of a single galaxy survey, a -limited set of \pz\ approximation schemes, and metrics appropriate for -deterministic, not probabilistic, objects. -However, we expect the choice of \pz\ approximation, and the number of stored -parameters associated with it, will depend on the science case and its -requirements on \pz\ accuracy: different science cases will need different -accuracy metrics. 
+For the purpose of producing public photo-$z$ catalogs, the redshift estimation
+community has thus come to favor methods that provide a photo-$z$ probability
+distribution function (PDF) conveying the potential for such systematic errors
+for each galaxy in the survey \citep{tanaka_photometric_2017, jong_third_2017,
+sheldon_photometric_2012}.
+
+Given that the \pz\ catalogs of ongoing surveys already include $\sim10^{7}$
+galaxies, and that those of upcoming surveys will include $\sim10^{10}$
+galaxies, storage of these probability distributions must balance the accuracy
+of the catalog against limited storage resources.
+For example, the public catalog that LSST will release will be limited to
+$\sim100$ floating point numbers per galaxy for all redshift information
+\citep[Section 4.2.2]{juric_data_2017}, with plans to store \pz s derived by
+multiple methods.
+Furthermore, the problem of storing PDFs is not unique to galaxy surveys.
+Gaia\footnote{\url{https://www.gaia-eso.eu/}}, for example, has committed to
+providing a catalog of PDFs of stellar properties including velocities, so an
+approach to optimizing the choice of PDF storage parametrization could benefit
+astronomy more broadly.
+
+\citet{carrasco_kind_sparse_2014} first addressed the question of approximating
+\pz s in the context of a single galaxy survey and metrics applicable to
+deterministic, not probabilistic, data products.
+However, we expect the optimal choice of \pz\ storage approximation to depend
+on the intended science applications, their requirements on \pz\ accuracy, and
+the properties of the anticipated \pz s.
+Different science cases will need different metrics, and different formats may
+be appropriate for different datasets.
In this paper, we address the question of \textit{how} these choices should be
-made in general, by providing the publicly available \qp\ Python package to
-enable each survey to optimize their \pz\ approximation via mathematically
-motivated and science-driven metrics.
-We demonstrate this approach on two sets of realistic mock data.
+made, by providing the publicly available \qp\ Python
+package\footnote{\url{https://github.com/aimalz/qp}} enabling each survey to
+optimize their \pz\ approximation via mathematically motivated and
+science-driven metrics.
+We demonstrate this approach on two sets of realistic mock data in the context
+of LSST.
In Section~\ref{sec:methods}, we outline how \qp\ can be used to optimize the choice of \pz\ parametrization.
-In Section~\ref{sec:data}, we describe the mock datasets on which we perform
-such an analysis.
+In Section~\ref{sec:data}, we describe the mock datasets on which we
+demonstrate such an analysis.
We present the results of this procedure in Section~\ref{sec:results} and make recommendations for the use of \qp\ by the photo-$z$ community in Section~\ref{sec:conclusions}. - - - - - - - \section{Methods} \label{sec:methods} -
-We have developed the \qp\ Python package to facilitate the parametrization and
-approximation of \pz s.
-A \texttt{qp.PDF} object can carry a number of different parametrizations, each
-associated with a representation.
+We have developed the \qp\ Python package to facilitate the approximation of
+one-dimensional PDFs, including \pz s, and comparisons between approximations.
+A \texttt{qp.PDF} object is associated with sets of parameters for each
+approximation considered.
+Conversions between approximations are facilitated by the
+\texttt{numpy}\footnote{\url{http://www.numpy.org/}},
+\texttt{scipy}\footnote{\url{https://www.scipy.org/}}, and
+\texttt{scikit-learn}\footnote{\url{http://scikit-learn.org}}
+\citep{pedregosa_scikit-learn:_2011} tools.
The currently supported parametrizations are described in Section~\ref{sec:approx}.
-The \qp\ package also provides a few built-in metrics of the accuracy of a
-representation of a \pz\ relative to a given parametrization that has been
-designated as "true."
-The currently implemented metrics are described in Section~\ref{sec:metric}.
-Large-scale tests can be conducted using the \texttt{qp.Ensemble} class that
-provides a wrapper for collections of \texttt{qp.PDF} objects.
+
+The \qp\ package also provides a few built-in metrics for the accuracy of a
+representation of a PDF relative to a given reference representation.
+Built-in plots are made using
+\texttt{matplotlib}\footnote{\url{https://matplotlib.org/}}.
+A subset of the included metrics is described in Section~\ref{sec:metric}.
+
+Catalog-level manipulations are performed using the \texttt{qp.Ensemble} class
+that serves as a wrapper for operations over collections of \texttt{qp.PDF}
+objects.
+Parallelization is facilitated by the
+\texttt{pathos}\footnote{\noindent\url{http://trac.mystic.cacr.caltech.edu/project/pathos/wiki.html}} \citep{mckerns_building_2012, mckerns_pathos:_2010} package.
\subsection{Approximation Methods} \label{sec:approx}
-First, we establish a vocabulary for the definitions of approximation methods.
+First, we establish a vocabulary for the approximations.
Each \textit{parametrization} of a \pz\ is defined in terms of the
-\textit{format} function $\mathcal{F}$, \textit{metaparameters} $\vec{C}$, and
-\textit{parameters} $\vec{c}$.
+\textit{parameters} $\vec{c}$ unique to its catalog entry, the
+\textit{metaparameters} $\vec{C}$ shared over the whole catalog, and the
+\textit{format} function $\mathcal{F}$ that reconstructs a PDF from its
+parameters and metaparameters.
A parametrization in turn corresponds to a \textit{representation} \begin{align} \label{eq:definition} \hat{p}^{\mathcal{F}, \vec{C}, \vec{c}}(z) &\equiv \mathcal{F}_{\vec{C}}(z; \vec{c}) \end{align}
-of the \pz, denoted as $\hat{p}(z)$ for brevity.
-We often employ interpolation schemes with a generic interpolator function
-$F_{\vec{C}'}(z; \vec{c})$ that comes with its own metaparameters $\vec{C}'$.
-\qp\ supports all interpolation options available to the
-\texttt{scipy.interpolate.interp1d} function, but we choose a default
-interpolation scheme for each format to maximize the fidelity of the
-approximations to the true PDFs.
-
-\qp\ supports conversion of \pz\ approximations between five formats: step
-functions, samples, quantiles, evaluations, and mixture model components.
-These formats may be associated with any number $N_{f}$ of stored parameters
-$c_{i}$ per \pz%, which are presumed to be floating point numbers unless
-otherwise specified.
-Meanwhile, the metaparameters $C_{i}$ are the set of numbers necessary to
-convert the stored \pz\ parameters $\vec{c}$ into a PDF over redshift.
+of the approximated \pz, denoted as $\hat{p}(z)$ for brevity.
+The \textit{dimensionality} of $\vec{c}$ is the number $N_{f}$ of stored
+parameters per \pz, which are presumed to be scalar numbers unless otherwise
+specified.
+The number of elements of $\vec{C}$ is of little importance so long as the
+metaparameters can be stored outside the catalog entries.
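To make this vocabulary concrete, the following minimal sketch (plain
\texttt{numpy} rather than the \qp\ interface; the function name is
hypothetical) implements Equation~\ref{eq:definition} for the histogram format
of Section~\ref{sec:bins}: the metaparameters $\vec{C}$ are bin endpoints
shared by the whole catalog, the parameters $\vec{c}$ are per-object bin
heights, and the format function reconstructs $\hat{p}(z)$ from the two.
\begin{verbatim}
import numpy as np

def histogram_format(z, c, C):
    # F^h_C(z; c): piecewise-constant reconstruction from the per-object bin
    # heights c given the shared bin endpoints C, with len(C) == len(c) + 1
    # and p-hat(z) = 0 outside [C[0], C[-1]].
    z = np.atleast_1d(np.asarray(z, dtype=float))
    idx = np.searchsorted(C, z, side="right") - 1
    phat = np.zeros_like(z)
    inside = (idx >= 0) & (idx < len(c))
    phat[inside] = np.asarray(c)[idx[inside]]
    return phat

# Metaparameters are stored once per catalog; only c is stored per object.
C = np.linspace(0.0, 3.5, 8)                        # N_f + 1 = 8 endpoints
c = np.array([0.0, 0.1, 0.9, 0.6, 0.3, 0.1, 0.0])   # N_f = 7 heights
print(histogram_format([0.7, 1.2, 3.6], c, C))      # [0.1 0.9 0. ]
\end{verbatim}
Analogous format functions apply to the other parametrizations discussed
below, always separating the per-object parameters from the catalog-wide
metaparameters.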
+ +\qp\ currently supports conversion of \pz\ approximations between five formats: +step functions, samples, quantiles, evaluations, and mixtures of +\texttt{scipy.rv\_continuous} objects. +When the format function is not specified by the approximation, we refer to a +generic interpolator function $F_{\vec{C}'}(z; \vec{c}, \mathcal{F}_{\vec{C}})$ +with its own metaparameters $\vec{C}'$, which must be chosen by the researcher. +\qp\ supports numerous interpolation schemes, including several from the +popular \texttt{scipy.interpolate} library. + In this work we consider special cases of three of these formats as candidates for large survey \pz\ catalog storage: regular binning (Section~\ref{sec:bins}), random samples (Section~\ref{sec:samples}), and -regular quantiles (Section~\ref{sec:quantiles}), while the other two, -evaluations and mixture models, are used solely for internal manipulations -within \qp. - -We have not yet included the \texttt{SparsePz} sparse basis representation of -\citet{carrasco_kind_sparse_2014}, which uses a mixture model of $N_{f}$ -members of a library of $\sim10^{4}$ functions and has impressive compression -properties. +regular quantiles (Section~\ref{sec:quantiles}). +These formats are illustrated on a multimodal PDF in Figure~\ref{fig:qp}. +\begin{figure} + \begin{center} + \includegraphics[width=\columnwidth]{figures/demo_pz.pdf} + \caption{\qp\ approximation of a continuous 1-dimensional PDF (thick, solid +gray line) using: the step function (orange dotted line), samples (green +dash-dotted line), and quantile formats (purple dashed line) with the same +number of stored parameters ($N_{f}=7$ in this case). + \label{fig:qp}} + \end{center} +\end{figure} + +In spite of its impressive compression properties, we have not yet included the +\texttt{SparsePz}\footnote{\url{https://github.com/mgckind/SparsePz}} sparse +basis representation of \citet{carrasco_kind_sparse_2014}, in which the +parameters are the integer identifiers of $N_{f}$ mixture model components from +a library of $\sim10^{4}$ functions. We omit this format because decomposition with \texttt{SparsePZ} does not -enforce that the stored parametrization be a probability distribution in the -mathematical sense of nonnegativity and integration to unity. +enforce the condition that the representation be a probability distribution in +the mathematical sense of nonnegativity and integration to unity. While normalizing the integral of a positive semidefinite function is always possible (if the endpoints of integration are specified), one can motivate multiple schemes for enforcing nonnegativity that result in different reconstructions $\hat{p}(z)$. We postpone to future work the exploration of adaptations of non-positive semidefinite representations and inclusion of the sparse basis representation -in \qp, restricting ourselves to mixture models of -\texttt{scipy.stats.rv\_continuous} objects. - -The various \qp\ formats are illustrated in Figure~\ref{fig:qp} on a multimodal -\pz\ with stored parameters. -\begin{figure} - \includegraphics[width=0.9\columnwidth]{figures/demo_pz.png} - \caption{\qp\ approximation of a continuous 1-dimensional PDF (solid black -line) using the step function (orange dotted line), samples (green dash-dotted -line), and quantile formats (purple dashed line) with the same number of stored -parameters ($N_{f}=20$ in this case). - \label{fig:qp}} -\end{figure} +in \qp. 
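To illustrate how the stored parameters of the three candidate formats relate
to a given PDF, the sketch below (again plain \texttt{numpy}/\texttt{scipy}
with hypothetical variable names, not the \qp\ implementation) computes
$N_{f}=7$ parameters under each format for a toy two-component Gaussian
mixture, following the definitions given in the subsections that follow.
\begin{verbatim}
import numpy as np
from scipy import stats, optimize

# Toy reference PDF: a two-component Gaussian mixture (weights, means, widths).
w, mu, sig = np.array([0.6, 0.4]), np.array([0.5, 1.1]), np.array([0.10, 0.25])
cdf = lambda z: float(np.sum(w * stats.norm.cdf(z, mu, sig)))

N_f = 7
# Step functions: per-object parameters are the mean densities in shared bins.
C_bins = np.linspace(0.0, 2.0, N_f + 1)
c_hist = np.diff([cdf(z) for z in C_bins]) / np.diff(C_bins)

# Samples: draw N_f redshifts from the mixture.
rng = np.random.default_rng(42)
comp = rng.choice(len(w), size=N_f, p=w)
c_samp = rng.normal(mu[comp], sig[comp])

# Quantiles: numerically invert the CDF at the shared levels q_i = i/(N_f+1).
C_quant = np.arange(1, N_f + 1) / (N_f + 1)
c_quant = np.array([optimize.brentq(lambda z: cdf(z) - q, -5.0, 10.0)
                    for q in C_quant])
\end{verbatim}
The bin endpoints \texttt{C\_bins} and quantile levels \texttt{C\_quant} are
metaparameters shared by the whole catalog, while \texttt{c\_hist},
\texttt{c\_samp}, and \texttt{c\_quant} are the per-object parameters that
would be stored.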
For each format, we address the following questions: \begin{itemize}
- \item When/where has this format appeared in the literature as a published
-catalog format, native \pz\ code output format, and science application input
-format?
- \item What exactly is stored under this format, per galaxy (the parameters)
+ \item When/where has the format appeared as a published catalog format,
+native \pz\ code output format, and science application input format?
+ \item What exactly is stored under the format, per galaxy (the parameters)
and per catalog (the metaparameters)?
- \item Beyond fidelity to the original \pz, what are the a priori strengths
-and weaknesses of this format?
+ \item What are the a priori strengths and weaknesses of the format?
\end{itemize}
\subsubsection{Regular Binning} \label{sec:bins}
-By far the most popular format for approximating and storing \pz s is that of a
+By far the most popular format for approximating and storing \pz s is the
piecewise constant step function, also called a histogram binning.
-It is the only format that has been used for public release of \pz\ catalogs
-\citep{tanaka_photometric_2017, sheldon_photometric_2012}; it is unclear
-whether this is a consequence or cause of the fact that it is the most common
-format for using \pz s in cosmological inference, as tomographic binning is a
-universal step between the \pz\ catalog and calculation of any two-point
-correlation function.
+It is the native output of a number of \pz\ codes
+\citep{carrasco_kind_somz:_2014, sadeh_annz2:_2016, cavuoti_metaphor:_2017} and
+the only format that has been used for public release of \pz\ catalogs
+\citep{sheldon_photometric_2012, tanaka_photometric_2017, jong_third_2017}.
The metaparameters of the binned parametrization are the ordered list of redshifts $\vec{C} = (z_{1}, z_{2}, \dots, z_{N_{f}}, z_{N_{f}+1})$ serving as bin endpoints shared by all galaxies in the catalog, each adjacent pair of
-which is associated with a parameter $c_{i}=\int_{C_{i}}^{C_{i+1}}\ p(z)dz$.
-The \qp\ histogram format assumes $p(z)=0$ when $zC_{N_{f}+1}$,
-leading to the normalization condition $\sum_{i} c_{i}(C_{i+1}-C_{i})) = 1$.
-\footnote{Note that this is not generally equivalent to the erroneous
-normalization condition $\sum_{i} c_{i} = 1$ commonly enforced in public
-catalogs.}
-The histogram format function $\mathcal{F}^{h}$ is the sum of a set of $N_{f}$
-step functions, making the reconstructed estimator of the \pz
+which is associated with a parameter $c_{i} = \frac{1}{C_{i+1} -
+C_{i}}\int_{C_{i}}^{C_{i+1}} p(z)\ dz$, the mean probability density in that
+bin.
+The \qp\ histogram format assumes $p(z)=0$ when $z<C_{1}$ or $z>C_{N_{f}+1}$
+and enforces the normalization condition\footnote{
+Note that this is not generally equivalent to the erroneous normalization
+condition $\sum_{i} c_{i} = 1$ commonly enforced in public catalogs such as the
+Sloan Digital Sky Survey's Data Release 8 \citep{sheldon_photometric_2012}.
+Unless redshift is treated as a discrete variable, the oversimplified
+normalization condition only holds if $C_{N_{f}+1} - C_{1} = N_{f} (C_{i+1} -
+C_{i})$ for all $i$ (under a regular binning).
+}
+\begin{align}
+ \label{eq:normed}
+ \sum_{i} c_{i} (C_{i+1} - C_{i}) &= 1.
+\end{align}
+The histogram format function $\mathcal{F}^{h}$ is thus the sum of a set of
+$N_{f}$ step functions, making the reconstructed estimator of the \pz
\begin{align} \label{eq:binned}
- \hat{p}^{h}(z) &= \sum_{i}^{N_{f}}\ c_{i}
+ \hat{p}^{h}(z) &= \sum_{i=1}^{N_{f}}\ c_{i}
\left\{\begin{tabular}{cc}$1$&$C_{i}\leq z<C_{i+1}$\end{tabular}\right\}, \end{align}
where the step functions may be considered their own interpolators.
-Here we only consider a regular binning, with $C_{i+1}=C_{i}+\delta_{f}$ for a
-constant $\delta_{f}=(C_{N_{f}+1}-C_{1})/N_{f}$, as this is the only type of
-binning that has been used in the literature, but this condition is not
-required by \qp.
-
-The regular histogram format may be considered wasteful in terms of data
-storage; a \pz\ with a very compact (broad) probability distribution may have
-many parameters taking the same value $c_{i}\approx0$
-($c_{i}\approx(C_{N_{f}+1}-C_{1})\delta^{-1}$) that are redundant in storage.
-%It also requires the researcher to choose the minimum and maximum possible
-redshifts $C_{1}$ and $C_{N_{f}+1}$ of the galaxy sample, which are unknown
-quantities, so it would be preferable to leave them unconstrained when setting
-up the catalog.
+Though \qp\ supports arbitrary bin ends, here we only consider a regular
+binning, with $C_{i+1} = C_{i} + \delta$ for a constant $\delta = (C_{N_{f}+1}
+- C_{1}) / N_{f}$, as no irregular binning has yet been used for a public
+catalog of \pz s.
+
+In terms of performance as a \pz\ storage format, we should anticipate the
+regular histogram format to be wasteful in terms of information content; a \pz\
+with a very broad (compact) probability distribution may have many parameters
+taking the same value $c_{i} \approx (C_{N_{f}+1} - C_{1})^{-1}$
+($c_{i}\approx0$) that are redundant in storage.
+Additionally, we should expect the fidelity of $\hat{p}^{h}(z)$ to depend
+strongly on the bin widths relative to the sizes of and distances between
+features in the \pz s.
-Though it is possible to construct a catalog where $C$ is not uniform over the -catalog, but is instead somehow optimized for each galaxy, we leave its +Though it is possible to construct a catalog where $N_{f}$ is not uniform over +the catalog, but is instead somehow optimized for each galaxy, we leave its investigation to future work, as it has not yet appeared in the literature. The format function $\mathcal{F}^{s}$ that turns samples into a representation -of the \pz\ is simply the interpolator $F$; in the tests presented here, we use -the Gaussian kernel density estimate (KDE) of -\texttt{scipy.stats.gaussian\_kde} with the smoothing bandwidth $C'$ as a -metaparameter. The samples representation is then +of the \pz\ is simply the interpolator $F$. +In the tests presented here, we use the Gaussian kernel density estimate (KDE) +of \texttt{scipy.stats.gaussian\_kde}. +The samples representation is then \begin{align} \label{eq:sampled} - \hat{p}^{s}(z) &= F_{C'}(z; \vec{c}). + \hat{p}^{s}(z) &= \mathrm{KDE}_{C'}(z; \vec{c}). \end{align} Though samples are an obvious choice for \pz s with narrow features of high @@ -268,33 +307,50 @@ \subsubsection{Random Samples} \subsubsection{Regular Quantiles} \label{sec:quantiles} -One parametrization that has not previously been investigated is that of -quantiles, which are defined in terms of the cumulative distribution function -(CDF). -Under the quantile format, a \pz\ catalog shares $N_{f}$ ordered CDFs -$\vec{C}=(q_{1}, q_{2}, \dots, q_{N_{f}-1}, q_{N_{f}})$. -Each galaxy's catalog entry is the vector of redshifts $\vec{c}=(z_{1}, z_{2}, -\dots, z_{N_{f}-1}, z_{N_{f}})$ satisfying $CDF(c_{i})=C_{i}$, so the quantile -format function $\mathcal{F}_{q}$ is the derivative of an interpolation of the -inverse CDF $CDF^{-1}(C_{i})=c_{i}$ under the interpolation scheme $F$. -In this study, we test regular quantiles $C_{i}\equiv i(N_{f}+1)^{-1}$ using a -linear interpolation scheme but note that \qp\ does not require them. -The format function is then the convolution $\mathcal{F}^{q}=F(CDF^{-1})$, -making the quantile representation +One parametrization that has not previously been investigated in the context of +photometric redshifts is that of quantiles, though they have appeared elsewhere +in the astronomy literature \citep{sun_star_2015, pizzocaro_results_2016, +laycock_x-ray_2017}. +The quantiles are defined in terms of the cumulative distribution function +(CDF), which is the antiderivative of the PDF. + +Under the quantile format, a \pz\ catalog shares $N_{f}$ ordered CDFs $\vec{C} += (q_{1}, q_{2}, \dots, q_{N_{f}-1}, q_{N_{f}})$ where $0 < q_{i} < 1$ for all +$i$. +In this study, we test regular quantiles $C_{i} \equiv i / (N_{f} + 1)$. +Each galaxy's catalog entry is the vector of redshifts $\vec{c} = (z_{1}, +z_{2}, \dots, z_{N_{f}-1}, z_{N_{f}})$ satisfying $\mathrm{CDF}(c_{i}) = +C_{i}$, so the quantile format function $\mathcal{F}^{q}$ is the derivative of +an interpolation $F$ of the CDF. +Our interpolator $F$ in the tests presented here is a nonnegative +\texttt{scipy.interpolate} spline at $z_{1} \leq z \leq z_{N_{f}}$ and linear +extrapolation subject to the normalization conditions of $\vec{C}$ elsewhere. +The quantile representation is thus \begin{align} \label{eq:quantiles} - \hat{p}^{q}(z) &= F(z; \vec{c}, \frac{\Delta\vec{C}}{\Delta\vec{c}}). 
+ \hat{p}^{q}(z) &=
+ \left\{
+ \begin{tabular}{cc}
+ $\frac{d}{dz} \left[F(z; \vec{c}, \mathcal{F}^{q}_{\vec{C}})\right]$ & $c_{1}
+\leq z \leq c_{N_{f}}$ \\
+ $\hat{p}^{q}(c_{1})\left(\frac{\hat{p}^{q}(c_{1})}{2C_{1}} z - 1\right)$ & $z
+< c_{1}$ \\
+ $\hat{p}^{q}(c_{N_{f}})\left(1 - \frac{\hat{p}^{q}(c_{N_{f}})}{2(1 -
+C_{N_{f}})} z\right)$ & $z > c_{N_{f}}$
+ \end{tabular}
+ \right\}.
\end{align}
-We expect the quantile format to be an efficient approximation for \pz s
-because it allocates storage evenly in the space of probability density.
+The quantile parametrization (the namesake of the \texttt{qp} code) is expected
+to be an efficient approximation for \pz s because it allocates storage evenly
+in the space of probability density.
In contrast, the histogram format stores data evenly spaced in redshift, and the samples format stores data randomly in probability density. As with the samples representation, an interpolation function must be chosen for reconstructing the \pz\ from the stored parameters. Depending on the native \pz\ output format, converting to the quantile format
-may require $N_{f}$ numerical optimizations to find the quantiles.
+may require $N_{f}$ numerical optimizations.
We accelerate these optimizations by initializing at rough, approximate quantiles based on CDF evaluations on a grid.

@@ -305,64 +361,138 @@ \subsubsection{Regular Quantiles}
\subsection{Comparison Metrics} \label{sec:metric}
-In this work, out aim is to probe how closely \pz s reconstructed from limited
-set of stored parameters approximates the "true" PDF.
+We aim to probe how closely \pz s reconstructed from a limited set of stored
+parameters approximate the original, high-resolution representation
+$\hat{p}^{r}(z)$ of the reference catalog.
This is done without reference to a galaxy's true redshift; there is, in fact, no notion of a true redshift in our analysis. (For a demonstration of how one might approach the distinct problem of
-evaluating the accuracy of a \pz\ relative to a true redshift, see Schmidt, et
-al.\ in preparation.)
-The loss of information incurred when using an approximate PDF $\hat{P}(z)$
-instead of the true PDF $P(z)$ is given by the Kullback-Leibler divergence
+evaluating the accuracy of a \pz\ relative to a true redshift, see
+\citet{polsterer_dealing_2016} and Schmidt, et al.\ in preparation.)
+
+
+We consider as a metric the loss of information incurred when using an
+approximation of the PDF $\hat{P}(z)$ instead of the best possible
+representation of the PDF $P(z)$, given by the Kullback-Leibler divergence
(KLD), which is defined as \begin{align} \label{eq:kld}
- KLD[P(z) || \hat{P}(z)] &= \int_{-\infty}^{\infty}\ P(z)\
-\log\left[\frac{P(z)}{\hat{P}(z)}\right]\ dz\\ &\approx \delta_{ff}\sum_{z=z_{1}}^{z_{N_{ff}}}\ P(z)\
-\log\left[\frac{P(z)}{\hat{P}(z)}\right],
+ \mathrm{KLD}[\hat{P}(z) | P(z)] &= \int_{-\infty}^{\infty}\ P(z)\
+\log\left[\frac{P(z)}{\hat{P}(z)}\right]\ dz,
\end{align}
-where $\log$ is the natural logarithm throughout this paper.
+where $\log$ is the natural logarithm throughout this paper unless indicated
+otherwise, such that the KLD is measured in nats (base $e$ digits, analogous to
+base 2 bits).
Because there is in general no closed-form expression for the KLD, we calculate
-the KLD using evaluations of the PDF under each format on a very fine, regular
-grid $(z_{1}, z_{2}, \dots, z_{N_{ff}-1}, z_{N_{ff}})$ with resolution
-$\delta_{ff}\ll\delta_{f}$.
-We review the properties of the KLD and provide some intuition for it in
-Appendix~\ref{sec:kld}.
+the discrete KLD
+\begin{align}
+ \label{eq:kld_approx}
+ \mathrm{KLD}[\hat{P}(z) | P(z)] &\approx
+\delta_{ff}\sum_{z=z_{1}}^{z_{N_{ff}}}\ P(z)\
+\log\left[\frac{P(z)}{\hat{P}(z)}\right]
+\end{align}
+using evaluations of the PDF under each format on a very fine, regular grid
+$(z_{1}, z_{2}, \dots, z_{N_{ff}-1}, z_{N_{ff}})$ with resolution $\delta_{ff}
+\ll \delta_{f}$.

-\subsubsection{Individual \pz s}
+The most important feature of the KLD is its asymmetry: it is not a distance,
+like the root mean square error, that is the same from $P(z)$ to $P'(z)$ as it
+is from $P'(z)$ to $P(z)$.
+It is a \textit{divergence} quantifying the information lost when using $P'(z)$
+to approximate $P(z)$.
+The KLD requires that both functions $P(z)$ and $P'(z)$ be probability
+distributions (always positive semidefinite and integrating to unity); this may
+need to be explicitly enforced for some approximation formats.
+The KLD is always nonnegative, and a smaller value indicates better agreement
+between the approximate representation $\hat{p}^{\mathcal{F}}(z)$ and the
+reference representation $p^{r}(z)$.
+In the Appendix, we review the properties of the KLD and provide some intuition
+for it.
+
+Additionally, we consider the percent error
+\begin{align}
+ \label{eq:percent_error}
+ \Delta_{m}[\hat{P} | P] &= \frac{M_{m}[P] -
+M_{m}[\hat{P}]}{M_{m}[P]}\times100\%
+\end{align}
+of the $m^{\mathrm{th}}$ moment
+\begin{align}
+ \label{eq:moment}
+ M_{m}[P] &= \int_{-\infty}^{\infty} z^{m}\ P(z)\ dz\ \approx\
+\delta_{ff}\sum_{z=z_{1}}^{z_{N_{ff}}}\ z^{m}\ P(z)
+\end{align}
+of a PDF.
+We note that $M_{0}[P]=1$ for all properly normalized probability
+distributions, $M_{1}[P]=\bar{z}$ is the \textit{mean}, $M_{2}[P]$ is the
+\textit{variance}, and $M_{3}[P]$ is the \textit{kurtosis}.
+Though the first few moments are not in general sufficient to characterize a
+highly structured probability distribution, they are included in this analysis
+because they can prove useful in setting ballpark estimates of the influence of
+different systematics in various science cases.
+
+\subsubsection{Individual \pz\ metrics}
\label{sec:individual_metric}
Some science applications rely on the recovery of individual galaxy \pz s that,
-for example, may be used as the basis for targeting spectroscopic follow up for
-a variety of science applications.
-For this purpose, we also calculate the KLD of each individual \pz\ in our
-catalogs and then characterize the distribution of KLD values (which is itself
-a PDF) by its first, second, and third moments (the mean, variance, and
-kurtosis, respectively).
-We use these aggregate statistics to observe how the approximate individual \pz
-s for each dataset vary with the choice of parametrization.
+for example, may be used as the basis for constraining the masses of
+\citep{applegate_weighing_2014} or finding \citep{radovich_searching_2017}
+galaxy clusters.
+For this purpose, we calculate the KLD of each individual \pz\ in our catalogs
+and then characterize the distribution of KLD values (which is itself a PDF) by
+its $m=1,\ 2,\ 3$ moments.
+We also calculate the percent error on the $m=1,\ 2,\ 3$ moments of each \pz\
+under all parametrizations and use the median and interquartile range of the
+moment percent error distribution $p(\Delta_{m}[\hat{p}_{i}])$ of the ensemble.
+We use these aggregate statistics to observe the fidelity of individual \pz\ +approximations for each dataset as a function of parametrization. \subsubsection{Stacked $\hat{n}(z)$ estimator} \label{sec:stacked_metric} +In addition to considering how the choice of storage parametrization affects +the recovery of individual \pz s, we also demonstrate how one might use +\texttt{qp} to choose the best parametrization for a particular science case. +We encourage \texttt{qp} users to develop a metric around their own \pz\ use +cases, as the optimal parametrization may not be shared among all science +applications of \pz s. + In cosmology, \pz s have thus far been used almost exclusively to estimate the redshift distribution function $n(z)$ necessary for calculating the correlation -functions used by many cosmological probes. +functions used by many cosmological probes \citep{clampitt_galaxygalaxy_2017, +hildebrandt_kids-450:_2017}. The most common way to estimate the redshift distribution function for a sample -of $N_{g}$ galaxies is to sum the \pz s according to +of $N_{g}$ galaxies is to average the \pz s according to \begin{align} \label{eq:nz} - \hat{n}(z) &\equiv \frac{1}{N_{g}}\ \sum_{k=1}^{N_{g}}\ \hat{p}_{k}(z). + \hat{n}(z) &\equiv \frac{1}{N_{g}}\ \sum_{k=1}^{N_{g}}\ \hat{p}_{k}(z), \end{align} -Here, the estimator is normalized so that it, too, is a PDF. +a procedure producing what we call the stacked estimator $\hat{n}(z)$ of the +redshift distribution function \citep{harnois-deraps_kids-450:_2017, +hoyle_dark_2017}.\footnote{ +Equation~\ref{eq:nz} is sometimes modified by weights specific to each galaxy +based on the relative prevalence of galaxies with similar photometry in a +reference population \citep{sheldon_photometric_2012, +troster_cross-correlation_2017} +} While we do not recommend this approach to estimating the redshift distribution -(see Malz and Hogg, et al.\ (in preparation)), we use it here on the assumption -that any metric calculated on a more principled estimator will have similar -behavior with respect to the parametrization of the \pz\ catalog. -Our primary metric is therefore the KLD \textit{from} the stacked estimator of -a catalog of evaluations of reconstructed \pz s \textit{to} the stacked -estimator of a catalog of evaluations of the true \pz s. +(see \citet{choi_cfhtlens_2016} for justification and Malz et al., in +preparation for an alternative method), we use it here to demonstrate +generically how one would optimize the choice of \pz\ parametrization around a +familiar science application. + +As the stacked estimator is normalized so that it, too, is a PDF, the KLD +\textit{from} the stacked estimator of a catalog of evaluations of +reconstructed \pz s \textit{to} the stacked estimator of a catalog of +evaluations of the original, high-resolution \pz s serves as a metric for a +specific science use case of \pz s. +Because the accuracy of lower-order moments of the redshift distribution +function dominates the weak lensing error budget, we also compare the percent +error on the $m=1,\ 2,\ 3$ moments of $\hat{n}(z)$. +However, this information may be less relevant due to the broad range of +redshifts and small number of galaxies considered in each instantiation. +Furthermore, we note that the dominance of the first few moments of +$\hat{n}(z)$ may not always hold true as the methodology of \pz\ usage in +cosmology evolves. 
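For reference, when all representations are evaluated on the shared fine grid,
these metrics reduce to a few lines of \texttt{numpy}; the sketch below (with
hypothetical function names, independent of the \qp\ interface) implements
Equations~\ref{eq:kld_approx}, \ref{eq:moment}, and~\ref{eq:percent_error}.
\begin{verbatim}
import numpy as np

def kld(p_approx, p_ref, dz):
    # Discrete KLD (Equation eq:kld_approx) in nats; both inputs are
    # evaluations on the same regular grid of spacing dz and must be
    # nonnegative and integrate to unity.
    good = p_ref > 0
    p_hat = np.maximum(p_approx[good], 1e-300)  # guard against log(0)
    return dz * np.sum(p_ref[good] * np.log(p_ref[good] / p_hat))

def moment(p, z_grid, m, dz):
    # m-th moment (Equation eq:moment) evaluated on the same grid.
    return dz * np.sum(z_grid ** m * p)

def moment_percent_error(p_approx, p_ref, z_grid, m, dz):
    # Percent error on the m-th moment (Equation eq:percent_error).
    ref = moment(p_ref, z_grid, m, dz)
    return 100.0 * (ref - moment(p_approx, z_grid, m, dz)) / ref
\end{verbatim}
The same functions apply unchanged to the stacked estimator of
Section~\ref{sec:stacked_metric}, since $\hat{n}(z)$ is itself a normalized
PDF evaluated on the same grid.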
\section{Photo-z Test Data}

@@ -371,409 +501,639 @@ \section{Photo-z Test Data}
With the expectation that the optimal parametrization for approximating \pz s may differ according to the properties of the original photometric data, we demonstrate a procedure for vetting \pz\ parametrizations on a pair of mock
-datasets, each intended to be realistic projections of the anticipated LSST \pz
-s.
-All \pz s were fit using the publicly available Bayesian Photometric Redshift
-(BPZ) code \citep{benitez_bayesian_2000}, which employs spectral energy
-distribution (SED) fitting to a template library.
+datasets, each intended to be a realistic prediction of a subset of the
+anticipated LSST \pz s.
+All \pz s were fit to LSST 10-year $ugrizy$ magnitudes and errors
+\citep{ivezic_lsst:_2008} using the publicly available Bayesian Photometric
+Redshift (BPZ) code \citep{benitez_bayesian_2000}, which employs fitting to a
+library of spectral energy distribution (SED) templates.
The choice of \pz\ estimation method, however, is not relevant to this study; so long as the mock \pz s are \textit{realistically complex}, meaning they take shapes similar to those we expect to see in \pz s from real datasets with similar photometric properties, it does not matter whether the \pz s produced by BPZ are accurate redshift posteriors.
-We seek only to optimize the fidelity of the stored \pz\ relative to the \pz\
-output by a representative \pz\ fitting code.
+We seek only to optimize the fidelity of the stored \pz\ relative to the
+original \pz\ from a representative \pz\ fitting code.
\citep[See][Schmidt, et al.\ in preparation for other work comparing the
-accuracy of \pz s produced by different methods.]{tanaka_photometric_2017}
+accuracy of \pz s produced by different methods.]{tanaka_photometric_2017,
+jong_third_2017, amaro_metaphor:_2016}
+As BPZ is a widely used and well established method, we assume that the \pz s
+produced by it are of representative complexity.
+The default format of BPZ is an $N_{ff}>200$ gridded parametrization with
+resolution exceeding the available storage for an LSST-like survey.
Because we believe that each galaxy has an underlying redshift interim posterior probability distribution that is a continuous function, to which the output of BPZ is itself a high-resolution approximation in the form of
-evaluations on a grid, we fit the gridded \pz\ with a Gaussian mixture model
-that we designate as the "true" \pz\ for our accuracy tests.
-
-\subsection{Higher-quality data mock catalog}
+evaluations on a grid, we fit each gridded \pz\ with a Gaussian mixture model
+that we designate as the reference representation $p^{r}(z)$ for our tests.
+The number of components of the mixture model is set to the $99^{\mathrm{th}}$
+percentile of the modality distribution of the \pz\ catalog in question.
+
+\begin{figure*}
+ \begin{center}
+ \includegraphics[width=\columnwidth]{figures/graham_pzs.pdf}
+ \includegraphics[width=\columnwidth]{figures/schmidt_pzs.pdf}
+ \caption{
+ Example \pz s from the two mock LSST datasets.
+ Left: The \mgdata mock photometry yields largely narrow, unimodal \pz s.
+ Right: The \ssdata mock photometry contains a higher proportion of broad
+and/or multimodal \pz s.
+ \label{fig:example_pzs}}
+ \end{center}
+\end{figure*}
+
+\subsection{\Mgdata data mock catalog}
\label{sec:graham}
-Our first dataset is an $N_{g}\approx30,000$ object subset of the simulated
-galaxy catalog used for LSST photometric redshift experiments by Graham, et
-al.\ (in preparation).
-The data builds on the Millennium simulation \citep{springel_simulations_2005}, -and in particular the LC DEEP catalog based on the galaxy formation models of -\cite{gonzalez-perez_how_2014}, and was created using the lightcone -construction techniques described by \cite{merson_lightcone_2013}. -We limit the sample to galaxies with a catalog $i$-band magnitude of $i<25$ and -true redshifts $z<3.5$. -As in Graham, et al. (in preparation), we simulate observed apparent magnitudes -from the true catalog magnitudes by adding a normal random scatter with a -standard deviation equal to the predicted magnitude error for each galaxy (from -Section 3.2.1. of \citealt{ivezic_lsst:_2008}, using the software of -\citealt{connolly_end--end_2014}, assuming a mean airmass of 1.2 and a 10-year -accumulation of 56, 80, 184, 184, 160, and 160 visits in filters $ugrizy$, -respectively). -We also ignore any magnitudes fainter than the predicted 10-year limiting -magnitudes in each filter, $u<26.1$, $g<27.4$, $r<27.5$, $z<26.1$, and -$y<24.9$, as a realistic simulation of non-detections. - -The \pz\ estimates for this simulated catalog use the CFHTLS set of spectra -\citep{ilbert_accurate_2006} as BPZ templates and the default parameter -settings for BPZ, except that we impose a maximum photometric redshift of 3.5 -and allow BPZ to use the $i$-band as a magnitude prior during the photo-$z$ fit. +Our first dataset is an $N_{g} = 10^{5}$ object subset of the +\citet{graham_photometric_2017} simulated galaxy catalog used for LSST +photometric redshift experiments. +The data builds on the Millennium simulation of large-scale structure +\citep{springel_simulations_2005}, the galaxy formation models of +\citet{gonzalez-perez_how_2014}, and the lightcone construction techniques of +\citet{merson_lightcone_2013}. +The apparent $ugrizy$ magnitudes are derived from the true magnitudes using the +aforementioned 10-year LSST errors using the software of +\citet{connolly_end--end_2014}. +The sample is limited to galaxies with a catalog $i$-band magnitude of $i<25$ +and true redshifts $z<3.5$, omitting any magnitudes fainter than the predicted +10-year limiting magnitudes in each filter ($u<26.1$, $g<27.4$, $r<27.5$, +$z<26.1$, and $y<24.9$) to realistically simulate non-detections. + +The \pz\ estimates for this simulated catalog use BPZ templates based on the +VIsible MultiObject Spectrograph Very Large Telescope Deep Survey set of +spectra \citep{fevre_vimos_2005}, as in \citet{ilbert_accurate_2006}. +This catalog also uses the default parameter settings for BPZ with the two +additions of a photometric redshift maximum of 3.5 and an $i$-band magnitude +prior. The \pz s from BPZ are in the form of $N_{ff} = 351$ evaluations of the probability density on a regular grid of redshifts $0.01 < z < 3.51$, a -subsample of which are plotted in Figure~\ref{fig:graham_pzs}. - -\begin{figure} - \includegraphics[width=0.9\columnwidth]{figures/graham_pzs.png} - \caption{Example \pz s from the mock LSST data of Graham, et al.\ (in -preparation). - The high-quality mock photometry yields largely narrow, unimodal \pz s. - \label{fig:graham_pzs}} -\end{figure} - +subsample of which are shown in the left panel of Figure~\ref{fig:example_pzs}. As the figure shows, the \pz s from this dataset tend to be unimodal and -sharply peaked, as if coming from "higher quality" photometric data due to the +sharply peaked, as if coming from brighter photometric data due to the conservative cuts in photometric magnitudes of this dataset. 
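As described at the start of this section, each gridded BPZ output is
condensed into a Gaussian mixture reference representation $p^{r}(z)$.
Purely for illustration (this is one possible approach, not necessarily the
procedure used to build the reference catalogs here), such a fit could be
performed by resampling the grid and fitting with \texttt{scikit-learn}:
\begin{verbatim}
import numpy as np
from sklearn.mixture import GaussianMixture

def fit_reference_gmm(z_grid, p_grid, n_components=3, n_draws=5000, seed=0):
    # Draw redshifts from the gridded PDF and fit a Gaussian mixture to the
    # draws; returns the mixture weights, means, and standard deviations.
    rng = np.random.default_rng(seed)
    draws = rng.choice(z_grid, size=n_draws, p=p_grid / p_grid.sum())
    # jitter within a grid cell so the draws are not exactly gridded
    draws = draws + rng.uniform(-0.5, 0.5, n_draws) * (z_grid[1] - z_grid[0])
    gmm = GaussianMixture(n_components=n_components, random_state=seed)
    gmm.fit(draws.reshape(-1, 1))
    return gmm.weights_, gmm.means_.ravel(), np.sqrt(gmm.covariances_.ravel())
\end{verbatim}
For the \ssdata catalog described below, the same sketch would use five
components rather than three.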
-We produce "true" \pz s for the analysis by fitting a three-component Gaussian
-mixture model to each \pz\ in the catalog.
-We then calculate the three approximations to each \pz\ and evaluate their
-accuracy using the metrics described above.
+The brighter catalog reference \pz s are three-component Gaussian mixtures fit
+to this data.

-\subsection{Lower-quality data mock catalog}
+\subsection{\Ssdata data mock catalog}
\label{sec:schmidt}
Our second dataset is an independent simulation of the expected LSST galaxy
-sample.
-Here, we use the Buzzard-highres-v1.0 mock galaxy catalog of deRose, et al.\
-(in preparation) of galaxies with SEDs drawn from an empirical library of
-$\sim500,000$ SEDs from the Sloan Digital Sky Survey (SDSS).
-Given an SED, redshift, and absolute $r$-band magnitude for each galaxy, we
-compute the expected apparent magnitudes and magnitude errors in the six
-broadband LSST filters ($ugrizy$), assuming the full 10-year depth of the
-survey using the simple model of \citet{ivezic_lsst:_2008}.
-The catalog contains $N_{g}\approx100,000$ galaxies to a depth of $i<26.9$, 1.5
-magnitudes deeper than the expected "LSST Gold sample" of galaxies that will
-have $S/N\gtrsim30$ in multiple bands.
-
-In implementing BPZ, we createed a custom Bayesian prior using a subset of the
-Buzzard-highres-v1.0 catalog and a spanning template set via a simple k-means
-clustering algorithm based on $100$ of the SDSS SEDs used in creating the
-Buzzard catalog.
+sample, the Buzzard-highres-v1.0 mock galaxy catalog of deRose et al.\ (in
+preparation), comprising galaxies with SEDs drawn from an empirical library of
+$\sim5\times10^{5}$ SEDs from the Sloan Digital Sky Survey (SDSS).
+Given an SED, redshift, and absolute $r$-band magnitude for each galaxy, the
+$ugrizy$ magnitudes and magnitude errors are derived assuming the
+aforementioned 10-year LSST depths.
+The catalog contains $N_{g} \approx 10^{5}$ galaxies with $z<2.105$ to a depth
+of $i<26.9$, 1.5 magnitudes deeper than the expected LSST gold sample of
+galaxies \citep{lsst_science_collaboration_lsst_2009}, which will have $S/N
+\gtrsim 30$ in multiple bands.
+
+We use a custom BPZ prior derived from a subset of the Buzzard-highres-v1.0
+catalog and a spanning template set constructed via a simple k-means clustering
+algorithm based on $100$ of the SDSS SEDs used to create the Buzzard catalog.
BPZ produces \pz s in the format of probability density evaluations on a regular grid of $N_{ff}=211$ redshifts $0.005\leq z\leq2.105$, a subsample of
+The exceptional depth and known degeneracies (e.~g.~the Lyman/Balmer break +degeneracy) lead us to expect the presence of multimodal \pz s observed in the +figure. +The fainter catalog reference \pz s are five-component Gaussian mixtures fit to +this data. \section{Results \& Discussion} \label{sec:results} - -We calculate the metrics of Section~\ref{sec:metric} on 10 random -instantiations of catalogs of $N_{g}=100$ galaxies drawn randomly from each of -the datasets discussed in Section~\ref{sec:data} and with each of $N_{f}=3,\ -10,\ 30,\ 100$ stored parameters. +We evaluate the metrics of Section~\ref{sec:metric} on 10 random instantiations +of catalogs of $N_{g}=100$ galaxies drawn randomly from each of the datasets +discussed in Section~\ref{sec:data} and with each of $N_{f}=3,\ 10,\ 30,\ 100$ +stored parameters for the three formats of Section~\ref{sec:approx}. We then illustrate how our results could be used to choose an appropriate parametrization for each dataset given constraints on the distribution of KLDs -of individual \pz s, the KLD for $\hat{n}(z)$, or $N_{f}$ +or moment percent errors of individual \pz s, the KLD or moment percent error +of a science metric ($\hat{n}(z)$ in this case), or the available storage +capacity. + +\subsection{Individual \pz s} +\label{sec:individual_results} + +We compare our three formats on the basis of the distributions of the KLD +calculated for every \pz\ in the dataset. +An example of an individual \pz\ KLD distribution for the \mgdata dataset with +$N_{f}=10$ is shown in Figure~\ref{fig:individual}. +\begin{figure} + \begin{center} + \includegraphics[width=\columnwidth]{figures/individual_kld.pdf} + \caption{The distribution of log-KLD values for $N_{g}=100$ \pz s from the +\mgdata dataset with $N_{f}=10$ over the quantiles (purple with dashed border), +samples (green with dash-dotted border), and histogram (orange with dotted +border) formats. + In this instantiation, the samples format has a lower median KLD than the +quantiles format, which has a lower median KLD than the piecewise constant +format. + Note that the distributions are over log-KLD, so the ordering of the +formats by the breadth of the log-KLD distribution is the same as the order by +the median. + \label{fig:individual}} + \end{center} +\end{figure} + +To distill what is observed in the ten instantiations of plots like +Figure~\ref{fig:individual} for both datasets and all parametrizations, we +compare the moments of the distributions of metric values for the distribution +of the KLDs of individual \pz s under each parametrization, summarized in +Figure~\ref{fig:kld_moments}. +While it is obvious that one would like the mean (first moment) of the KLD +distribution to be low, interpretation of higher-order moments is less clear. +To meaningfully interpret the KLDs of individual \pz s, it will be necessary +for those using \pz s in their science to calculate the requirements on the +acceptable degree of information loss. 
+\begin{figure*}
+ \begin{center}
+ \includegraphics[width=\columnwidth]{figures/graham_pz_kld.pdf}
+ \includegraphics[width=\columnwidth]{figures/schmidt_pz_kld.pdf}
+ \caption{
+ The means of the mean ($\bigstar$), variance ($+$), and kurtosis ($\times$)
+of the log-KLD distributions for each dataset as a function of the number
+$N_{f}$ of stored parameters for the quantile (dashed purple line), samples
+(dash-dotted green line), and histogram (dotted orange line) formats with
+$1\sigma$ Gaussian error bars based on 10 instantiations of 100 galaxies, which
+are offset about $N_{f}$ to improve readability.
+ Left panel: The distribution of individual \pz\ KLD values of the \mgdata
+mock catalog is most well-behaved when the \pz s are stored as samples, except
+at large $N_{f}$.
+ Right panel: The \ssdata mock catalog achieves equivalence of the formats
+in the moments of the log-KLD distributions at a much lower $N_{f}$, ultimately
+showing that the histogram format is most well-behaved at all but the smallest
+$N_{f}$.
+ \label{fig:kld_moments}}
+ \end{center}
+\end{figure*}
+
+Figure~\ref{fig:kld_moments} is rich in information.
+For both datasets, the behavior of the first three log-moments of the log-KLD
+distribution is highly correlated for a given format and number of parameters.
+The error bars on the log-KLD log-moments increase at high $N_{f}$ for all
+formats on both datasets beyond what one would expect simply based on the log
+scaling.
+
+The \mgdata dataset has higher log-KLD log-moments than the \ssdata dataset at
+all $N_{f}$ and across all formats, meaning information loss is enhanced for
+more strongly featured data; this observation is not surprising because the
+narrow, unimodal \pz s of the \mgdata dataset have long tails of very low
+probability that are emphasized by the KLD.
+The \ssdata dataset shows almost no change in the log-KLD log-moments between
+$N_{f}=3$ and $N_{f}=10$ parameters, but both datasets otherwise exhibit a
+steady decrease in all moments for the quantile and samples formats as $N_{f}$
+increases.
+
+The log-KLD log-moments are higher for quantiles than for samples for both
+datasets, except at $N_{f}=100$ for the \mgdata dataset; this is not an
+unexpected result because our choice of PDF reconstruction method for the
+quantile format is most susceptible to error in the tails of the distribution,
+where the KLD has the highest sensitivity.
+The histogram format's log-KLD log-moments are higher than for other formats at
+the lowest $N_{f}$ and steadily decrease in a manner similar to the other
+formats, except at the highest $N_{f}$ values where the histogram format's
+log-KLD log-moments decrease much more quickly.
+
+We also examine the percent error on the first three moments of the \pz s under
+each approximation, using the base-10 log for interpretability.
+Because the distribution of moment percent errors is highly non-Gaussian due to
+a small number ($<1\%$) of truly extreme outliers for both datasets, across all
+$N_{f}$ and all formats, we substitute the interquartile range for traditional
+$1\sigma$ Gaussian error bars.
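The aggregation behind these error bars is straightforward; for one format,
dataset, and $N_{f}$, a sketch of the summary statistics (with a hypothetical
helper name, taking the absolute value of the percent error before the
logarithm) is:
\begin{verbatim}
import numpy as np

def summarize_log_percent_errors(delta_m):
    # Median and interquartile range of the log10 percent errors for one
    # format, dataset, and N_f; delta_m holds the per-galaxy percent errors.
    log_err = np.log10(np.abs(delta_m))
    q25, med, q75 = np.percentile(log_err, [25, 50, 75])
    return med, (q25, q75)
\end{verbatim}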
+\begin{figure*}
+ \begin{center}
+ \includegraphics[width=\columnwidth]{figures/graham_pz_err.pdf}
+ \includegraphics[width=\columnwidth]{figures/schmidt_pz_err.pdf}
+ \caption{
+ The median $\log_{10}$-percent errors on the mean ($\bigstar$), variance
+($+$), and kurtosis ($\times$) of the \pz s for each dataset as a function of
+the number $N_{f}$ of stored parameters per \pz\ for the quantile (dashed
+purple line), samples (dash-dotted green line), and histogram (dotted orange
+line) formats with interquartile range error bars based on 10 instantiations of
+100 galaxies, where the $\log_{10}$-percent errors and their interquartile
+ranges are offset around $N_{f}$ to improve readability.
+ Left panel: The \mgdata \pz\ ensemble's moment percent errors are minimized
+by the quantile format at all $N_{f}$.
+ Right panel: The \ssdata \pz\ ensemble's moment percent errors are high for
+all formats at low $N_{f}$ but distinct at high $N_{f}$, with the quantile
+format overall outperforming the samples and histogram formats.
+ \label{fig:pz_moment_errs}}
+ \end{center}
+\end{figure*}
+
+Though the $\log_{10}$-percent error of the moments of individual \pz s also
+exhibits significant correlation between the moments for a given
+parametrization, the behavior is otherwise markedly different from that of the
+log-moments of the \pz\ ensemble's log-KLD distribution.
+The percent errors of the moments of the approximate \pz s are overall lower in
+the \mgdata dataset than the \ssdata dataset over the same range of number of
+stored parameters; this is expected because there is simply less information to
+capture in the \mgdata dataset.
+
+For the \mgdata dataset, the quantile format enables sub-percent errors in \pz\
+moments at the lowest $N_{f}$, a level that cannot be achieved until $N_{f}>30$
+parameters for the histogram format and $N_{f}>100$ for the samples format.
+Furthermore, for the \mgdata dataset, the quantile format minimizes the percent
+error at all $N_{f}$, whereas the samples format outperforms the histogram
+format at low $N_{f}$ but the histogram format outperforms the samples format
+at high $N_{f}$.
+Again, this behavior is expected of the narrow, unimodal \pz s of the \mgdata
+dataset because large histogram bins are ineffective at capturing small-scale
+structure and including more samples does not significantly improve
+preservation of such features.
+
+At $N_{f}=3$, the ordering of the moment percent errors of the formats for the
+\ssdata dataset is the same as for the \mgdata dataset.
+In the \ssdata dataset, the inclusion of $N_{f}=30$ parameters decreases the
+moment percent error of the histogram format more significantly than the
+quantile or samples formats, to the point that the histogram and quantile
+formats have comparable moment percent errors.
+At higher $N_{f}$ in the \ssdata dataset, the quantile and histogram formats
+continue to improve faster than the samples format, with the percent errors on
+the \pz\ moments being consistently lower for the quantile format than for the
+histogram format.
+For the broad, multimodal \pz s of the \ssdata dataset, sub-percent accuracy in
+the moments is achieved only with $N_{f}\geq30$ under the quantile format and
+$N_{f}=100$ under the histogram format.
\subsection{Stacked $\hat{n}(z)$ estimator}
\label{sec:stacked_results}
-Figure~\ref{fig:stacked} shows an example of $\hat{n}(z)$ estimated from \pz s
+Figure~\ref{fig:stacked} shows an example of $\hat{n}(z)$ derived from \pz s
 reconstructed from just $N_{f}=10$ parameters under each of our three
 approximation formats, evaluated on the same fine grid as the input \pz s.
+The strong features in the curve are due to the very small sample size of $100$
+galaxies.
 As expected, the stacked histogram is quite coarse because of the step function
-interpolation.
-The samples and quantiles can be interpolated such that the stacked $n(z)$
-estimator of the approximation is almost indistinguishable from the true
-stacked estimator.
-
+interpolation, while the stacked estimators based on \pz\ representations
+interpolated from the stored samples and quantiles are much closer to the
+stacked estimator of the original, high-resolution \pz s.
+The KLD for each format is also included in the plot; in this instance, the KLD
+is lowest for the quantile format and highest for the histogram format.
 \begin{figure}
-  \includegraphics[width=0.9\columnwidth]{figures/stacked.png}
-  \caption{An example of the stacked estimator of the redshift distribution,
-for a subsample of $N_{g}=100$ galaxies drawn from the higher-quality data mock
-catalog and with just $N_{f}=10$ parameters used for each \pz.
-  The most striking characteristic of $\hat{n}(z)$ with a relatively small
+  \begin{center}
+  \includegraphics[width=\columnwidth]{figures/stacked.pdf}
+  \caption{An example of the stacked estimator of the redshift distribution,
+for a subsample of $N_{g}=100$ galaxies drawn from the \ssdata mock
+catalog and with $N_{f}=10$ parameters used for each \pz; the small-scale
+features are due to the small number of galaxies in the sample.
+  The most striking characteristic of $\hat{n}(z)$ with a relatively small
 number of parameters on a small number of galaxies is the coarseness of the
-histogram format (orange dotted line) relative to the quantile format (purple
-dashed line) and samples format (green dash-dotted line), both of which are
-fairly close to $\hat{n}(z)$ derived from evaluating the true \pz s (thick gray
-line).
-  \label{fig:stacked}}
+histogram format (orange dotted line) relative to the quantile (purple dashed
+line) and samples (green dash-dotted line) formats, both of which are fairly
+close to $\hat{n}(z)$ derived from evaluating the original, high-resolution \pz
+s (thick gray line).
+  \label{fig:stacked}}
+  \end{center}
 \end{figure}
+Again, due to the variation between $N_{g}=100$ galaxy subsamples, we repeat
+the procedure that produced Figure~\ref{fig:stacked} 10 times to generate a
+distribution over the KLD of the stacked estimator of the redshift distribution
+for each format for each dataset.
 The $\hat{n}(z)$ KLD values for each parametrization on both mock datasets are
 collected and plotted in Figure~\ref{fig:kld}, with error regions based on the
 variance between the 10 instantiations.
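The stacked estimator itself is straightforward to compute: it is the normalized sum of the individual p(z) evaluated on a shared grid, an operation the qp ensemble class provides in the actual pipeline. A minimal numpy sketch, with hypothetical array names, is

    import numpy as np

    def stack_pzs(z_grid, pdfs):
        # stacked estimator n_hat(z): normalized average of the individual p(z)
        n_hat = np.mean(pdfs, axis=0)
        return n_hat / np.trapz(n_hat, z_grid)

    # pdfs_true and pdfs_approx would be (N_g, N_grid) arrays holding the
    # original and reconstructed p(z) evaluations for one 100-galaxy subsample;
    # the KLD between the two stacks is then evaluated as in the Appendix:
    # n_hat_true = stack_pzs(z_grid, pdfs_true)
    # n_hat_approx = stack_pzs(z_grid, pdfs_approx)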
-The two datasets share some features:
+\begin{figure*}
+  \begin{center}
+  \includegraphics[width=\columnwidth]{figures/graham_nz_kld.pdf}
+\includegraphics[width=\columnwidth]{figures/schmidt_nz_kld.pdf}
+  \caption{The KLD between $\hat{n}(z)$ derived from approximate \pz\
+representations and $\hat{n}(z)$ derived from the original, high-resolution \pz
+s, as a function of the number $N_{f}$ of stored parameters, for the quantile
+(purple dashed line), samples (green dash-dotted line), and histogram (orange
+dotted line) formats.
+  Shaded regions indicate the $1\sigma$ Gaussian errors derived from 10
+subsamples of 100 galaxies and lines indicate the mean of the distribution.
+  Left panel: The \mgdata \pz\ catalog's KLD of $\hat{n}(z)$ is minimized by
+the quantile format at all $N_{f}$.
+  Right panel: The \ssdata \pz\ catalog's KLD of $\hat{n}(z)$ is minimized by
+the quantile format at all $N_{f}$, though the samples format also has a
+comparably low KLD.
+  \label{fig:kld}}
+  \end{center}
+\end{figure*}
+Figure~\ref{fig:kld} shows that the two datasets clearly share some features:
 \begin{enumerate}
+\item As expected, the KLD drops as the number of stored parameters increases,
+for all formats.
+\item The quantile format minimizes the KLD at all numbers of stored parameters
+considered. (Note that the appearance of a larger error region is due to the
+log scaling.)
 \item The histogram format leads to substantial loss of information relative to
-the other formats.
-\item The samples and quantile formats have similar KLDs, particularly at low
-$N_{f}$.
-\item The quantile format achieves the lowest KLD of all formats in the high
-$N_{f}$ regime.
-\item The samples format is more consistent in terms of the variance between
-instantiations.
-\item The samples format asymptotically approaches a limit in KLD at
-$N_{f}\geq30$, likely because that is the limit at which the samples-induced
-shot noise becomes subdominant.
+the other formats, except at large numbers of stored parameters, where it is
+comparable with the samples format.
+\end{enumerate}
+However, the behavior of the $\hat{n}(z)$ KLD also differs between the two
+datasets, reflecting the significant impact of the data properties on this
+metric:
+\begin{enumerate}
+\item The \ssdata dataset achieves lower KLD values than the \mgdata dataset
+for all formats at all values of $N_{f}$ considered, likely a consequence of
+the strong features present in $\hat{n}(z)$ for the \mgdata dataset in our
+subsamples of 100 galaxies.
+\item The rate at which the KLD of $\hat{n}(z)$ improves with increasing
+$N_{f}$ is overall slower for the \mgdata dataset than for the \ssdata dataset;
+in other words, saving more parameters has a greater marginal benefit for the
+\ssdata dataset than for the \mgdata dataset.
+\item The $\hat{n}(z)$ KLD for the samples format is not substantially higher
+than for the quantile format in the \ssdata dataset but is for the \mgdata
+dataset, which may reflect the subjectivity of the reconstruction scheme used
+for those two formats.
 \end{enumerate}
-Because the data quality has a significant impact on the behavior of the
-metric, we proceed to discuss the two datasets separately, in terms of relative
-performance between formats and absolute $\hat{n}(z)$ KLDs as a function of
-$N_{f}$.
-
-For the higher quality data mock catalog, the histogram format not only has a
-consistently higher $\hat{n}(z)$ KLD but one that does not improve with
-increasing $N_{f}$.
-This poor performance independent of resource allocation is actually expected -because, due to the narrow, unimodal \pz s of Figure~\ref{fig:graham_pzs}, a -great majority of the probability density will fall into a single bin -$c^{h}_{i=j}$ with all other $c^{h}_{i\neq j}\approx 0$ \textit{regardless} of -$N_{f}$. -The quantile format consistently produces the lowest $\hat{n}(z)$ KLDs, with a -larger improvement over the samples format as $N_{f}$ increases. -The improvement with larger $N_{f}$ is a surprising result because the shot -noise due to samples should decrease with higher $N_{f}$. -The quantile format also has a larger variance between instantiations than the -samples format, likely due to the imperfections of the procedures for deriving -the quantile parameters $\vec{c}^{q}$ from the true $p(z)$ and for -reconstructing $\hat{p}^{q}(z)$ from the quantile parameters $\vec{c}^{q}$. - -In the lower quality data mock catalog, the histogram format does improve with -increasing $N_{f}$, however, it takes a whopping $N_{f}=100$ stored parameters -before the histogram format can supercede the KLD of the samples and quantile -formats at a mere $N_{f}=3$ stored parameters, where the histogram format takes -a staggering value $KLD>1$. -With \pz s that take nontrivial values across a wider range of redshifts, the -histogram format is expected to capture more structure with more bins, -consistent with the aforementioned improvement. -The samples format's asymptote is surprisingly achieved at a lower KLD for the -low-quality data mock catalog than for the high-quality data mock catalog. -The variance between instantiations of the quantile format increases at high -$N_{f}$, a possible indication of room for improvement with the procedures for -deriving the quantile parameters $\vec{c}^{q}$ from the true $p(z)$ and for -reconstructing $\hat{p}^{q}(z)$ from the quantile parameters $\vec{c}^{q}$ - -\begin{figure} - \includegraphics[width=0.9\columnwidth]{figures/graham_kld.png}\\ - \includegraphics[width=0.9\columnwidth]{figures/schmidt_kld.png} - \caption{KLD between approximate $\hat{n}(z)$ and true $\hat{n}(z)$ as a -function of number of stored parameters, for the three different approximation -schemes: quantiles (purple dashed line), samples (green dash-dotted line), and -histogram (orange dotted line). - Top panel: The mock catalog of higher-quality data of -Section~\ref{sec:graham} favors the quantiles format and strongly disfavors the -histogram format, with quantiles being slightly more favorable than samples. - Bottom panel: The histogram format is not well-suited to the mock catalog of -lower-quality data of Section~\ref{sec:schmidt}, while the quantiles and -samples formats perform comparably well. - \label{fig:kld}} -\end{figure} - -We interpret Figure~\ref{fig:kld} in the context of constraints on storage -allocation imposed by the survey and constraints on the acceptable degree of -information loss imposed by the science requirements. -The former corresponds to a vertical line at a given $N_{f, lim}$ in Fig. -\ref{fig:kld}; the best format would be the one that achieves the lowest KLD at -$N_{f, lim}$. -The latter corresponds to a horizontal line at $KLD_{lim}$ in -Figure~\ref{fig:kld}; the best format would be the one that achieves -$KLD_{lim}$ at the smallest value of $N_{f}$. 
-If there is some flexibility in the acceptable degree of information loss on -$\hat{n}(z)$ and/or the allocation of storage for \pz s, as is the case for -LSST, it may be best to examine the asymptotic behavior of the KLD as a -function of $N_{f}$ for each format considered. -For example, if the KLD can be significantly reduced with a slightly larger -$N_{f}$, it may be possible to request additional storage capacity for the -survey's \pz s. - -\subsection{Individual \pz s} -\label{sec:individual_results} - -We also compare our three parametrizations on the basis of the distributions of -the KLD calculated for every \pz\ in the dataset's ensemble. -An example of the individual \pz\ KLD distribution is shown in -Figure~\ref{fig:individual} with $N_{f}=10$. - -\begin{figure} - \includegraphics[width=0.9\columnwidth]{figures/individual.png} - \caption{The distribution of log-KLD values for $N_{g}=100$ \pz s from the -low-quality dataset with $N_{f}=10$ over the quantiles (purple with dashed -border), samples (green with dash-dotted border), and histogram (orange with -dotted border) formats. - For the individual \pz s, the samples format has far better KLDs than the -histogram or quantile formats, which are comparable to one another. - \label{fig:individual}} -\end{figure} - -To distill what is observed in plots like Figure~\ref{fig:individual} for both -datasets and all parametrizations, we compare the moments of the distributions -of metric values for the distribution of the KLDs of individual \pz s under -each parametrization, summarized in Figure~\ref{fig:moments}. -While it is obvious that one would like the mean (first moment) of the KLD -distribution to be low, interpretation of higher-order moments is less clear. -In a science application that is robust to \pz\ outliers, a parametrization -with a high variance (second moment) may be acceptable, whereas in another -science application that simply requires well-characterized errors could -tolerate a higher mean in exchange for a lower variance. -To meaningfully interpret the KLDs of individual \pz s, it will be necessary -for those using \pz s in their science to calculate the requirements on the -acceptable degree of information loss. - -\begin{figure} - \includegraphics[width=0.9\columnwidth]{graham_moments.png}\\ - \includegraphics[width=0.9\columnwidth]{schmidt_moments.png} - \caption{The mean ($\bigstar$), variance ($+$), and kurtosis ($\times$) of -the log-KLD distributions for each dataset as a function of $N_{f}$ for the -quantile (purple), samples (green), and histogram (orange) formats. - The moments and their error regions are offset about $N_{f}$ to improve -readability. - Top panel: The samples format consistently minimizes the moments of the -log-KLD distribution of the higher quality data mock catalog at low $N_{f}$, -and the moments are similar across formats at the highest $N_{f}=100$. - Bottom panel: The lower quality data mock catalog exhibits no clear pattern -at low $N_{f}$ but the moments of all formats decrease at high $N_{f}$ with the -histogram format achieving the lowest moments in that regime. - \label{fig:moments}} -\end{figure} - - +We also address the relative, marginal, and absolute performance and +consistency thereof of the KLD on $\hat{n}(z)$ for each parametrization as a +function of format and $N_{f}$ for each dataset. 
+To guide this process, we interpret Figure~\ref{fig:kld} in the context of
+constraints on storage allocation imposed by the survey and constraints on the
+acceptable degree of information loss imposed by the science requirements,
+which we anticipate establishing in the future.
+
+A constraint on storage resources corresponds to a vertical line at a given
+$N_{f, \mathrm{lim}}$ in Figure~\ref{fig:kld}; the best format would be the one
+that achieves the lowest KLD at $N_{f, \mathrm{lim}}$.
+For example, if $N_{f, \mathrm{lim}}=10$ stored parameters, the quantile format
+would be optimal for the \mgdata dataset because it has the lowest KLD value by
+a large margin compared to the other formats.
+If the \ssdata dataset were subject to the same constraint, the quantile and
+samples formats would both be good candidates for a storage parametrization,
+with the quantile format opening the possibility of a lower KLD.
+If there is some flexibility in the allocation of storage for \pz s, as is the
+case for LSST, it may be best to examine the asymptotic behavior of the KLD as
+a function of the number of stored parameters for each format considered; if
+the KLD can be significantly reduced with a slightly larger $N_{f}$, it may be
+possible to request additional storage capacity for the survey's \pz s.
+
+A constraint on the acceptable loss of information due to compression and
+reconstruction of \pz s corresponds to a horizontal line at some
+$\mathrm{KLD}_{\mathrm{lim}}$ in Figure~\ref{fig:kld}; the best parametrization
+would correspond to the format that achieves $\mathrm{KLD}_{\mathrm{lim}}$ at
+the lowest $N_{f}$.
+For example, if the \mgdata dataset has $\mathrm{KLD}_{\mathrm{lim}}=10^{-2}$
+nats, the quantile parametrization with $N_{f}=3$ would be optimal due to the
+slow marginal improvement of the $\hat{n}(z)$ KLD of the quantile and samples
+formats and the high KLD for the histogram format.
+If the \ssdata dataset were subject to the same constraint, the quantile
+parametrization would again be optimal, reaching $\mathrm{KLD}_{\mathrm{lim}}$
+at the lowest $N_{f}$, in this case $N_{f}=10$.
+
+We also calculate the percent error on the moments of the stacked estimator of
+the redshift distribution, as these may be more useful than the KLD for
+understanding how the choice of \pz\ storage parametrization propagates into
+errors on cosmological quantities, since no such propagation infrastructure yet
+exists for the KLD.
+The percent error on the first three moments of the stacked estimator of the
+redshift distribution is shown in Figure~\ref{fig:nz_moment_errs}.
+Because the distribution of moment percent errors is highly non-Gaussian due to
+the small number of instantiations considered, we substitute the interquartile
+range for traditional $1\sigma$ Gaussian error bars.
+\begin{figure*}
+  \begin{center}
+  \includegraphics[width=\columnwidth]{graham_nz_err.pdf}
+\includegraphics[width=\columnwidth]{schmidt_nz_err.pdf}
+  \caption{
+  The percent error on the mean ($\bigstar$), variance ($+$), and kurtosis
+($\times$) of the stacked estimator $\hat{n}(z)$ of the redshift distribution
+for each dataset as a function of the number $N_{f}$ of stored parameters for
+the quantile (dashed purple line), samples (dash-dotted green line), and
+histogram (dotted orange line) formats with interquartile range error bars
+based on 10 instantiations of 100 galaxies, where the percent errors and their
+interquartile ranges are offset about $N_{f}$ to improve readability.
+  Left panel: The \mgdata dataset shows evolution with $N_{f}$ of the
+$\hat{n}(z)$ moment percent errors for the histogram format but none for the
+samples and quantile formats.
+  Right panel: The \ssdata dataset shows qualitatively different evolution
+with $N_{f}$ of the $\hat{n}(z)$ moment percent errors for the three formats
+and for each moment.
+  \label{fig:nz_moment_errs}}
+  \end{center}
+\end{figure*}
+In this metric, the significant impact of the data properties is quite
+apparent.
+To explain this, we draw the reader's attention to Figure~\ref{fig:stacked} and
+note that the true redshift distributions of the two datasets are similar, but
+the redshift range over which the \pz s are defined is larger for the \mgdata
+dataset than for the \ssdata dataset.
+
+In the \mgdata dataset, the evolution of the $\hat{n}(z)$ moment errors with
+$N_{f}$ differs for the histogram format relative to the samples and quantile
+formats.
+While the samples and quantile formats exhibit essentially no evolution in
+excess of the error bars between instantiations, the histogram format
+significantly underestimates the moments at low $N_{f}$, recovers them well at
+intermediate $N_{f}$, and overestimates them at high $N_{f}$.
+For $N_{f}=3$, the moments are grossly underestimated because most of the
+probability density of $\hat{n}^{r}(z)$ falls into the single lowest redshift
+bin (explaining the underestimated higher moments), and most of the probability
+density of $\hat{n}^{r}(z)$ within that bin lies above the bin center
+(explaining the underestimated mean).
+When the bins are very narrow, at $N_{f}=100$, the bins at high redshift have
+most of their probability density below the bin center, leading to slightly
+overestimated moments.
+Because the \pz s in the \mgdata dataset are so narrow and unimodal overall,
+the reconstructions of the samples and quantile parametrizations are highly
+accurate where most of the probability density is, even with low $N_{f}$, so
+the moments of the reference representation are consistently recovered to
+better than $1\%$.
+
+In the \ssdata dataset, the issues are different because the redshift range of
+the original \pz s is smaller and the \pz s themselves are broader.
+The samples format has no significant evolution in moment errors with $N_{f}$,
+the histogram format severely overestimates the higher moments at low $N_{f}$,
+and the quantile format severely underestimates the moments at low $N_{f}$,
+severely overestimates them at intermediate $N_{f}$, and moderately
+overestimates them at high $N_{f}$.
+The samples format may suffer from shot noise for broad, multimodal \pz s, but
+the result is just spikier \pz s that produce narrow features in $\hat{n}(z)$
+that do not significantly affect the moments.
+The histogram format's overestimation of the higher moments at low $N_{f}$ in
+the \ssdata dataset is caused by the bulk of the probability density of
+$\hat{n}^{r}(z)$ falling almost evenly into the two lower-redshift bins, with
+far less probability in the highest bin.
+As was hinted at in Figure~\ref{fig:kld_moments}, the quantile
+parametrization's \pz\ KLD distribution has large moments, and the KLD is most
+sensitive to a poor approximation of the tails of the distribution.
+Both the underestimation of the $\hat{n}(z)$ moments at low $N_{f}$ and the
+overestimation of the $\hat{n}(z)$ moments at intermediate $N_{f}$ are due to
+the choice of a suboptimal reconstruction scheme for quantiles that could
+undoubtedly be improved in the future.
+The quantile format's overestimation of the moments even at high $N_{f}$ can be
+explained by the fact that it is not limited to the redshift range over which
+the original \pz s were defined, a possible oversight of the \qp\
+implementation.
+A broad \pz\ may be reconstructed with probability density outside the redshift
+range of the original \pz s and then truncated and normalized prior to
+calculating the KLD.
+Because broad \pz s are more likely to occur at high redshift, this excess
+probability is more likely to be at high redshift, slightly but consistently
+inflating the moments.

 \section{Conclusions \& Future Directions}
 \label{sec:conclusions}
-
-This work develops a principled approach to choosing a parametrization for
-storing a catalog of \pz s from a survey of known data quality to balance the
-accuracy of \pz s and science products thereof reconstructed from the stored
-parameters against the available storage constraints.
+This work develops a principled, systematic approach to choosing a
+parametrization for storing a catalog of \pz s from a survey of known data
+properties, with the goal of balancing the available storage resources against
+the accuracy of the \pz s and of the science products reconstructed from the
+stored parameters.

 We demonstrate the recommended method on two realistic mock datasets
 representative of upcoming \pz\ catalogs and draw the following conclusions:
 \begin{itemize}
-  \item The optimal parametrization depends on the properties of the data and
-the science-driven metric used.
-  \item A larger number of available parameters in which to store a \pz\
-catalog does not necessarily imply a significant reduction in loss of
-information.
-  \item The histogram format has a high rate of loss of information over high-
-and low-quality data mock catalogs and across a wide range of number of stored
-parameters, particularly under an aggregate function of individual
-reconstructed \pz\ approximations.
-  \item The samples format is an excellent option for storage of high-quality
-\pz\ catalogs, balancing loss of information for both individual \pz\ s and a
-catalog-wide metric.
-  \item The quantile format is a promising option for minimizing loss of
-information in \pz\ storage, competitive with samples for an estimator of the
-redshift distribution function.
+  \item Some general trends are shared between the datasets we used in our
+tests, but much of the qualitative and quantitative behavior is different.
+  The properties of the \pz\ catalog influence the optimal compression scheme.
+  \item The parametrization that best approximates individual \pz s may differ
+from the parametrization that optimizes a given science metric.
+  The science goals must motivate the metric that guides the choice of
+parametrization.
+  \item In our LSST-like examples with metrics motivated by gravitational
+lensing probes of cosmology, we confirm the expectation that regular binning
+and uniform sampling in the space of probability are more effective than
+regular binning in redshift.
+  This trend can only strengthen as the quantile and samples reconstruction
+schemes improve.
 \end{itemize}
-
-Given the constraint that LSST will be able to store only 200 floating point
-numbers to quantify the redshift of each galaxy and intends to include the
-results of several \pz\ codes, we can safely say that LSST can store the output
-of more than one \pz\ code without risk of significant loss of information.
-We do not advocate for a one-size-fits-all solution to the problem and
-emphasize that the optimal choice must depend on the requirements of the
-science metric(s) and characteristics of the underlying \pz\ catalog.
-We invite the community to contribute additional formats and metrics to the
-publicly available \qp\ Python package developed for this project. \qp\ is a
-tool that can be used to optimize the choice of stored parametrization of a
-catalog of \pz s based on the accuracy needs of the use cases of the catalog.
+To be clear, we do not advocate for a one-size-fits-all solution to the problem
+of compressing \pz\ catalogs and emphasize that any decision should account for
+the absolute, relative, and marginal behavior of the formats considered as a
+function of the number of stored parameters.
+
+For the case of LSST, though the histogram format has the strongest presence in
+the \pz\ literature, it exhibits a higher loss of information and higher moment
+percent errors for the reconstructed \pz s, except when a very large number of
+parameters is stored, so we do not recommend its use for LSST's \pz\ catalog.
+Given the constraint that LSST will be able to store only $\sim100$ numbers to
+describe the redshift of each galaxy and intends to include the output of
+several \pz\ codes, we can safely say that LSST can store the output of more
+than one \pz\ code without risk of significant loss of information.
+Had our results indicated a significant improvement in our metrics for a small
+increase in the number of stored parameters, we would have presented evidence
+in support of increasing that allocation to decision-makers within the
+collaboration.
+
+Furthermore, though we discussed the previous use of each format in science
+calculations, we do not endorse a preference for any format on the basis of
+existing infrastructure for its use.
+Rather, we anticipate great advances in the development of analysis techniques
+that best make use of the information in \pz s and encourage the community to
+then choose parametrizations that most effectively serve the needs of those
+practices.
+Future analyses may also consider options we did not, such as additional
+formats, new metrics, variable $N_{f}$ over the PDF ensemble, and improved
+samples and quantile reconstruction procedures.
+
+So that decisions of this kind can be optimized for all future surveys, the
+\qp\ Python package developed for this project is made public on GitHub as a
+tool for use by the broader community.
+We invite contributions of formats, metrics, and reconstruction schemes to the
+public GitHub repository.

 \subsection*{Appendix}
 \label{sec:kld}

-The Kullback-Leibler divergence quantifies the loss of information, measured in
-nats (base $e$ bits), due to using an approximation of a distribution.
-The most important feature of the KLD is its asymmetry; it is not a distance,
-like the root mean square error, that is the same from $P(z)$ to $P'(z)$ as it
-is from $P'(z)$ to $P(z)$ but a \textit{divergence} in the information lost
-when using $P'(z)$ to approximate $P(z)$.
-The KLD requires that both functions $P(z)$ and $P'(z)$ be probability
-distributions (always positive semidefinite and integrating to unity); this may
-need to be explicitly enforced for some approximation formats.
-The KLD is always positive, and a smaller value indicates better agreement
-between the approximation and the truth.
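As a quick numerical cross-check of the Gaussian examples worked through below, the following minimal numpy sketch (an illustration only, not the qp implementation) evaluates the KLD of a gridded approximation relative to a gridded original; both half-nat cases quoted in the text come out to approximately 0.5 nats.

    import numpy as np

    def kld_nats(z, p_true, p_approx, eps=1.e-12):
        # KLD in nats of an approximation relative to the original,
        # with both PDFs evaluated on the same grid
        p = np.clip(p_true, eps, None)
        q = np.clip(p_approx, eps, None)
        p, q = p / np.trapz(p, z), q / np.trapz(q, z)
        return np.trapz(p * np.log(p / q), z)

    def gaussian(z, mu, sigma):
        return np.exp(-0.5 * ((z - mu) / sigma) ** 2) / (sigma * np.sqrt(2. * np.pi))

    z = np.linspace(-20., 20., 20001)
    print(kld_nats(z, gaussian(z, 0., 1.), gaussian(z, 1., 1.)))                   # ~0.5 nats
    print(kld_nats(z, gaussian(z, 0., 1.), gaussian(z, 0., np.sqrt(2. * np.pi))))  # ~0.5 nats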
-
-To develop some intuition, consider the simple example of a Gaussian
-$P(z)=\mathcal{N}(\mu_{0}, \sigma_{0}^{2})$ being approximated by a Gaussian
-$P'(z)=\mathcal{N}(\mu, \sigma^{2})$, whose KLD is
+We develop some intuition for the Kullback-Leibler divergence by contrasting it
+with the familiar metric of the root-mean-square error (RMSE)
+\begin{align}
+  \label{eq:rmse}
+  \mathrm{RMSE} &= \sqrt{\int (P(z) - P'(z))^{2} dz}.
+\end{align}
+Consider the simple example of a Gaussian $P(z)=\mathcal{N}(\mu_{0},
+\sigma_{0}^{2})$ being approximated by a Gaussian $P'(z)=\mathcal{N}(\mu,
+\sigma^{2})$, whose KLD is
 \begin{align}
   \label{eq:gaussian}
-  KLD &= \frac{1}{2}\left(\log\left[\frac{\sigma^{2}}{\sigma_{0}^{2}}\right] +
+  \mathrm{KLD} &=
+\frac{1}{2}\left(\log\left[\frac{\sigma^{2}}{\sigma_{0}^{2}}\right] +
 \frac{\sigma_{0}^{2}}{\sigma^{2}} + \frac{(\mu-\mu_{0})^{2}}{\sigma^{2}} -
 1\right)
 \end{align}
-To get a sense of the units of information, we can calculate the KLD in some
-limiting cases.
-If $\sigma=\sigma_{0}$ but $\mu=\mu_{0}+1$, we obtain $KLD=\frac{1}{2}$ nat --
-if the mean of the approximation is wrong by an additive factor of $1\sigma$,
-half a nat of information is lost.
+To get a sense of the units of information, we can calculate the KLD and RMSE
+in some limiting cases.
+If $\sigma=\sigma_{0}$ but $\mu=\mu_{0}+\sigma$, we obtain
+$\mathrm{KLD}=\frac{1}{2}$ nat -- if the mean of the approximation is wrong by
+an additive factor of $\sigma$, half a nat of information is lost.
 If $\mu=\mu_{0}$ but $\sigma=\sqrt{2\pi}\sigma_{0}$, we find
-$KLD\approx\frac{1}{2}$ nat -- half a nat of information is also lost if the
-variance of the approximation is off by a multiplicative factor of $2\pi$.
+$\mathrm{KLD}\approx\frac{1}{2}$ nat -- half a nat of information is also lost
+if the variance of the approximation is off by a multiplicative factor of
+$2\pi$.
 We can use the KLD to identify notions of imprecision and inaccuracy.
 Intuitively, precision must be related to how close $\sigma$ is to $\sigma_{0}$
 and accuracy must be related to how close $\mu$ is to $\mu_{0}$.
-If $\mu\approx\mu_{0}$, we can say $KLD\sim\log[r] + \frac{1}{2}r^{-2} -
-\frac{1}{2}$ where $r^{-1}\equiv\frac{\sigma_{0}}{\sigma}$ is a measure of
-"precision," whose behavior is illustrated in Figure~\ref{fig:precision}.
 \begin{figure}
-  \includegraphics[width=0.9\columnwidth]{figures/kld_precision.png}
-  \caption{The KLD (solid black line) is proportional to the log of the inverse
-precision $r$ for $\sigma>\sigma_{0}$.
-  \label{fig:precision}}
+  \begin{center}
+  \includegraphics[width=\columnwidth]{figures/precision.pdf}
+  \caption{The KLD and RMSE as a function of the root variance ratio $r$ for
+a simple Gaussian example.
+  The KLD (solid line) rises sharply at $\sigma<\sigma_{0}$ and is
+proportional to the log of the inverse precision $r$ for $\sigma>\sigma_{0}$,
+behavior that is qualitatively similar to that of the RMSE (dotted line).
+  \label{fig:precision}}
+  \end{center}
 \end{figure}
-We observe that an overestimated variance increases the KLD as the log of the
-square root of the ratio of the estimated variance to the true variance.
-When $\sigma\approx\sigma_{0}$, $KLD\sim t^{2}$ in terms of the "tension"
-$t\equiv\frac{(\mu-\mu_{0})^{2}}{\sigma^{2}}$, whose behavior is illustrated in
-Figure~\ref{fig:tension}.
+If $\mu\approx\mu_{0}$, we can say $\mathrm{KLD}\sim\log[r] + \frac{1}{2}r^{-2}
+- \frac{1}{2}$ where $r^{-1}\equiv\frac{\sigma_{0}}{\sigma}$ is a measure of
+\textit{precision}, whose behavior is illustrated in
+Figure~\ref{fig:precision}, alongside that of the RMSE. We observe that an
+overestimated variance increases the KLD as the log of the square root of the
+ratio of the estimated variance to the true variance.
 \begin{figure}
-  \includegraphics[width=0.9\columnwidth]{figures/kld_tension.png}
-  \caption{The KLD (solid black line) is equal to the square of the tension
-$t$, with a small additive offset when $r\neq1$.
-  \label{fig:tension}}
+  \begin{center}
+  \includegraphics[width=\columnwidth]{figures/tension.pdf}
+  \caption{The KLD and RMSE as a function of the tension $t$ for a simple
+Gaussian example.
+  The KLD (solid lines) is equal to the square of the tension $t$, with a
+small offset when $r\neq1$, whereas the RMSE (dotted lines) is relatively
+insensitive to tension past a certain point but more sensitive to $r\neq1$.
+  \label{fig:tension}}
+  \end{center}
 \end{figure}
+When $\sigma\approx\sigma_{0}$, $\mathrm{KLD}\sim t^{2}$ in terms of the
+\textit{tension} $t$, where $t^{2}\equiv\frac{(\mu-\mu_{0})^{2}}{\sigma^{2}}$,
+whose behavior is illustrated in Figure~\ref{fig:tension}.
+There is some limiting tension $t_{\mathrm{lim}}\approx2$ below which the RMSE
+is more sensitive than the KLD and above which the KLD is more sensitive than
+the RMSE.
+This behavior hints at the KLD's reputation for sensitivity to the tails of a
+distribution.
+The notion of tension may be more important for cosmological applications of
+\pz s, indicating that the KLD may be a more appropriate metric for coarser
+approximations and the RMSE a more appropriate metric for less coarse
+approximations.
+
 \subsection*{Acknowledgments}
+This work was incubated at the 2016 LSST-DESC Hack Week hosted by Carnegie
+Mellon University.
+AIM is advised by David Hogg and was supported by NSF grant AST-1517237.
+The work of AIM was also supported by the U.S. Department of Energy, Office of
+Science, Office of Workforce Development for Teachers and Scientists, Office of
+Science Graduate Student Research (SCGSR) program, administered by the Oak
+Ridge Institute for Science and Education for the DOE under contract number
+DE-SC0014664.
 The work of PJM was supported by the U.S. Department of Energy under contract
 number DE-AC02-76SF00515.
 SJS was partially supported by the National Science Foundation under grant
 N56981CC.
-
 \input{acknowledgments}
+We would like to thank the LSST-DESC Publications Board and review committee
+for helpful feedback in the preparation of this paper.

 \input{contributions}
diff --git a/docs/desc-0000-qp-photo-z_approximation/research/analysis.html b/docs/desc-0000-qp-photo-z_approximation/research/analysis.html
index 0b53ca1f..db861c3a 100644
--- a/docs/desc-0000-qp-photo-z_approximation/research/analysis.html
+++ b/docs/desc-0000-qp-photo-z_approximation/research/analysis.html
@@ -11766,7 +11766,10 @@

The Analysis Pipeline

In [1]:
-
In [3]:
-
dataset_info = {}
-
- -
-
-
- -
-
-
-
-
-
-

There are two datasets available:

-
    -
  • $10^{5}$ LSST-like mock data provided by Sam Schmidt (UC Davis, LSST)
  • $10^{4}$ Euclid-like mock data provided by Melissa Graham (UW, LSST)
- -
-
-
-
-
-
In [4]:
-
-
-
# choose one of these:
-# dataset_key = 'Euclid'# Melissa Graham's data
-# dataset_key = 'LSST'# Sam Schmidt's data
-dataset_keys = ['Optical+IR', 'Optical']
-
-for dataset_key in dataset_keys:
-    dataset_info[dataset_key] = {}
-
- -
-
-
- -
-
-
-
-
-
-

Both datasets are fit with BPZ.

- -
-
-
-
-
-
In [5]:
-
-
-
for dataset_key in dataset_keys:
-    if dataset_key == 'Optical+IR':
-        datafilename = 'bpz_euclid_test_10_2.probs'
-    elif dataset_key == 'Optical':
-        datafilename = 'test_magscat_trainingfile_probs.out'
-    dataset_info[dataset_key]['filename'] = datafilename
-
- -
-
-
- -
-
-
-
-
-
-

The data files don't appear to come with information about the native format or metaparameters, but we are told they're evaluations on a regular grid of redshifts with given endpoints and number of parameters.

- -
-
-
-
-
-
In [6]:
-
-
-
delta = 0.01
-
-for dataset_key in dataset_keys:
-    
-    if dataset_key == 'Optical+IR':
-        z_low = 0.01
-        z_high = 3.51
-    elif dataset_key == 'Optical':
-        z_low = 0.005
-        z_high = 2.11
-    
-    dataset_info[dataset_key]['z_lim'] = (z_low, z_high)
-
-    z_grid = np.arange(z_low, z_high, delta, dtype='float')#np.arange(z_low, z_high + delta, delta, dtype='float')
-    z_range = z_high - z_low
-    delta_z = z_range / len(z_grid)
-
-    dataset_info[dataset_key]['z_grid'] = z_grid
-    dataset_info[dataset_key]['delta_z'] = delta_z
-
- -
-
-
- -
-
-
-
-
-
-

qp cannot currently convert gridded PDFs to histograms or quantiles - we need to make a GMM first, and use this to instantiate a qp.PDF object using a qp.composite object based on that GMM as qp.PDF.truth. The number of parameters necessary for a qualitatively good fit depends on the characteristics of the dataset.
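As a condensed sketch of that workflow (mirroring the setup_from_grid cell that appears later in this notebook), the gridded PDFs can be wrapped in an ensemble, sampled, fit with per-object GMMs, and re-wrapped with those fits as the truth; the default n_comps and high_res values below are illustrative placeholders rather than settings fixed by the notebook.

    import qp

    def gridded_to_gmm_ensemble(z_grid, pdfs, z_lim, n_comps=5, high_res=300):
        # wrap the gridded p(z) evaluations in an ensemble
        e_grid = qp.Ensemble(len(pdfs), gridded=(z_grid, pdfs), limits=z_lim, vb=False)
        # sampling first makes the GMM fit faster than fitting the gridded form
        samples = e_grid.sample(high_res, vb=False)
        e_samp = qp.Ensemble(len(pdfs), samples=samples, limits=z_lim, vb=False)
        # fit one GMM per PDF and use the fits as qp.PDF.truth
        gmms = e_samp.mix_mod_fit(comps=n_comps, vb=False)
        return qp.Ensemble(len(pdfs), truth=gmms, limits=z_lim, vb=False)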

- -
-
-
-
-
-
In [7]:
-
-
-
for dataset_key in dataset_keys:
-    
-    if dataset_key == 'Optical+IR':
-        nc_needed = 3
-    elif dataset_key == 'Optical':
-        nc_needed = 5
-    
-    dataset_info[dataset_key]['N_GMM'] = nc_needed
-
- -
-
-
- -
-
-
-
-
-
-

Let's define some useful quantities.

- -
-
-
-
-
-
In [8]:
-
-
-
#many_colors = ['red','green','blue','cyan','magenta','yellow']
-high_res = 300
-n_plot = 5
-n_moments_use = 4
-
-#make this a more clever structure, i.e. a dict
-formats = ['quantiles', 'histogram', 'samples']
-colors = {'quantiles': 'blueviolet', 'histogram': 'darkorange', 'samples': 'forestgreen'}
-styles = {'quantiles': '--', 'histogram': ':', 'samples': '-.'}
-stepstyles = {'quantiles': 'dashed', 'histogram': 'dotted', 'samples': 'dashdot'}
+
import matplotlib as mpl
+import matplotlib.pyplot as plt
+mpl.rcParams['text.usetex'] = True
+mpl.rcParams['mathtext.rm'] = 'serif'
+mpl.rcParams['font.family'] = 'serif'
+mpl.rcParams['font.serif'] = 'Times New Roman'
+mpl.rcParams['axes.titlesize'] = 16
+mpl.rcParams['axes.labelsize'] = 14
+mpl.rcParams['savefig.dpi'] = 250
+mpl.rcParams['savefig.format'] = 'pdf'
+mpl.rcParams['savefig.bbox'] = 'tight'
 
-pz_max = [1.]
-nz_max = [1.]
-hist_max = [1.]
-dist_min = [0.]
-dist_max = [0.]
-moment_max = [[]] * (n_moments_use - 1)
-mean_max = [[]] * (n_moments_use - 1)
-kld_min = [1.]
-kld_max = [1.]
+#comment out for NERSC
+%matplotlib inline
 
@@ -12021,37 +11847,17 @@

Analysis

-
In [9]:
+
In [4]:
-
def setup_dataset(dataset_key):#, n_gals_use):
-    
+
def setup_dataset(dataset_key, skip_rows, skip_cols):
+    start = timeit.default_timer()
     with open(dataset_info[dataset_key]['filename'], 'rb') as data_file:
         lines = (line.split(None) for line in data_file)
-        lines.next()
-        pdfs = np.array([[float(line[k]) for k in range(1,len(line))] for line in lines])
-    
-    # sys.getsizeof(pdfs)
-
-#     n_gals_tot = len(pdfs)
-#     full_gal_range = range(n_gals_tot)
-#     subset = np.random.choice(full_gal_range, n_gals_use, replace=False)#range(n_gals_use)
-#     pdfs_use = pdfs[subset]
-
-#     # using the same grid for output as the native format, but doesn't need to be so
-#     dataset_info[dataset_key]['in_z_grid'] = dataset_info[dataset_key]['z_grid']
-#     dataset_info[dataset_key]['metric_z_grid'] = dataset_info[dataset_key]['z_grid']
-    
-#     bonus = '_original'
-#     path = os.path.join(dataset_key, str(n_gals_use))
-#     loc = os.path.join(path, str(n_gals_use)+'from'+dataset_key+'_pzs'+bonus)
-#     with open(loc+'.hkl', 'w') as filename:
-#         info = {}
-#         info['z_grid'] = dataset_info[dataset_key]['in_z_grid']
-#         info['pdfs'] = pdfs_use
-#         hickle.dump(info, filename)
-    
-#     return(pdfs_use, bonus)
+        for r in range(skip_rows):
+            lines.next()
+        pdfs = np.array([[float(line[k]) for k in range(skip_cols, len(line))] for line in lines])
+    print('read in data file in '+str(timeit.default_timer()-start))
     return(pdfs)
 
@@ -12062,26 +11868,48 @@

Analysis

-
In [10]:
+
In [5]:
def make_instantiation(dataset_key, n_gals_use, pdfs, bonus=None):
     
+    start = timeit.default_timer()
+    
     n_gals_tot = len(pdfs)
     full_gal_range = range(n_gals_tot)
     subset = np.random.choice(full_gal_range, n_gals_use, replace=False)#range(n_gals_use)
+    print('randos for debugging: '+str(subset))
     pdfs_use = pdfs[subset]
-
+    
+    modality = []
+    dpdfs = pdfs_use[:,1:] - pdfs_use[:,:-1]
+    iqrs = []
+    for i in range(n_gals_use):
+        modality.append(len(np.where(np.diff(np.signbit(dpdfs[i])))[0]))
+        cdf = np.cumsum(qp.utils.normalize_integral((dataset_info[dataset_key]['z_grid'], pdfs_use[i]), vb=False)[1])
+        iqr_lo = dataset_info[dataset_key]['z_grid'][bisect.bisect_left(cdf, 0.25)]
+        iqr_hi = dataset_info[dataset_key]['z_grid'][bisect.bisect_left(cdf, 0.75)]
+        iqrs.append(iqr_hi - iqr_lo)
+    modality = np.array(modality)
+        
+    dataset_info[dataset_key]['N_GMM'] = int(np.median(modality))+1
+#     print('n_gmm for '+dataset_info[dataset_key]['name']+' = '+str(dataset_info[dataset_key]['N_GMM']))
+      
     # using the same grid for output as the native format, but doesn't need to be so
     dataset_info[dataset_key]['in_z_grid'] = dataset_info[dataset_key]['z_grid']
     dataset_info[dataset_key]['metric_z_grid'] = dataset_info[dataset_key]['z_grid']
     
+    print('preprocessed data in '+str(timeit.default_timer()-start))
+    
     path = os.path.join(dataset_key, str(n_gals_use))
-    loc = os.path.join(path, 'pzs'+bonus+str(n_gals_use)+dataset_key)
+    loc = os.path.join(path, 'pzs'+str(n_gals_use)+dataset_key+bonus)
     with open(loc+'.hkl', 'w') as filename:
         info = {}
+        info['randos'] = randos
         info['z_grid'] = dataset_info[dataset_key]['in_z_grid']
         info['pdfs'] = pdfs_use
+        info['modes'] = modality
+        info['iqrs'] = iqrs
         hickle.dump(info, filename)
     
     return(pdfs_use)
@@ -12094,49 +11922,56 @@ 

Analysis

-
In [11]:
+
In [6]:
def plot_examples(n_gals_use, dataset_key, bonus=None):
     
     path = os.path.join(dataset_key, str(n_gals_use))
-    loc = os.path.join(path, 'pzs'+bonus+str(n_gals_use)+dataset_key)
+    loc = os.path.join(path, 'pzs'+str(n_gals_use)+dataset_key+bonus)
     with open(loc+'.hkl', 'r') as filename:
         info = hickle.load(filename)
+        randos = info['randos']
         z_grid = info['z_grid']
         pdfs = info['pdfs']
     
-    plt.figure(1)
+    plt.figure()
     for i in range(n_plot):
         data = (z_grid, pdfs[randos[i]])
         data = qp.utils.normalize_integral(qp.utils.normalize_gridded(data))
         pz_max.append(np.max(data))
-        plt.plot(data[0], data[1], label=dataset_key+'#'+str(randos[i]))
+        plt.plot(data[0], data[1], label=dataset_info[dataset_key]['name']+' \#'+str(randos[i]), color=color_cycle[i])
     plt.xlabel(r'$z$', fontsize=14)
     plt.ylabel(r'$p(z)$', fontsize=14)
     plt.xlim(min(z_grid), max(z_grid))
     plt.ylim(0., max(pz_max))
-    plt.title(bonus[1:]+' '+dataset_key+' mock catalog of '+str(n_gals_use), fontsize=16)
-    plt.legend()
-    
-    plt.savefig(loc+'.png', dpi=250)
+    plt.title(dataset_info[dataset_key]['name']+' data examples', fontsize=16)
+    plt.savefig(loc+'.pdf', dpi=250)
     plt.close()
     
-    plt.figure(2)
-    for i in range(n_plot):
-        data = (z_grid, pdfs[randos[i]])
-        data = qp.utils.normalize_integral(qp.utils.normalize_gridded(data))
-        plt.plot(data[0], data[1], label=dataset_key+'#'+str(randos[i]))
-    plt.xlabel(r'$z$', fontsize=14)
-    plt.ylabel(r'$\log[p(z)]$', fontsize=14)
-    plt.semilogy()
-    plt.xlim(min(z_grid), max(z_grid))
-    plt.ylim(qp.utils.epsilon, max(pz_max))
-    plt.title(bonus[1:]+' '+dataset_key+' mock catalog of '+str(n_gals_use), fontsize=16)
-    plt.legend()
-    
-    plt.savefig(loc+'_log.png', dpi=250)
-    plt.close()
+    if 'modes' in info.keys():
+        modes = info['modes']
+        modes_max.append(np.max(modes))
+        plt.figure()
+        ax = plt.hist(modes, color='k', alpha=1./n_plot, histtype='stepfilled', bins=range(max(modes_max)+1))
+        plt.xlabel('modes')
+        plt.ylabel('frequency')
+        plt.title(dataset_info[dataset_key]['name']+' data modality distribution (median='+str(dataset_info[dataset_key]['N_GMM'])+')', fontsize=16)
+        plt.savefig(loc+'modality.pdf', dpi=250)
+        plt.close()
+        
+    if 'iqrs' in info.keys():
+        iqrs = info['iqrs']
+        iqr_min.append(min(iqrs))
+        iqr_max.append(max(iqrs))
+        plot_bins = np.linspace(min(iqr_min), max(iqr_max), 20)
+        plt.figure()
+        ax = plt.hist(iqrs, bins=plot_bins, color='k', alpha=1./n_plot, histtype='stepfilled')
+        plt.xlabel('IQR')
+        plt.ylabel('frequency')
+        plt.title(dataset_info[dataset_key]['name']+' data IQR distribution', fontsize=16)
+        plt.savefig(loc+'iqrs.pdf', dpi=250)
+        plt.close()
 
@@ -12156,7 +11991,7 @@

Analysis

-
In [12]:
+
In [7]:
def setup_from_grid(dataset_key, in_pdfs, z_grid, N_comps, high_res=1000, bonus=None):
@@ -12165,37 +12000,59 @@ 

Analysis

zlim = (min(z_grid), max(z_grid)) N_pdfs = len(in_pdfs) -# plot_examples(N_pdfs, z_grid, pdfs) - - print('making the initial ensemble of '+str(N_pdfs)+' PDFs') - E0 = qp.Ensemble(N_pdfs, gridded=(z_grid, in_pdfs), limits=dataset_info[dataset_key]['z_lim'], vb=True) - print('made the initial ensemble of '+str(N_pdfs)+' PDFs') + start = timeit.default_timer() +# print('making the initial ensemble of '+str(N_pdfs)+' PDFs') + E0 = qp.Ensemble(N_pdfs, gridded=(z_grid, in_pdfs), limits=dataset_info[dataset_key]['z_lim'], vb=False) + print('made the initial ensemble of '+str(N_pdfs)+' PDFs in '+str(timeit.default_timer() - start)) #fit GMMs to gridded pdfs based on samples (faster than fitting to gridded) - print('sampling for the GMM fit') + start = timeit.default_timer() +# print('sampling for the GMM fit') samparr = E0.sample(high_res, vb=False) - print('took '+str(high_res)+' samples') + print('took '+str(high_res)+' samples in '+str(timeit.default_timer() - start)) - print('making a new ensemble from samples') + start = timeit.default_timer() +# print('making a new ensemble from samples') Ei = qp.Ensemble(N_pdfs, samples=samparr, limits=dataset_info[dataset_key]['z_lim'], vb=False) - print('made a new ensemble from samples') + print('made a new ensemble from samples in '+str(timeit.default_timer() - start)) - print('fitting the GMM to samples') + start = timeit.default_timer() +# print('fitting the GMM to samples') GMMs = Ei.mix_mod_fit(comps=N_comps, vb=False) - print('fit the GMM to samples') + print('fit the GMM to samples in '+str(timeit.default_timer() - start)) #set the GMMS as the truth - print('making the final ensemble') + start = timeit.default_timer() +# print('making the final ensemble') Ef = qp.Ensemble(N_pdfs, truth=GMMs, limits=dataset_info[dataset_key]['z_lim'], vb=False) - print('made the final ensemble') + print('made the final ensemble in '+str(timeit.default_timer() - start)) path = os.path.join(dataset_key, str(N_pdfs)) - loc = os.path.join(path, 'pzs'+bonus+str(N_pdfs)+dataset_key) + loc = os.path.join(path, 'pzs'+str(n_gals_use)+dataset_key+bonus) with open(loc+'.hkl', 'w') as filename: info = {} + info['randos'] = randos info['z_grid'] = z_grid info['pdfs'] = Ef.evaluate(z_grid, using='truth', norm=True, vb=False)[1] hickle.dump(info, filename) + + start = timeit.default_timer() +# print('calculating '+str(n_moments_use)+' moments of original PDFs') + in_moments, vals = [], [] + for n in range(n_moments_use): + in_moments.append(Ef.moment(n, using='truth', limits=zlim, + dx=delta_z, vb=False)) + vals.append(n) + moments = np.array(in_moments) + print('calculated '+str(n_moments_use)+' moments of original PDFs in '+str(timeit.default_timer() - start)) + + path = os.path.join(dataset_key, str(N_pdfs)) + loc = os.path.join(path, 'pz_moments'+str(n_gals_use)+dataset_key+bonus) + with open(loc+'.hkl', 'w') as filename: + info = {} + info['truth'] = moments + info['orders'] = vals + hickle.dump(info, filename) return(Ef)
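For orientation, a hypothetical end-to-end call sequence for the helper cells above would chain the reading, subsampling, and ensemble-building steps; the dataset key, skip_rows/skip_cols values, and bonus label below are illustrative assumptions rather than values fixed by the notebook.

    dataset_key = 'Optical'                      # placeholder catalog key
    pdfs = setup_dataset(dataset_key, skip_rows=1, skip_cols=1)
    n_gals_use = 100
    pdfs_use = make_instantiation(dataset_key, n_gals_use, pdfs, bonus='_original')
    E_true = setup_from_grid(dataset_key, pdfs_use,
                             dataset_info[dataset_key]['in_z_grid'],
                             dataset_info[dataset_key]['N_GMM'],
                             high_res=300, bonus='_original')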
@@ -12217,13 +12074,14 @@

Analysis

-
In [13]:
+
In [8]:
-
def analyze_individual(E, z_grid, N_floats, dataset_key, N_moments=4, i=None):
+
def analyze_individual(E, z_grid, N_floats, dataset_key, N_moments=4, i=None, bonus=None):
     zlim = (min(z_grid), max(z_grid))
     z_range = zlim[-1] - zlim[0]
     delta_z = z_range / len(z_grid)
+    path = os.path.join(dataset_key, str(n_gals_use))
     
     Eq, Eh, Es = E, E, E
     inits = {}
@@ -12233,67 +12091,78 @@ 

Analysis

inits[f][ff] = None qstart = timeit.default_timer() - print('performing quantization') inits['quantiles']['quantiles'] = Eq.quantize(N=N_floats, vb=False) - print('finished quantization at '+str(timeit.default_timer() - qstart)) + print('finished making in '+str(timeit.default_timer() - qstart)) hstart = timeit.default_timer() - print('performing histogramization') inits['histogram']['histogram'] = Eh.histogramize(N=N_floats, binrange=zlim, vb=False) - print('finished histogramization at '+str(timeit.default_timer() - hstart)) + print('finished histogramization in '+str(timeit.default_timer() - hstart)) sstart = timeit.default_timer() - print('performing sampling') inits['samples']['samples'] = Es.sample(samps=N_floats, vb=False) - print('finished sampling at '+str(timeit.default_timer() - sstart)) + print('finished sampling in '+str(timeit.default_timer() - sstart)) - print('making the approximate ensembles') Eo = {} for f in formats: + fstart = timeit.default_timer() Eo[f] = qp.Ensemble(E.n_pdfs, truth=E.truth, quantiles=inits[f]['quantiles'], histogram=inits[f]['histogram'], samples=inits[f]['samples'], limits=dataset_info[dataset_key]['z_lim']) - bonus = '_'+str(n_floats_use)+f+'_('+str(i)+')' - path = os.path.join(dataset_key, str(n_gals_use)) - loc = os.path.join(path, 'pzs'+bonus+str(n_gals_use)+dataset_key) + fbonus = str(N_floats)+f+str(i) + loc = os.path.join(path, 'pzs'+str(n_gals_use)+dataset_key+fbonus) with open(loc+'.hkl', 'w') as filename: info = {} + info['randos'] = randos info['z_grid'] = z_grid info['pdfs'] = Eo[f].evaluate(z_grid, using=f, norm=True, vb=False)[1] hickle.dump(info, filename) - print('made the approximate ensembles') + print('made '+f+' ensemble in '+str(timeit.default_timer()-fstart)) - print('calculating the individual metrics') metric_start = timeit.default_timer() - klds, metrics, moments = {}, {}, {} + inloc = os.path.join(path, 'pz_moments'+str(n_gals_use)+dataset_key+bonus) + with open(inloc+'.hkl', 'r') as infilename: + pz_moments = hickle.load(infilename) + pz_moment_deltas, klds, metrics, kld_moments = {}, {}, {}, {} for key in Eo.keys(): - print('starting '+key) + key_start = timeit.default_timer() klds[key] = Eo[key].kld(using=key, limits=zlim, dx=delta_z) samp_metric = qp.PDF(samples=klds[key]) gmm_metric = samp_metric.mix_mod_fit(n_components=dataset_info[dataset_key]['N_GMM'], using='samples', vb=False) metrics[key] = qp.PDF(truth=gmm_metric) - moments[key] = [] - for n in range(N_moments+1): - moments[key].append([qp.utils.calculate_moment(metrics[key], n, - using=key, + + pz_moment_deltas[key], pz_moments[key], kld_moments[key] = [], [], [] + for n in range(N_moments): + kld_moments[key].append(qp.utils.calculate_moment(metrics[key], n, + using='truth', limits=zlim, dx=delta_z, - vb=False)]) - print('finished with '+key) - print('calculated the individual metrics in '+str(timeit.default_timer() - metric_start)) + vb=False)) + new_moment = Eo[key].moment(n, using=key, limits=zlim, + dx=delta_z, vb=False) + pz_moments[key].append(new_moment) + delta_moment = (new_moment - pz_moments['truth'][n]) / pz_moments['truth'][n] + pz_moment_deltas[key].append(delta_moment) + print('calculated the '+key+' individual moments, kld moments in '+str(timeit.default_timer() - key_start)) - path = os.path.join(dataset_key, str(E.n_pdfs)) - loc = os.path.join(path, str(N_floats)+'kld_hist'+str(n_gals_use)+dataset_key+str(i)) + loc = os.path.join(path, 'kld_hist'+str(n_gals_use)+dataset_key+str(N_floats)+'_'+str(i)) with open(loc+'.hkl', 'w') as filename: 
info = {} info['z_grid'] = z_grid info['N_floats'] = N_floats info['pz_klds'] = klds hickle.dump(info, filename) + + outloc = os.path.join(path, 'pz_moments'+str(n_gals_use)+dataset_key+str(N_floats)+'_'+str(i)) + with open(outloc+'.hkl', 'w') as outfilename: + hickle.dump(pz_moments, outfilename) - return(Eo, klds, moments) + save_moments(name, size, n_floats_use, kld_moments, 'pz_kld_moments') + save_moments(name, size, n_floats_use, pz_moments, 'pz_moments') + save_moments(name, size, n_floats_use, pz_moment_deltas, 'pz_moment_deltas') + + return(Eo)#, klds, kld_moments, pz_moments, pz_moment_deltas)
@@ -12303,13 +12172,56 @@

Analysis

-
In [14]:
+
In [9]:
+
+
+
def plot_all_examples(name, size, N_floats, init, bonus={}):
+    
+    fig, ax = plt.subplots()
+    lines = []
+    for bonus_key in bonus.keys():
+        path = os.path.join(name, str(size))
+        loc = os.path.join(path, 'pzs'+str(size)+name+bonus_key)
+        with open(loc+'.hkl', 'r') as filename:
+            info = hickle.load(filename)
+            randos = info['randos']
+            z_grid = info['z_grid']
+            pdfs = info['pdfs']
+        ls = bonus[bonus_key][0]
+        a = bonus[bonus_key][1]
+        lab = re.sub(r'[\_]', '', bonus_key)
+        line, = ax.plot([-1., 0.], [0., 0.], linestyle=ls, alpha=a, color='k', label=lab)
+        lines.append(line)
+        leg = ax.legend(loc='upper right', handles=lines)
+        for i in range(n_plot):
+            data = (z_grid, pdfs[randos[i]])
+            data = qp.utils.normalize_integral(qp.utils.normalize_gridded(data))
+            ax.plot(data[0], data[1], linestyle=ls, alpha=a, color=color_cycle[i])
+#     ax.legend(loc='upper right')
+    ax.set_xlabel(r'$z$', fontsize=14)
+    ax.set_ylabel(r'$p(z)$', fontsize=14)
+    ax.set_xlim(min(z_grid), max(z_grid))
+    ax.set_title(dataset_info[name]['name']+r' examples with $N_{f}=$'+str(N_floats), fontsize=16)
+    saveloc = os.path.join(path, 'pzs'+str(size)+name+str(N_floats)+'_'+str(init))
+    fig.savefig(saveloc+'.pdf', dpi=250)
+    plt.close()
+
+ +
+
+
+ +
+
+
+
In [10]:
-
def plot_individual(n_gals_use, dataset_key, N_floats, i):
+
def plot_individual_kld(n_gals_use, dataset_key, N_floats, i):
     
     path = os.path.join(dataset_key, str(n_gals_use))
-    loc = os.path.join(path, str(N_floats)+'kld_hist'+str(n_gals_use)+dataset_key+str(i))
+    a = 1./len(formats)
+    loc = os.path.join(path, 'kld_hist'+str(n_gals_use)+dataset_key+str(N_floats)+'_'+str(i))
     with open(loc+'.hkl', 'r') as filename:
         info = hickle.load(filename)
         z_grid = info['z_grid']
@@ -12318,21 +12230,20 @@ 

Analysis

plt.figure() plot_bins = np.linspace(-3., 3., 20) - a = 1./len(formats) for key in pz_klds.keys(): logdata = qp.utils.safelog(pz_klds[key]) kld_hist = plt.hist(logdata, color=colors[key], alpha=a, histtype='stepfilled', edgecolor='k', - label=key, normed=True, bins=plot_bins, linestyle=stepstyles[key], ls=stepstyles[key], lw=3) + label=key, normed=True, bins=plot_bins, linestyle=stepstyles[key], ls=stepstyles[key], lw=2) hist_max.append(max(kld_hist[0])) dist_min.append(min(logdata)) dist_max.append(max(logdata)) plt.legend() plt.ylabel('frequency', fontsize=14) plt.xlabel(r'$\log[KLD]$', fontsize=14) - plt.xlim(min(dist_min), max(dist_max)) - plt.ylim(0., max(hist_max)) - plt.title('KLD distribution of '+str(n_gals_use)+' from '+dataset_key+r' with $N_{f}='+str(N_floats)+r'$', fontsize=16) - plt.savefig(loc+'.png', dpi=250) +# plt.xlim(min(dist_min), max(dist_max)) +# plt.ylim(0., max(hist_max)) + plt.title(dataset_info[dataset_key]['name']+r' data $p(KLD)$ with $N_{f}='+str(N_floats)+r'$', fontsize=16) + plt.savefig(loc+'.pdf', dpi=250) plt.close()
@@ -12340,6 +12251,83 @@

Analysis

+
+
+
+
In [11]:
+
+
+
# def plot_individual_moment(n_gals_use, dataset_key, N_floats, i):
+    
+#     path = os.path.join(dataset_key, str(n_gals_use))
+#     a = 1./len(formats)    
+#     loc = os.path.join(path, 'pz_moments'+str(n_gals_use)+dataset_key+str(N_floats)+'_'+str(i))
+#     with open(loc+'.hkl', 'r') as filename:
+#         moments = hickle.load(filename)
+#     delta_moments = {}
+        
+#     plt.figure(figsize=(5, 5 * (n_moments_use-1)))
+#     for n in range(1, n_moments_use):
+#         ax = plt.subplot(n_moments_use, 1, n)
+#         ends = (min(moments['truth'][n]), max(moments['truth'][n]))
+#         for key in formats:
+#             ends = (min(ends[0], min(moments[key][n])), max(ends[-1], max(moments[key][n])))
+#         plot_bins = np.linspace(ends[0], ends[-1], 20)
+#         ax.hist([-100], color='k', alpha=a, histtype='stepfilled', edgecolor='k', label='truth', 
+#                     linestyle='-', ls='-')
+#         ax.hist(moments['truth'][n], bins=plot_bins, color='k', alpha=a, histtype='stepfilled', normed=True)
+#         ax.hist(moments['truth'][n], bins=plot_bins, color='k', histtype='step', normed=True, linestyle='-', alpha=a)
+#         for key in formats:
+#             ax.hist([-100], color=colors[key], alpha=a, histtype='stepfilled', edgecolor='k', label=key, 
+#                     linestyle=stepstyles[key], ls=stepstyles[key], lw=2)
+#             ax.hist(moments[key][n], bins=plot_bins, color=colors[key], alpha=a, histtype='stepfilled', normed=True)
+#             ax.hist(moments[key][n], bins=plot_bins, color='k', histtype='step', normed=True, linestyle=stepstyles[key], alpha=a, lw=2)
+#         ax.legend()
+#         ax.set_ylabel('frequency', fontsize=14)
+#         ax.set_xlabel(moment_names[n], fontsize=14)
+#         ax.set_xlim(min(plot_bins), max(plot_bins))
+#     plt.suptitle(dataset_info[dataset_key]['name']+r' data moments with $N_{f}='+str(N_floats)+r'$', fontsize=16)
+#     plt.tight_layout()
+#     plt.subplots_adjust(top=0.95)
+#     plt.savefig(loc+'.pdf', dpi=250)
+#     plt.close()
+        
+#     ngood = {}
+#     normarr = np.ones(n_gals_use)
+#     for key in formats:
+#         ngood[key] = np.zeros(n_moments_use)
+#     plt.figure(figsize=(5, 5 * (n_moments_use-1)))
+#     for n in range(1, n_moments_use):
+#         ax = plt.subplot(n_moments_use, 1, n)
+#         ends = (100., -100.)
+#         for key in formats:
+#             delta_moments[key] = (moments[key] - moments['truth']) / moments['truth']
+#             ngood[key][n] = np.sum(normarr[np.abs(delta_moments[key][n]) < 0.01]) / float(n_gals_use)
+#             ends = (min(ends[0], min(delta_moments[key][n])), max(ends[-1], max(delta_moments[key][n])))
+#         plot_bins = np.linspace(ends[0], ends[-1], 20)
+#         for key in formats:
+#             ax.hist([-100], color=colors[key], alpha=a, histtype='stepfilled', edgecolor='k', label=key, 
+#                     linestyle=stepstyles[key], ls=stepstyles[key], lw=2)
+#             ax.hist(delta_moments[key][n], bins=plot_bins, color=colors[key], alpha=a, histtype='stepfilled', normed=True)
+#             ax.hist(delta_moments[key][n], bins=plot_bins, color='k', histtype='step', normed=True, linestyle=stepstyles[key], alpha=a, lw=2)
+#         ax.legend()
+#         ax.set_ylabel('frequency', fontsize=14)
+#         ax.set_xlabel(r'fractional error on '+moment_names[n], fontsize=14)
+#         ax.set_xlim(min(plot_bins), max(plot_bins))
+#     plt.tight_layout()
+#     plt.subplots_adjust(top=0.95)
+#     plt.suptitle(dataset_info[dataset_key]['name']+r' data moment fractional errors with $N_{f}='+str(N_floats)+r'$', fontsize=16)
+#     plt.savefig(loc+'_delta.pdf', dpi=250)
+#     plt.close()
+    
+#     #TO DO: move this calculation and saving out of this plot, then eliminate the plot!
+#     save_moments(dataset_key, n_gals_use, N_floats, ngood, 'pz_moment_deltas')
+
+ +
+
+
+
@@ -12353,7 +12341,7 @@

Analysis

-
In [15]:
+
In [12]:
def analyze_stacked(E0, E, z_grid, n_floats_use, dataset_key, i=None):
@@ -12362,39 +12350,64 @@ 

Analysis

     z_range = zlim[-1] - zlim[0]
     delta_z = z_range / len(z_grid)
-    print('stacking the ensembles')
-    stack_start = timeit.default_timer()
+#     print('stacking the ensembles')
+#     stack_start = timeit.default_timer()
     stacked_pdfs, stacks = {}, {}
     for key in formats:
+        start = timeit.default_timer()
         stacked_pdfs[key] = qp.PDF(gridded=E[key].stack(z_grid, using=key, vb=False)[key])
         stacks[key] = stacked_pdfs[key].evaluate(z_grid, using='gridded', norm=True, vb=False)[1]
+        print('stacked '+key+ ' in '+str(timeit.default_timer()-start))
+    stack_start = timeit.default_timer()
     stacked_pdfs['truth'] = qp.PDF(gridded=E0.stack(z_grid, using='truth', vb=False)['truth'])
     stacks['truth'] = stacked_pdfs['truth'].evaluate(z_grid, using='gridded', norm=True, vb=False)[1]
-    print('stacked the ensembles in '+str(timeit.default_timer() - stack_start))
+    print('stacked truth in '+str(timeit.default_timer() - stack_start))
+
+#     print('calculating the metrics')
+#     metric_start = timeit.default_timer()
+#     for n in range(n_moments_use):
+#         moments['truth'].append(qp.utils.calculate_moment(stacked_pdfs['truth'], n,
+#                                                           limits=zlim,
+#                                                           dx=delta_z,
+#                                                           vb=False))
+#     print('calculated the true moments in '+str(timeit.default_timer() - metric_start))
 
-    print('calculating the metrics')
-    metric_start = timeit.default_timer()
     klds = {}
     for key in formats:
+        kld_start = timeit.default_timer()
         klds[key] = qp.utils.calculate_kl_divergence(stacked_pdfs['truth'], stacked_pdfs[key],
                                                      limits=zlim, dx=delta_z)
-    print('calculated the metrics in '+str(timeit.default_timer() - metric_start))
+        print('calculated the '+key+' stacked kld in '+str(timeit.default_timer() - kld_start))
+    save_nz_metrics(name, size, n_floats_use, klds, 'nz_klds')
+
+    moments = {}
+    for key in formats_plus:
+        moment_start = timeit.default_timer()
+        moments[key] = []
+        for n in range(n_moments_use):
+            moments[key].append(qp.utils.calculate_moment(stacked_pdfs[key], n,
+                                                          limits=zlim,
+                                                          dx=delta_z,
+                                                          vb=False))
+        print('calculated the '+key+' stacked moments in '+str(timeit.default_timer() - moment_start))
+    save_moments(name, size, n_floats_use, moments, 'nz_moments')
 
     path = os.path.join(dataset_key, str(E0.n_pdfs))
-    loc = os.path.join(path, str(n_floats_use)+'nz_comp'+str(n_gals_use)+dataset_key+str(i))
+    loc = os.path.join(path, 'nz_comp'+str(n_gals_use)+dataset_key+str(n_floats_use)+'_'+str(i))
     with open(loc+'.hkl', 'w') as filename:
         info = {}
         info['z_grid'] = z_grid
         info['stacks'] = stacks
         info['klds'] = klds
+        info['moments'] = moments
         hickle.dump(info, filename)
 
-    return(stacked_pdfs, klds)
+    return(stacked_pdfs)
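Conceptually, the stacking step above averages the per-object gridded PDFs into an n(z) estimate and then compares the stack of each lossy format against the stack of the originals. A minimal numpy-only sketch follows, assuming each format's PDFs are already evaluated on a shared z grid; stack_pdfs and the toy Gaussian ensembles are illustrative stand-ins, whereas the notebook relies on qp's Ensemble.stack and qp.utils.calculate_kl_divergence for the real calculation.

import numpy as np

def stack_pdfs(pdf_grid, dz):
    """Average per-object PDFs (shape N_gals x N_z) into an n(z) estimate, normalized on the grid."""
    nz = np.mean(pdf_grid, axis=0)
    return nz / (np.sum(nz) * dz)

# toy ensemble: 100 narrow Gaussians with scattered means, plus a broadened stand-in for a lossy format
rng = np.random.default_rng(1)
z = np.arange(0.01, 3.51, 0.01)
dz = 0.01
means = rng.uniform(0.5, 2.5, size=100)
truth_pdfs = np.exp(-0.5 * ((z[None, :] - means[:, None]) / 0.05) ** 2)
approx_pdfs = np.exp(-0.5 * ((z[None, :] - means[:, None]) / 0.08) ** 2)

nz_true = stack_pdfs(truth_pdfs, dz)
nz_approx = stack_pdfs(approx_pdfs, dz)
# stacked-estimator KLD, analogous to the comparison made in analyze_stacked above
print(np.sum(nz_true * np.log(nz_true / nz_approx)) * dz)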
@@ -12404,13 +12417,13 @@

Analysis

-
In [16]:
+
In [13]:
def plot_estimators(n_gals_use, dataset_key, n_floats_use, i=None):
     
     path = os.path.join(dataset_key, str(n_gals_use))
-    loc = os.path.join(path, str(n_floats_use)+'nz_comp'+str(n_gals_use)+dataset_key+str(i))
+    loc = os.path.join(path, 'nz_comp'+str(n_gals_use)+dataset_key+str(n_floats_use)+'_'+str(i))
     with open(loc+'.hkl', 'r') as filename:
         info = hickle.load(filename)
         z_grid = info['z_grid']
@@ -12418,18 +12431,18 @@ 

Analysis

         klds = info['klds']
     plt.figure()
-    plt.plot(z_grid, stacks['truth'], color='black', lw=4, alpha=0.3, label='truth')
+    plt.plot(z_grid, stacks['truth'], color='black', lw=3, alpha=0.3, label='original')
     nz_max.append(max(stacks['truth']))
     for key in formats:
         nz_max.append(max(stacks[key]))
-        plt.plot(z_grid, stacks[key], label=key+r' KLD='+str(klds[key]), color=colors[key], linestyle=styles[key])
+        plt.plot(z_grid, stacks[key], label=key+r' KLD='+str(klds[key])[:8], color=colors[key], linestyle=styles[key])
     plt.xlabel(r'$z$', fontsize=14)
     plt.ylabel(r'$\hat{n}(z)$', fontsize=14)
     plt.xlim(min(z_grid), max(z_grid))
-    plt.ylim(0., max(nz_max))
+#     plt.ylim(0., max(nz_max))
     plt.legend()
-    plt.title(r'$\hat{n}(z)$ for '+str(n_gals_use)+r' from '+dataset_key+r' with $N_{f}='+str(n_floats_use)+r'$', fontsize=16)
-    plt.savefig(loc+'.png', dpi=250)
+    plt.title(dataset_info[dataset_key]['name']+r' data $\hat{n}(z)$ with $N_{f}='+str(n_floats_use)+r'$', fontsize=16)
+    plt.savefig(loc+'.pdf', dpi=250)
     plt.close()
@@ -12475,40 +12488,40 @@

Scaling

-
In [17]:
+
In [14]:
-
def save_pz_metrics(dataset_key, n_gals_use, N_f, metric_moments):
+
def save_moments(dataset_key, n_gals_use, N_f, stat, stat_name):
 
     path = os.path.join(dataset_key, str(n_gals_use))
-    loc = os.path.join(path, 'pz_klds'+str(n_gals_use)+dataset_key)
+    loc = os.path.join(path, stat_name+str(n_gals_use)+dataset_key)
     
     if os.path.exists(loc+'.hkl'):
-        with open(loc+'.hkl', 'r') as pz_file:
+        with open(loc+'.hkl', 'r') as stat_file:
         #read in content of list/dict
-            pz_stats = hickle.load(pz_file)
+            stats = hickle.load(stat_file)
     else:
-        pz_stats = {}
-        pz_stats['N_f'] = []
-        for f in formats:#change this name to formats
-            pz_stats[f] = []
-            for m in range(n_moments_use + 1):
-                pz_stats[f].append([])
+        stats = {}
+        stats['N_f'] = []
+        for f in stat.keys():
+            stats[f] = []
+            for m in range(n_moments_use):
+                stats[f].append([])
 
-    if N_f not in pz_stats['N_f']:
-        pz_stats['N_f'].append(N_f)
-        for f in formats:
-            for m in range(n_moments_use + 1):
-                pz_stats[f][m].append([])
+    if N_f not in stats['N_f']:
+        stats['N_f'].append(N_f)
+        for f in stat.keys():
+            for m in range(n_moments_use):
+                stats[f][m].append([])
         
-    where_N_f = pz_stats['N_f'].index(N_f)
+    where_N_f = stats['N_f'].index(N_f)
         
-    for f in formats:
-        for m in range(n_moments_use + 1):
-            pz_stats[f][m][where_N_f].append(metric_moments[f][m])
+    for f in stat.keys():
+        for m in range(n_moments_use):
+            stats[f][m][where_N_f].append(stat[f][m])
 
-    with open(loc+'.hkl', 'w') as pz_file:
-        hickle.dump(pz_stats, pz_file)
+    with open(loc+'.hkl', 'w') as stat_file:
+        hickle.dump(stats, stat_file)
 
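save_moments above implements an accumulate-and-persist pattern: a dictionary keyed by format, holding one list per moment, indexed by the position of N_f in stats['N_f'], so repeated instantiations append into the same slot. A self-contained sketch of that bookkeeping is below, with stdlib pickle standing in for hickle so it runs anywhere; append_stat and demo_moments.pkl are hypothetical names, not identifiers from the notebook.

import os
import pickle
import numpy as np

def append_stat(loc, stat, N_f, n_moments):
    """Accumulate per-run moments into stats[fmt][moment][N_f index] (pickle stands in for hickle)."""
    if os.path.exists(loc):
        with open(loc, 'rb') as f_in:
            stats = pickle.load(f_in)
    else:
        stats = {'N_f': []}
        for fmt in stat:
            stats[fmt] = [[] for _ in range(n_moments)]
    if N_f not in stats['N_f']:
        stats['N_f'].append(N_f)
        for fmt in stat:
            for m in range(n_moments):
                stats[fmt][m].append([])
    where = stats['N_f'].index(N_f)
    for fmt in stat:
        for m in range(n_moments):
            stats[fmt][m][where].append(stat[fmt][m])
    with open(loc, 'wb') as f_out:
        pickle.dump(stats, f_out)

# hypothetical usage: two instantiations at N_f=10 for one format land in the same slot
append_stat('demo_moments.pkl', {'quantiles': list(np.arange(4.))}, N_f=10, n_moments=4)
append_stat('demo_moments.pkl', {'quantiles': list(np.arange(4.) + 0.1)}, N_f=10, n_moments=4)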
@@ -12518,18 +12531,24 @@

Scaling

-
In [18]:
+
In [15]:
def plot_pz_metrics(dataset_key, n_gals_use):
-# trying really hard to make this colorblind-readable but still failing
 
     path = os.path.join(dataset_key, str(n_gals_use))
-    loc = os.path.join(path, 'pz_klds'+str(n_gals_use)+dataset_key)
+    loc = os.path.join(path, 'pz_kld_moments'+str(n_gals_use)+dataset_key)
     with open(loc+'.hkl', 'r') as pz_file:
         pz_stats = hickle.load(pz_file)
+#     if len(instantiations) == 10:
+#         for f in formats:
+#             for n in range(n_moments_use):
+#                 if not np.shape(pz_stats[f][n]) == (4, 10):
+#                     for s in range(len(pz_stats[f][n])):
+#                         pz_stats[f][n][s] = np.array(np.array(pz_stats[f][n][s])[:10]).flatten()
         
     flat_floats = np.array(pz_stats['N_f']).flatten()
+    in_x = np.log(flat_floats)
 
     def make_patch_spines_invisible(ax):
         ax.set_frame_on(True)
@@ -12537,80 +12556,212 @@ 

Scaling

         for sp in ax.spines.values():
             sp.set_visible(False)
 
-    shapes = ['*','+','x']#,'v','^','<','>']
-    marksize = 50
+    shapes = moment_shapes
+    marksize = 10
     a = 1./len(formats)
 
+    fig, ax = plt.subplots()
+    fig.subplots_adjust(right=1.)
+    ax_n = ax
+    for key in formats:
+        ax.plot([-1], [0], color=colors[key], label=key, linewidth=1, linestyle=styles[key])
+    for n in range(1, n_moments_use):
+        ax.scatter([-1], [0], color='k', alpha=a, marker=shapes[n], s=2*marksize, label=moment_names[n])
+        n_factor = 0.1 * (n - 2)
+        if n>1:
+            ax_n = ax.twinx()
+            rot_ang = 270
+            label_space = 15.
+        else:
+            rot_ang = 90
+            label_space = 0.
+        if n>2:
+            ax_n.spines["right"].set_position(("axes", 1. + 0.1 * (n-1)))
+            make_patch_spines_invisible(ax_n)
+            ax_n.spines["right"].set_visible(True)
+        for s in range(len(formats)):
+            f = formats[s]
+            f_factor = 0.05 * (s - 1)
+#             print('pz metrics data shape '+str(pz_stats[f][n]))
+            data_arr = np.log(np.swapaxes(np.array(pz_stats[f][n]), 0, 1))#go from n_floats*instantiations to instantiations*n_floats
+            mean = np.mean(data_arr, axis=0).flatten()
+            std = np.std(data_arr, axis=0).flatten()
+            y_plus = mean + std
+            y_minus = mean - std
+            y_cor = np.array([y_minus[:-1], y_plus[:-1], y_plus[1:], y_minus[1:]])
+            ax_n.plot(np.exp(in_x+n_factor), mean, marker=shapes[n], markersize=marksize, linestyle=styles[f], alpha=a, color=colors[f])
+            ax_n.vlines(np.exp(in_x+n_factor), y_minus, y_plus, linewidth=3., alpha=a, color=colors[f])
+            pz_mean_max[n] = max(pz_mean_max[n], np.max(y_plus))
+            pz_mean_min[n] = min(pz_mean_min[n], np.min(y_minus))
+        ax_n.set_ylabel(r'$\log[\mathrm{'+moment_names[n]+r'}]$', rotation=rot_ang, fontsize=14, labelpad=label_space)
+        ax_n.set_ylim((pz_mean_min[n]-1., pz_mean_max[n]+1.))
+    ax.set_xscale('log')
+    ax.set_xticks(flat_floats)
+    ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())
+    ax.set_xlim(np.exp(min(in_x)-0.25), np.exp(max(in_x)+0.25))
+    ax.set_xlabel('number of parameters', fontsize=14)
+    ax.set_title(dataset_info[dataset_key]['name']+r' data $\log[KLD]$ log-moments', fontsize=16)
+    ax.legend(loc='lower left')
+    fig.tight_layout()
+    fig.savefig(loc+'_clean.pdf', dpi=250)
+    plt.close()
+
     fig, ax = plt.subplots()
     fig.subplots_adjust(right=1.)
     ax_n = ax
     for key in formats:
         ax_n.plot([-1], [0], color=colors[key], label=key, linestyle=styles[key], linewidth=1)
-
-    for n in range(1, 4):
-        ax.scatter([-1], [0], color='k', marker=shapes[n-1], s=marksize, label='moment '+str(n))
+    for n in range(1, n_moments_use):
+        n_factor = 0.1 * (n - 2)
+        ax.scatter([-1], [0], color='k', alpha=a, marker=shapes[n], s=2*marksize, label=moment_names[n])
         if n>1:
             ax_n = ax.twinx()
+            rot_ang = 270
+            label_space = 15.
+        else:
+            rot_ang = 90
+            label_space = 0.
         if n>2:
             ax_n.spines["right"].set_position(("axes", 1. + 0.1 * (n-1)))
             make_patch_spines_invisible(ax_n)
             ax_n.spines["right"].set_visible(True)
-        for f in formats:
-            data_arr = np.swapaxes(np.array(pz_stats[f][n]), 0, 1)#go from n_floats*instantiations to instantiations*n_floats
-            for i in data_arr:#next try plot with marker and linewidth/linestyle keywords
-                ax_n.scatter(flat_floats, i, marker=shapes[n-1], s=marksize, color=colors[f], alpha=a)#,
-                            # linewidth=1, linestyle=styles[f], edgecolor='k')
-#                 ax_n.scatter(flat_floats, i, marker=shapes[n-1], s=marksize, color='None',
-#                             linewidth=2, linestyle=styles[f], edgecolor='k', alpha=1.)
-                moment_max[n-1].append(max(i))
-        ax_n.set_ylabel('moment '+str(n), fontsize=14)
-        ax_n.set_ylim(0., max(moment_max[n-1]))
-    ax.set_xlim(min(flat_floats) - 10**int(np.log10(min(flat_floats))), max(flat_floats) + 10**int(np.log10(max(flat_floats))))
-    ax.semilogx()
+        for s in range(len(formats)):
+            f = formats[s]
+            f_factor = 0.05 * (s - 1)
+#             print('pz metrics data shape '+str(pz_stats[f][n]))
+            data_arr = np.log(np.swapaxes(np.array(pz_stats[f][n]), 0, 1))#go from n_floats*instantiations to instantiations*n_floats
+            for i in data_arr:
+                ax_n.plot(np.exp(in_x+n_factor), i, linestyle=styles[f], marker=shapes[n], markersize=marksize, color=colors[f], alpha=a)
+#                 pz_moment_max[n-1].append(max(i))
+        ax_n.set_ylabel(r'$\log[\mathrm{'+moment_names[n]+r'}]$', rotation=rot_ang, fontsize=14, labelpad=label_space)
+        ax_n.set_ylim(pz_mean_min[n]-1., pz_mean_max[n]+1.)
+    ax.set_xscale('log')
+    ax.set_xticks(flat_floats)
+    ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())
+    ax.set_xlim(np.exp(min(in_x)-0.25), np.exp(max(in_x)+0.25))
     ax.set_xlabel('number of parameters', fontsize=14)
-    ax.set_title('KLD moments on '+str(n_gals_use)+' from '+dataset_key, fontsize=16)
-    ax.legend(loc='upper left')
+    ax.set_title(dataset_info[dataset_key]['name']+r' data $\log[KLD]$ log-moments', fontsize=16)
+    ax.legend(loc='lower left')
     fig.tight_layout()
-    fig.savefig(loc+'.png', dpi=250)
+    fig.savefig(loc+'_all.pdf', dpi=250)
     plt.close()
+

+ +
+
+
+ +
+
+
+
In [16]:
+
+
+
def plot_pz_delta_moments(name, size):
+    n_gals_use = size
     
+    # should look like nz_moments
+    path = os.path.join(name, str(n_gals_use))
+    loc = os.path.join(path, 'pz_moment_deltas'+str(n_gals_use)+name)
+    with open(loc+'.hkl', 'r') as pz_file:
+        pz_stats = hickle.load(pz_file)
+    flat_floats = np.array(pz_stats['N_f']).flatten()
+    in_x = np.log(flat_floats)
+    a = 1./len(formats)
+    shapes = moment_shapes
+    marksize = 10
+    
+    def make_patch_spines_invisible(ax):
+        ax.set_frame_on(True)
+        ax.patch.set_visible(False)
+        for sp in ax.spines.values():
+            sp.set_visible(False)   
+            
     fig, ax = plt.subplots()
     fig.subplots_adjust(right=1.)
     ax_n = ax
-#     jitters = {}
-#     factors = {'quantiles':-0.1, 'histogram':0., 'samples':0.1}
     for key in formats:
-        ax_n.plot([-1], [0], color=colors[key], label=key, linewidth=1)
-#         jitters[key] = factors[key] * np.sqrt(flat_floats)
-    for n in range(1, 4):
-        ax.scatter([-1], [0], color='k', marker=shapes[n-1], s=marksize, label='moment '+str(n))
+        ax.plot([-10], [0], color=colors[key], label=key, linestyle=styles[key], linewidth=1)
+    for n in range(1, n_moments_use):
+        ax.scatter([-10], [0], color='k', alpha=a, marker=shapes[n], s=2*marksize, label=moment_names[n])
+        n_factor = 0.1 * (n - 2)
         if n>1:
             ax_n = ax.twinx()
+            rot_ang = 270
+            label_space = 15.
+        else:
+            rot_ang = 90
+            label_space = 0.
         if n>2:
             ax_n.spines["right"].set_position(("axes", 1. + 0.1 * (n-1)))
             make_patch_spines_invisible(ax_n)
             ax_n.spines["right"].set_visible(True)
-        for f in formats:
+        for s in range(len(formats)):
+            f = formats[s]
+            f_factor = 0.05 * (s - 1)
+#             print(str(np.shape(pz_stats[f][n]))+' should be n_floats * n_instantiations')
             data_arr = np.swapaxes(np.array(pz_stats[f][n]), 0, 1)#go from n_floats*instantiations to instantiations*n_floats
+#             print(str(np.shape(data_arr))+' should be n_instantiations * n_floats')
+            data_arr = np.median(data_arr, axis=2) * 100.
             mean = np.mean(data_arr, axis=0).flatten()
             std = np.std(data_arr, axis=0).flatten()
-#             x_cor = np.array([flat_floats[:-1], flat_floats[:-1], flat_floats[1:], flat_floats[1:]])
-#             y_plus = mean + std
-#             y_minus = mean - std
-#             y_cor = np.array([y_minus[:-1], y_plus[:-1], y_plus[1:], y_minus[1:]])
-            ax_n.scatter(flat_floats, mean, marker=shapes[n-1], s=marksize, alpha=2*a, color=colors[f])
-            ax_n.errorbar(flat_floats, mean, yerr=std, color=colors[f], alpha=2*a, capsize=5, elinewidth=1, linewidth=0., visible=True)
-#             ax_n.fill(x_cor, y_cor, color=colors[f], alpha=a, linewidth=0.)
-            mean_max[n-1].append(np.max(mean+std))
-        ax_n.set_ylabel('moment '+str(n), fontsize=14)
-        ax_n.set_ylim(0., np.max(np.array(mean_max[n-1])))
-    ax.set_xlim(min(flat_floats)/3., max(flat_floats)*3.)
-    ax.semilogx()
+            y_plus = mean + std
+            y_minus = mean - std
+            y_cor = np.array([y_minus[:-1], y_plus[:-1], y_plus[1:], y_minus[1:]])
+            ax_n.plot(np.exp(in_x+n_factor), mean, linestyle=styles[f], marker=shapes[n], markersize=marksize, alpha=a, color=colors[f])
+            ax_n.vlines(np.exp(in_x+n_factor), y_minus, y_plus, linewidth=3., alpha=a, color=colors[f])
+            n_delta_max[n] = max(n_delta_max[n], np.max(y_plus))
+            n_delta_min[n] = min(n_delta_min[n], np.min(y_minus))
+        ax_n.set_ylabel(r'median percent error on '+moment_names[n], rotation=rot_ang, fontsize=14, labelpad=label_space)
+        ax_n.set_ylim((min(n_delta_min)-1., max(n_delta_max)+1.))
+    ax.set_xscale('log')
+    ax.set_xticks(flat_floats)
+    ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())
+    ax.set_xlim(np.exp(min(in_x)-0.25), np.exp(max(in_x)+0.25))
     ax.set_xlabel('number of parameters', fontsize=14)
-    ax.set_title('KLD moments on '+str(n_gals_use)+' from '+dataset_key, fontsize=16)
-    ax.legend(loc='upper left')
+    ax.set_title(dataset_info[name]['name']+r' data $\hat{p}(z)$ moment errors', fontsize=16)
+    ax.legend(loc='upper right')
     fig.tight_layout()
-    fig.savefig(loc+'_clean.png', dpi=250)
+    fig.savefig(loc+'_clean.pdf', dpi=250)
+    plt.close()
+            
+    fig, ax = plt.subplots()
+    fig.subplots_adjust(right=1.)
+    ax_n = ax
+    for key in formats:
+        ax_n.plot([-10], [0], color=colors[key], label=key, linestyle=styles[key], linewidth=1)
+    for n in range(1, n_moments_use):
+        n_factor = 0.1 * (n - 2)
+        ax.scatter([-10], [0], color='k', alpha=a, marker=shapes[n], s=2*marksize, label=moment_names[n])
+        if n>1:
+            ax_n = ax.twinx()
+            rot_ang = 270
+            label_space = 15.
+        else:
+            rot_ang = 90
+            label_space = 0.
+        if n>2:
+            ax_n.spines["right"].set_position(("axes", 1. + 0.1 * (n-1)))
+            make_patch_spines_invisible(ax_n)
+            ax_n.spines["right"].set_visible(True)
+        for s in range(len(formats)):
+            f = formats[s]
+            f_factor = 0.05 * (s - 1)
+            data_arr = np.swapaxes(np.array(pz_stats[f][n]), 0, 1)
+            data_arr = np.median(data_arr, axis=2) * 100.
+            for i in data_arr:
+                ax_n.plot(np.exp(in_x+n_factor), i, linestyle=styles[f], marker=shapes[n], markersize=marksize, color=colors[f], alpha=a)
+        ax_n.set_ylabel(r'median percent error on '+moment_names[n], rotation=rot_ang, fontsize=14, labelpad=label_space)
+        ax_n.set_ylim((min(n_delta_min)-1., max(n_delta_max)+1.))
+    ax.set_xscale('log')
+    ax.set_xticks(flat_floats)
+    ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())
+    ax.set_xlim(np.exp(min(in_x)-0.25), np.exp(max(in_x)+0.25))
+    ax.set_xlabel('number of parameters', fontsize=14)
+    ax.set_title(dataset_info[name]['name']+r' data $\hat{p}(z)$ moment errors', fontsize=16)
+    ax.legend(loc='upper right')
+    fig.tight_layout()
+    fig.savefig(loc+'_all.pdf', dpi=250)
     plt.close()
 
@@ -12631,13 +12782,13 @@

Scaling

-
In [19]:
+
In [17]:
-
def save_nz_metrics(dataset_key, n_gals_use, N_f, nz_klds):
+
def save_nz_metrics(dataset_key, n_gals_use, N_f, nz_klds, stat_name):
     
     path = os.path.join(dataset_key, str(n_gals_use))
-    loc = os.path.join(path, 'nz_kld'+str(n_gals_use)+dataset_key)
+    loc = os.path.join(path, stat_name+str(n_gals_use)+dataset_key)
     if os.path.exists(loc+'.hkl'):
         with open(loc+'.hkl', 'r') as nz_file:
         #read in content of list/dict
@@ -12669,23 +12820,26 @@ 

Scaling

-
In [20]:
+
In [18]:
-
def plot_nz_metrics(dataset_key, n_gals_use):
+
def plot_nz_klds(dataset_key, n_gals_use):
     
     path = os.path.join(dataset_key, str(n_gals_use))
-    loc = os.path.join(path, 'nz_kld'+str(n_gals_use)+dataset_key)
+    loc = os.path.join(path, 'nz_klds'+str(n_gals_use)+dataset_key)
     with open(loc+'.hkl', 'r') as nz_file:
         nz_stats = hickle.load(nz_file)
+    if len(instantiations) == 10:
+        for f in formats:
+            if not np.shape(nz_stats[f]) == (4, 10):
+                for s in range(len(floats)):
+                    nz_stats[f][s] = np.array(np.array(nz_stats[f][s])[:10]).flatten()
 
     flat_floats = np.array(nz_stats['N_f']).flatten()
     
     plt.figure(figsize=(5, 5))
-
     for f in formats:
-#     mu = np.mean(np.array(nz_stats[dataset_key][f]), axis=0)
-#     sigma = np.std(np.array(nz_stats[dataset_key][f]), axis=0)
+#         print('nz klds data shape '+str(nz_stats[f][n]))
         data_arr = np.swapaxes(np.array(nz_stats[f]), 0, 1)#turn N_f * instantiations into instantiations * N_f
         n_i = len(data_arr)
         a = 1./len(formats)#1./n_i
@@ -12696,21 +12850,20 @@ 

Scaling

             kld_max.append(max(i))
     plt.semilogy()
     plt.semilogx()
+    plt.xticks(flat_floats, [str(ff) for ff in flat_floats])
     plt.ylim(min(kld_min) / 10., 10. * max(kld_max))
     plt.xlim(min(flat_floats) / 3., max(flat_floats) * 3.)
     plt.xlabel(r'number of parameters', fontsize=14)
     plt.ylabel(r'KLD', fontsize=14)
     plt.legend(loc='upper right')
-    plt.title(r'$\hat{n}(z)$ KLD on '+str(n_gals_use)+' from '+dataset_key, fontsize=16)
-
-    plt.savefig(loc+'.png', dpi=250)
+    plt.title(r'$\hat{n}(z)$ KLD on '+str(n_gals_use)+' from '+dataset_info[dataset_key]['name']+' mock catalog', fontsize=16)
+    plt.savefig(loc+'_all.pdf', dpi=250)
     plt.close()
 
     plt.figure(figsize=(5, 5))
     a = 1./len(formats)
     for f in formats:
-#         mu = np.mean(np.array(nz_stats[dataset_key][f]), axis=0)
-#         sigma = np.std(np.array(nz_stats[dataset_key][f]), axis=0)
+#         print('nz klds data shape '+str(nz_stats[f][n]))
         data_arr = np.swapaxes(np.array(nz_stats[f]), 0, 1)#turn N_f * instantiations into instantiations * N_f
         plt.plot([10. * max(flat_floats), 10. * max(flat_floats)], [1., 10.], color=colors[f], label=f, linestyle=styles[f])
         kld_min.append(np.min(data_arr))
@@ -12725,14 +12878,148 @@

Scaling

         plt.fill(x_cor, y_cor, color=colors[f], alpha=a, linewidth=0.)
     plt.semilogy()
     plt.semilogx()
+    plt.xticks(flat_floats, [str(ff) for ff in flat_floats])
     plt.ylim(min(kld_min) / 10., 10. * max(kld_max))
-    plt.xlim(min(flat_floats) / 3., max(flat_floats) * 3.)
+    plt.xlim(min(flat_floats), max(flat_floats))
     plt.xlabel(r'number of parameters', fontsize=14)
     plt.ylabel(r'KLD', fontsize=14)
     plt.legend(loc='upper right')
-    plt.title(r'$\hat{n}(z)$ KLD on '+str(n_gals_use)+' from '+dataset_key, fontsize=16)
+    plt.title(dataset_info[dataset_key]['name']+r' data $\hat{n}(z)$ KLD', fontsize=16)
+    plt.savefig(loc+'_clean.pdf', dpi=250)
+    plt.close()
+

+ +
+
+
+ +
+
+
+
In [19]:
+
+
+
def plot_nz_moments(dataset_key, n_gals_use):
 
-    plt.savefig(loc+'_clean.png', dpi=250)
+    path = os.path.join(dataset_key, str(n_gals_use))
+    loc = os.path.join(path, 'nz_moments'+str(n_gals_use)+dataset_key)
+    with open(loc+'.hkl', 'r') as nz_file:
+        nz_stats = hickle.load(nz_file)
+    flat_floats = np.array(nz_stats['N_f']).flatten()
+    in_x = np.log(flat_floats)
+    a = 1./len(formats)
+    shapes = moment_shapes
+    marksize = 10
+    
+    def make_patch_spines_invisible(ax):
+        ax.set_frame_on(True)
+        ax.patch.set_visible(False)
+        for sp in ax.spines.values():
+            sp.set_visible(False)   
+            
+    fig, ax = plt.subplots()
+    fig.subplots_adjust(right=1.)
+    ax_n = ax
+    for key in formats:
+        ax.plot([-10], [0], color=colors[key], label=key, linestyle=styles[key], linewidth=1)
+#     ax.plot([-10], [0], color='k', label='original', linewidth=0.5, alpha=1.)
+    for n in range(1, n_moments_use):
+        ax.scatter([-10], [0], color='k', alpha=a, marker=shapes[n], s=2*marksize, label=moment_names[n])
+        n_factor = 0.1 * (n - 2)
+        truth = np.swapaxes(np.array(nz_stats['truth'][n]), 0, 1)
+        if n>1:
+            ax_n = ax.twinx()
+            rot_ang = 270
+            label_space = 15.
+        else:
+            rot_ang = 90
+            label_space = 0.
+        if n>2:
+            ax_n.spines["right"].set_position(("axes", 1. + 0.1 * (n-1)))
+            make_patch_spines_invisible(ax_n)
+            ax_n.spines["right"].set_visible(True)
+        for s in range(len(formats)):
+            f = formats[s]
+            f_factor = 0.05 * (s - 1)
+#             print('nz moments data shape '+str(nz_stats[f][n]))
+            data_arr = (np.swapaxes(np.array(nz_stats[f][n]), 0, 1) - truth) / truth * 100.#np.log(np.swapaxes(np.array(nz_stats[f]), 0, 1)[:][:][n])#go from n_floats*instantiations to instantiations*n_floats
+            mean = np.mean(data_arr, axis=0).flatten()
+            std = np.std(data_arr, axis=0).flatten()
+            y_plus = mean + std
+            y_minus = mean - std
+            y_cor = np.array([y_minus[:-1], y_plus[:-1], y_plus[1:], y_minus[1:]])
+            ax_n.plot(np.exp(in_x+n_factor), mean, linestyle=styles[f], marker=shapes[n], markersize=marksize, alpha=a, color=colors[f])
+            ax_n.vlines(np.exp(in_x+n_factor), y_minus, y_plus, linewidth=3., alpha=a, color=colors[f])
+            nz_mean_max[n] = max(nz_mean_max[n], np.max(y_plus))
+            nz_mean_min[n] = min(nz_mean_min[n], np.min(y_minus))
+#         data_arr = np.log(np.swapaxes(np.array(nz_stats['truth'][n]), 0, 1))
+#         mean = np.mean(data_arr, axis=0).flatten()
+#         std = np.std(data_arr, axis=0).flatten()
+#         y_plus = mean + std
+#         y_minus = mean - std
+#         y_cor = np.array([y_minus[:-1], y_plus[:-1], y_plus[1:], y_minus[1:]])
+#         ax_n.plot(np.exp(in_x+n_factor), mean, linestyle='-', marker=shapes[n], markersize=marksize, alpha=a, color='k', linewidth=0.5)
+#         ax_n.vlines(np.exp(in_x+n_factor), y_minus, y_plus, linewidth=3., alpha=a, color='k')
+#         nz_mean_max[n] = max(nz_mean_max[n], np.max(y_plus))
+#         nz_mean_min[n] = min(nz_mean_min[n], np.min(y_minus))
+#         ax_n.plot(np.exp(in_x+n_factor), np.log(nz_stats['truth'][n]), linestyle='-', marker=shapes[n], markersize=marksize, alpha=a, linewidth=0.5, color='k')
+        ax_n.set_ylabel(r'percent error on '+moment_names[n], rotation=rot_ang, fontsize=14, labelpad=label_space)
+        ax_n.set_ylim((min(nz_mean_min)-1., max(nz_mean_max)+1.))
+    ax.set_xscale('log')
+    ax.set_xticks(flat_floats)
+    ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())
+    ax.set_xlim(np.exp(min(in_x)-0.25), np.exp(max(in_x)+0.25))
+    ax.set_xlabel('number of parameters', fontsize=14)
+    ax.set_title(dataset_info[dataset_key]['name']+r' data $\hat{n}(z)$ moments', fontsize=16)
+    ax.legend(loc='upper right')
+    fig.tight_layout()
+    fig.savefig(loc+'_clean.pdf', dpi=250)
+    plt.close()
+            
+    fig, ax = plt.subplots()
+    fig.subplots_adjust(right=1.)
+    ax_n = ax
+    for key in formats:
+        ax_n.plot([-10], [0], color=colors[key], label=key, linestyle=styles[key], linewidth=1)
+#     ax.plot([-10], [0], color='k', label='original', linewidth=0.5, alpha=1.)
+    for n in range(1, n_moments_use):
+        n_factor = 0.1 * (n - 2)
+        ax.scatter([-10], [0], color='k', alpha=a, marker=shapes[n], s=2*marksize, label=moment_names[n])
+        truth = np.swapaxes(np.array(nz_stats['truth'][n]), 0, 1)
+        if n>1:
+            ax_n = ax.twinx()
+            rot_ang = 270
+            label_space = 15.
+        else:
+            rot_ang = 90
+            label_space = 0.
+        if n>2:
+            ax_n.spines["right"].set_position(("axes", 1. + 0.1 * (n-1)))
+            make_patch_spines_invisible(ax_n)
+            ax_n.spines["right"].set_visible(True)
+        for s in range(len(formats)):
+            f = formats[s]
+            f_factor = 0.05 * (s - 1)
+#             print('nz moments data shape '+str(nz_stats[f][n]))
+            data_arr = (np.swapaxes(np.array(nz_stats[f][n]), 0, 1) - truth) / truth * 100.
+            for i in data_arr:
+                ax_n.plot(np.exp(in_x+n_factor), i, linestyle=styles[f], marker=shapes[n], markersize=marksize, color=colors[f], alpha=a)
+#                 nz_moment_max[n-1].append(max(i))
+        data_arr = np.log(np.swapaxes(np.array(nz_stats['truth'][n]), 0, 1))
+#         for i in data_arr:
+#             ax_n.plot(np.exp(in_x+n_factor), i, linestyle='-', marker=shapes[n], markersize=marksize, color='k', alpha=a)
+# #         ax_n.plot(np.exp(in_x+n_factor), np.log(nz_stats['truth'][n]), linestyle='-', marker=shapes[n], markersize=marksize, alpha=a, linewidth=0.5, color='k')
+        ax_n.set_ylabel(r'percent error on '+moment_names[n], rotation=rot_ang, fontsize=14, labelpad=label_space)
+        ax_n.set_ylim((min(nz_mean_min)-1., max(nz_mean_max)+1.))
+    ax.set_xscale('log')
+    ax.set_xticks(flat_floats)
+    ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())
+    ax.set_xlim(np.exp(min(in_x)-0.25), np.exp(max(in_x)+0.25))
+    ax.set_xlabel('number of parameters', fontsize=14)
+    ax.set_title(dataset_info[dataset_key]['name']+r' data $\hat{n}(z)$ moments', fontsize=16)
+    ax.legend(loc='upper right')
+    fig.tight_layout()
+    fig.savefig(loc+'_all.pdf', dpi=250)
     plt.close()
 
@@ -12753,18 +13040,61 @@

-
In [21]:
+
In [20]:
-
floats = [3, 10, 30, 100]
-sizes = [100]#, 1000, 10000]
-names = ['Optical']
-instantiations = range(0, 2)#10)
+
dataset_info = {}
+delta = 0.01
 
-#many_colors = ['red','green','blue','cyan','magenta','yellow']
-high_res = 300
-n_plot = 5
+dataset_keys = ['mg', 'ss']
+
+for dataset_key in dataset_keys:
+    dataset_info[dataset_key] = {}
+    if dataset_key == 'mg':
+        datafilename = 'bpz_euclid_test_10_3.probs'
+        z_low = 0.01
+        z_high = 3.51
+        nc_needed = 3
+        plotname = 'brighter'
+        skip_rows = 1
+        skip_cols = 1
+    elif dataset_key == 'ss':
+        datafilename = 'test_magscat_trainingfile_probs.out'
+        z_low = 0.005
+        z_high = 2.11
+        nc_needed = 5
+        plotname = 'fainter'
+        skip_rows = 1
+        skip_cols = 1
+    dataset_info[dataset_key]['filename'] = datafilename  
+    
+    dataset_info[dataset_key]['z_lim'] = (z_low, z_high)
+    z_grid = np.arange(z_low, z_high, delta, dtype='float')#np.arange(z_low, z_high + delta, delta, dtype='float')
+    z_range = z_high - z_low
+    delta_z = z_range / len(z_grid)
+    dataset_info[dataset_key]['z_grid'] = z_grid
+    dataset_info[dataset_key]['delta_z'] = delta_z
+
+    dataset_info[dataset_key]['N_GMM'] = nc_needed# will be overwritten later
+    dataset_info[dataset_key]['name'] = plotname
+
+ +
+
+
+ +
+
+
+
In [21]:
+
+
+
high_res = 300
+color_cycle = np.array([(230, 159, 0), (86, 180, 233), (0, 158, 115), (240, 228, 66), (0, 114, 178), (213, 94, 0), (204, 121, 167)])/256.
+n_plot = len(color_cycle)
 n_moments_use = 4
+moment_names = ['integral', 'mean', 'variance', 'kurtosis']
+moment_shapes = ['o', '*', 'P', 'X']
 
 #make this a more clever structure, i.e. a dict
 formats = ['quantiles', 'histogram', 'samples']
@@ -12772,15 +13102,46 @@ 

 styles = {'quantiles': '--', 'histogram': ':', 'samples': '-.'}
 stepstyles = {'quantiles': 'dashed', 'histogram': 'dotted', 'samples': 'dashdot'}
+formats_plus = ['quantiles', 'histogram', 'samples', 'truth']
+colors_plus = {'quantiles': 'blueviolet', 'histogram': 'darkorange', 'samples': 'forestgreen', 'truth':'black'}
+styles_plus = {'quantiles': '--', 'histogram': ':', 'samples': '-.', 'truth': '-'}
+
+iqr_min = [3.5]
+iqr_max = [delta]
+modes_max = [0]
 pz_max = [1.]
 nz_max = [1.]
 hist_max = [1.]
 dist_min = [0.]
 dist_max = [0.]
-moment_max = [[]] * (n_moments_use - 1)
-mean_max = [[]] * (n_moments_use - 1)
+pz_mean_max = -10.*np.ones(n_moments_use)
+pz_mean_min = 10.*np.ones(n_moments_use)
 kld_min = [1.]
 kld_max = [1.]
+nz_mean_max = -10.*np.ones(n_moments_use)
+nz_mean_min = 10.*np.ones(n_moments_use)
+n_delta_max = -10.*np.ones(n_moments_use)
+n_delta_min = 10.*np.ones(n_moments_use)
+

+ +
+
+
+ +
+
+
+
In [22]:
+
+
+
#change all for NERSC
+
+floats = [3, 10, 30, 100]
+sizes = [10]#[10, 100, 1000]
+names = dataset_info.keys()
+instantiations = range(2, 3)#0)
+
+all_randos = [[np.random.choice(size, n_plot, replace=False) for size in sizes] for name in names]
 
@@ -12801,22 +13162,24 @@

-
In [22]:
+
In [23]:
# the "pipeline"
-
-for name in names:
+global_start = timeit.default_timer()
+for n in range(len(names)):
+    name = names[n]
     
     dataset_start = timeit.default_timer()
     print('started '+name)
     
-    pdfs = setup_dataset(name)
+    pdfs = setup_dataset(name, skip_rows, skip_cols)
     
-    for size in sizes:
+    for s in range(len(sizes)):
+        size=sizes[s]
         
         size_start = timeit.default_timer()
-        print('started '+str(size)+name)
+        print('started '+name+str(size))
         
         path = os.path.join(name, str(size))
         if not os.path.exists(path):
@@ -12824,49 +13187,54 @@ 

         n_gals_use = size
-        randos = np.random.choice(size, n_plot, replace=False)
+        randos = all_randos[n][s]
 
         for i in instantiations:
+#             top_bonusdict = {}
+            i_start = timeit.default_timer()
+            print('started '+name+str(size)+' #'+str(i))
 
-            original = '_original_('+str(i)+')'
+            original = '_original'+str(i)
             pdfs_use = make_instantiation(name, size, pdfs, bonus=original)
-            plot = plot_examples(size, name, bonus=original)
+#             plot = plot_examples(size, name, bonus=original)
+#             top_bonusdict[original] = ['-', 0.25]
 
             z_grid = dataset_info[name]['in_z_grid']
             N_comps = dataset_info[name]['N_GMM']
-            postfit = '_post-fit_('+str(i)+')'
+            postfit = '_postfit'+str(i)
             catalog = setup_from_grid(name, pdfs_use, z_grid, N_comps, high_res=high_res, bonus=postfit)
-            plot = plot_examples(size, name, bonus=postfit)
+#             plot = plot_examples(size, name, bonus=postfit)
+#             top_bonusdict[postfit] = ['-', 0.5]
 
             for n_floats_use in floats:
-
+#                 bonusdict = top_bonusdict.copy()
                 float_start = timeit.default_timer()
-                print('started '+str(size)+name+str(n_floats_use)+'\#'+str(i))
+                print('started '+name+str(size)+' #'+str(i)+' with '+str(n_floats_use))
 
-                (ensembles, pz_klds, metric_moments) = analyze_individual(catalog,
-                                                                          z_grid,#dataset_info[name]['metric_z_grid'],
-                                                                          n_floats_use, name, n_moments_use, i=i)
-                for f in formats:
-                    fname = '_'+str(n_floats_use)+f+'_('+str(i)+')'
-                    plot = plot_examples(size, name, bonus=fname)
-                plot = plot_individual(size, name, n_floats_use, i=i)
-                save_pz_metrics(name, size, n_floats_use, metric_moments)
+                ensembles = analyze_individual(catalog, z_grid, n_floats_use, name, n_moments_use, i=i, bonus=postfit)
+
+#                 for f in formats:
+#                     fname = str(n_floats_use)+f+str(i)
+#                     plot = plot_examples(size, name, bonus=fname)
+#                     bonusdict[fname] = [styles[f], 0.5]
+#                 plot = plot_all_examples(name, size, n_floats_use, i, bonus=bonusdict)
+#                 plot = plot_individual_kld(size, name, n_floats_use, i=i)
 
-                (stack_evals, nz_klds) = analyze_stacked(catalog, ensembles, z_grid,#dataset_info[name]['metric_z_grid'],
-                                                         n_floats_use, name, i=i)
-                plot = plot_estimators(size, name, n_floats_use, i=i)
-                save_nz_metrics(name, size, n_floats_use, nz_klds)
+                stack_evals = analyze_stacked(catalog, ensembles, z_grid, n_floats_use, name, i=i)
+#                 plot = plot_estimators(size, name, n_floats_use, i=i)
 
-                print('finished '+str(size)+name+str(n_floats_use)+' in '+str(timeit.default_timer() - float_start))
-
-        plot = plot_pz_metrics(name, size)
-
-        plot = plot_nz_metrics(name, size)
+                print('FINISHED '+name+str(size)+' #'+str(i)+' with '+str(n_floats_use)+' in '+str(timeit.default_timer() - float_start))
+            print('FINISHED '+name+str(size)+' #'+str(i)+' in '+str(timeit.default_timer() - i_start))
+#         plot = plot_pz_metrics(name, size)
+#         plot = plot_pz_delta_moments(name, size)
+#         plot = plot_nz_klds(name, size)
+#         plot = plot_nz_moments(name, size)
 
-        print('finished '+str(size)+name+' in '+str(timeit.default_timer() - size_start))
+        print('FINISHED '+name+str(size)+' in '+str(timeit.default_timer() - size_start))
 
-    print('finished '+name+' in '+str(timeit.default_timer() - dataset_start))
+    print('FINISHED '+name+' in '+str(timeit.default_timer() - dataset_start))
++print('FINISHED everything in '+str(timeit.default_timer() - global_start))
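Stripped of the commented-out plotting calls, the pipeline cell above is a four-level sweep over datasets, catalog sizes, instantiations, and numbers of stored parameters, with timing printed at each level. The skeleton below shows only that control flow; run_pipeline and the analyze placeholder are illustrative names, not functions from the notebook.

import timeit

def run_pipeline(names, sizes, instantiations, floats, analyze):
    """Skeleton of the nested sweep: dataset x catalog size x instantiation x N_f."""
    global_start = timeit.default_timer()
    for name in names:
        for size in sizes:
            for i in instantiations:
                for n_floats_use in floats:
                    float_start = timeit.default_timer()
                    # per-configuration work: approximation, metrics, persistence
                    analyze(name, size, i, n_floats_use)
                    print('FINISHED '+name+str(size)+' #'+str(i)+' with '+str(n_floats_use)
                          +' in '+str(timeit.default_timer() - float_start))
    print('FINISHED everything in '+str(timeit.default_timer() - global_start))

# hypothetical usage mirroring the settings above, with a no-op analysis step
run_pipeline(['ss', 'mg'], [10], range(2, 3), [3, 10, 30, 100],
             lambda name, size, i, n_f: None)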

@@ -12879,266 +13247,369 @@

-
started Optical
-started 100Optical
-making the initial ensemble of 100 PDFs
-made the pool of 4 in 0.151180028915
-made the catalog in 0.528388977051
-made the initial ensemble of 100 PDFs
-sampling for the GMM fit
-took 300 samples
-making a new ensemble from samples
-made the pool of 4 in 0.00019097328186
-made the catalog in 0.566546916962
-made a new ensemble from samples
-fitting the GMM to samples
-fit the GMM to samples
-making the final ensemble
-made the pool of 4 in 0.000116109848022
-made the catalog in 25.0085771084
-made the final ensemble
-started 100Optical3\#0
-performing quantization
-finished quantization at 48.7826230526
-performing histogramization
-finished histogramization at 36.4276270866
-performing sampling
-finished sampling at 36.4227459431
-making the approximate ensembles
-made the pool of 4 in 0.000101089477539
-made the catalog in 17.6193859577
-made the pool of 4 in 9.51290130615e-05
-made the catalog in 17.8442480564
-made the pool of 4 in 0.000105857849121
-made the catalog in 17.8148469925
-made the approximate ensembles
-calculating the individual metrics
-starting quantiles
-finished with quantiles
-starting samples
-finished with samples
-starting histogram
-finished with histogram
-calculated the individual metrics in 54.539247036
-stacking the ensembles
-stacked the ensembles in 74.3008868694
-calculating the metrics
-calculated the metrics in 0.00213098526001
-finished 100Optical3 in 414.400747061
-started 100Optical10\#0
-performing quantization
-finished quantization at 28.1276221275
-performing histogramization
-finished histogramization at 17.8713190556
-performing sampling
-finished sampling at 18.5625870228
-making the approximate ensembles
-made the pool of 4 in 7.48634338379e-05
-made the catalog in 9.87148594856
-made the pool of 4 in 6.00814819336e-05
-made the catalog in 10.9558501244
-made the pool of 4 in 6.69956207275e-05
-made the catalog in 9.58238005638
-made the approximate ensembles
-calculating the individual metrics
-starting quantiles
-finished with quantiles
-starting samples
-finished with samples
-starting histogram
-finished with histogram
-calculated the individual metrics in 51.0830500126
-stacking the ensembles
-stacked the ensembles in 66.6083409786
-calculating the metrics
-calculated the metrics in 0.00159907341003
-finished 100Optical10 in 279.833028078
-started 100Optical30\#0
-performing quantization
-finished quantization at 39.2002589703
-performing histogramization
-finished histogramization at 16.4590818882
-performing sampling
-finished sampling at 17.0624511242
-making the approximate ensembles
+
started ss
+read in data file in 11.2739341259
+started ss10
+started ss10 #2
+randos for debugging: [21538 91754 37805 55875  5972 56011 72367 67397 25966 71019]
+preprocessed data in 0.0339078903198
+made the pool of 4 in 0.143560171127
+made the catalog in 0.0556938648224
+made the initial ensemble of 10 PDFs in 0.200202941895
+took 300 samples in 1.17442893982
+made the pool of 4 in 9.79900360107e-05
+made the catalog in 0.0416679382324
+made a new ensemble from samples in 0.0423328876495
+fit the GMM to samples in 0.167675018311
+made the pool of 4 in 2.90870666504e-05
+made the catalog in 0.769010782242
+made the final ensemble in 0.769263029099
+calculated 4 moments of original PDFs in 3.48636102676
+started ss10 #2 with 3
+finished making in 0.969101905823
+finished histogramization in 0.932945013046
+finished sampling in 2.92821788788
 made the pool of 4 in 5.10215759277e-05
-made the catalog in 8.61182880402
-made the pool of 4 in 5.00679016113e-05
-made the catalog in 8.71334505081
+made the catalog in 0.376241922379
+made quantiles ensemble in 1.13677692413
+made the pool of 4 in 5.10215759277e-05
+made the catalog in 0.425587892532
+made histogram ensemble in 1.29570603371
+made the pool of 4 in 6.31809234619e-05
+made the catalog in 0.48353600502
+made samples ensemble in 1.30219697952
+calculated the quantiles individual moments, kld moments in 4.10330486298
+calculated the samples individual moments, kld moments in 4.03081297874
+calculated the histogram individual moments, kld moments in 4.66959190369
+stacked quantiles in 0.880034923553
+stacked histogram in 0.823406934738
+stacked samples in 0.851178884506
+stacked truth in 1.29819512367
+calculated the quantiles stacked kld in 0.00104308128357
+calculated the histogram stacked kld in 0.000702142715454
+calculated the samples stacked kld in 0.000648021697998
+calculated the quantiles stacked moments in 0.000274896621704
+calculated the quantiles stacked moments in 0.000715970993042
+calculated the quantiles stacked moments in 0.00103878974915
+calculated the quantiles stacked moments in 0.00144100189209
+calculated the histogram stacked moments in 0.000185012817383
+calculated the histogram stacked moments in 0.000486135482788
+calculated the histogram stacked moments in 0.000741004943848
+calculated the histogram stacked moments in 0.00104212760925
+calculated the samples stacked moments in 0.000212907791138
+calculated the samples stacked moments in 0.000457048416138
+calculated the samples stacked moments in 0.00075101852417
+calculated the samples stacked moments in 0.00113701820374
+calculated the truth stacked moments in 0.000216007232666
+calculated the truth stacked moments in 0.000505924224854
+calculated the truth stacked moments in 0.000817060470581
+calculated the truth stacked moments in 0.00114703178406
+FINISHED ss10 #2 with 3 in 25.5616438389
+started ss10 #2 with 10
+finished making in 0.969255208969
+finished histogramization in 1.01541614532
+finished sampling in 1.30525708199
+made the pool of 4 in 6.103515625e-05
+made the catalog in 0.468002796173
+made quantiles ensemble in 1.53850412369
+made the pool of 4 in 0.000102043151855
+made the catalog in 0.702124118805
+made histogram ensemble in 1.95024609566
+made the pool of 4 in 5.88893890381e-05
+made the catalog in 0.522737979889
+made samples ensemble in 1.39210891724
+calculated the quantiles individual moments, kld moments in 4.09236812592
+calculated the samples individual moments, kld moments in 4.13031983376
+calculated the histogram individual moments, kld moments in 5.16090512276
+stacked quantiles in 0.836797952652
+stacked histogram in 0.853886842728
+stacked samples in 0.849081993103
+stacked truth in 0.855072021484
+calculated the quantiles stacked kld in 0.000715017318726
+calculated the histogram stacked kld in 0.000553131103516
+calculated the samples stacked kld in 0.000545978546143
+calculated the quantiles stacked moments in 0.000257968902588
+calculated the quantiles stacked moments in 0.000645875930786
+calculated the quantiles stacked moments in 0.000994920730591
+calculated the quantiles stacked moments in 0.00130200386047
+calculated the histogram stacked moments in 0.000175952911377
+calculated the histogram stacked moments in 0.000493049621582
+calculated the histogram stacked moments in 0.000676155090332
+calculated the histogram stacked moments in 0.000903129577637
+calculated the samples stacked moments in 0.0001380443573
+calculated the samples stacked moments in 0.000327110290527
+calculated the samples stacked moments in 0.000588178634644
+calculated the samples stacked moments in 0.000787019729614
+calculated the truth stacked moments in 0.000263929367065
+calculated the truth stacked moments in 0.000633001327515
+calculated the truth stacked moments in 0.000967979431152
+calculated the truth stacked moments in 0.0012469291687
+FINISHED ss10 #2 with 10 in 25.4388132095
+started ss10 #2 with 30
+finished making in 0.874783039093
+finished histogramization in 1.15561914444
+finished sampling in 0.931715965271
+made the pool of 4 in 5.79357147217e-05
+made the catalog in 0.467746973038
+made quantiles ensemble in 1.32912802696
 made the pool of 4 in 6.41345977783e-05
-made the catalog in 8.38162398338
-made the approximate ensembles
-calculating the individual metrics
-starting quantiles
-finished with quantiles
-starting samples
-finished with samples
-starting histogram
-finished with histogram
-calculated the individual metrics in 52.6996619701
-stacking the ensembles
-stacked the ensembles in 64.1746098995
-calculating the metrics
-calculated the metrics in 0.00199699401855
-finished 100Optical30 in 270.532145023
-started 100Optical100\#0
-performing quantization
-finished quantization at 77.1992280483
-performing histogramization
-finished histogramization at 15.8513290882
-performing sampling
-finished sampling at 16.1518719196
-making the approximate ensembles
-made the pool of 4 in 5.00679016113e-05
-made the catalog in 8.22205805779
-made the pool of 4 in 4.72068786621e-05
-made the catalog in 8.31369900703
-made the pool of 4 in 4.72068786621e-05
-made the catalog in 8.31879496574
-made the approximate ensembles
-calculating the individual metrics
-starting quantiles
-finished with quantiles
-starting samples
-finished with samples
-starting histogram
-finished with histogram
-calculated the individual metrics in 49.796243906
-stacking the ensembles
-stacked the ensembles in 64.499352932
-calculating the metrics
-calculated the metrics in 0.00170803070068
-finished 100Optical100 in 302.587764025
-making the initial ensemble of 100 PDFs
-made the pool of 4 in 5.81741333008e-05
-made the catalog in 0.163774013519
-made the initial ensemble of 100 PDFs
-sampling for the GMM fit
-took 300 samples
-making a new ensemble from samples
-made the pool of 4 in 6.29425048828e-05
-made the catalog in 0.193228006363
-made a new ensemble from samples
-fitting the GMM to samples
-fit the GMM to samples
-making the final ensemble
-made the pool of 4 in 6.50882720947e-05
-made the catalog in 8.19308781624
-made the final ensemble
-started 100Optical3\#1
-performing quantization
-finished quantization at 22.5612850189
-performing histogramization
-finished histogramization at 15.9950089455
-performing sampling
-finished sampling at 15.6737070084
-making the approximate ensembles
+made the catalog in 0.474547147751
+made histogram ensemble in 1.61009693146
+made the pool of 4 in 8.10623168945e-05
+made the catalog in 0.535789012909
+made samples ensemble in 1.46645712852
+calculated the quantiles individual moments, kld moments in 8.42942214012
+calculated the samples individual moments, kld moments in 6.12721705437
+calculated the histogram individual moments, kld moments in 4.38112401962
+stacked quantiles in 0.854387044907
+stacked histogram in 0.876643896103
+stacked samples in 0.838201999664
+stacked truth in 0.777998924255
+calculated the quantiles stacked kld in 0.000617027282715
+calculated the histogram stacked kld in 0.000613927841187
+calculated the samples stacked kld in 0.000825881958008
+calculated the quantiles stacked moments in 0.000220060348511
+calculated the quantiles stacked moments in 0.000627040863037
+calculated the quantiles stacked moments in 0.000865936279297
+calculated the quantiles stacked moments in 0.00115704536438
+calculated the histogram stacked moments in 0.000209808349609
+calculated the histogram stacked moments in 0.000463962554932
+calculated the histogram stacked moments in 0.000783920288086
+calculated the histogram stacked moments in 0.00114488601685
+calculated the samples stacked moments in 0.000169992446899
+calculated the samples stacked moments in 0.000415086746216
+calculated the samples stacked moments in 0.00068211555481
+calculated the samples stacked moments in 0.000911951065063
+calculated the truth stacked moments in 0.000155925750732
+calculated the truth stacked moments in 0.000379085540771
+calculated the truth stacked moments in 0.000571966171265
+calculated the truth stacked moments in 0.00083589553833
+FINISHED ss10 #2 with 30 in 30.3066999912
+started ss10 #2 with 100
+finished making in 2.44939208031
+finished histogramization in 0.818332910538
+finished sampling in 0.83979511261
+made the pool of 4 in 5.3882598877e-05
+made the catalog in 0.529491901398
+made quantiles ensemble in 1.34206700325
 made the pool of 4 in 5.29289245605e-05
-made the catalog in 8.5415558815
-made the pool of 4 in 4.91142272949e-05
-made the catalog in 8.23108196259
-made the pool of 4 in 5.79357147217e-05
-made the catalog in 9.7219440937
-made the approximate ensembles
-calculating the individual metrics
-starting quantiles
-finished with quantiles
-starting samples
-finished with samples
-starting histogram
-finished with histogram
-calculated the individual metrics in 66.3453202248
-stacking the ensembles
-stacked the ensembles in 86.0212540627
-calculating the metrics
-calculated the metrics in 0.00229501724243
-finished 100Optical3 in 290.403768063
-started 100Optical10\#1
-performing quantization
-finished quantization at 29.4174051285
-performing histogramization
-finished histogramization at 18.0480341911
-performing sampling
-finished sampling at 23.3950479031
-making the approximate ensembles
+made the catalog in 0.467664003372
+made histogram ensemble in 1.25780892372
+made the pool of 4 in 5.29289245605e-05
+made the catalog in 0.435877799988
+made samples ensemble in 1.25956916809
+calculated the quantiles individual moments, kld moments in 6.02705717087
+calculated the samples individual moments, kld moments in 5.44899702072
+calculated the histogram individual moments, kld moments in 4.46730804443
+stacked quantiles in 1.02569699287
+stacked histogram in 0.787800073624
+stacked samples in 1.18201804161
+stacked truth in 1.13130617142
+calculated the quantiles stacked kld in 0.00105404853821
+calculated the histogram stacked kld in 0.000844955444336
+calculated the samples stacked kld in 0.000648021697998
+calculated the quantiles stacked moments in 0.000410079956055
+calculated the quantiles stacked moments in 0.000878095626831
+calculated the quantiles stacked moments in 0.00110411643982
+calculated the quantiles stacked moments in 0.00160813331604
+calculated the histogram stacked moments in 0.000263929367065
+calculated the histogram stacked moments in 0.000639915466309
+calculated the histogram stacked moments in 0.000905990600586
+calculated the histogram stacked moments in 0.00126791000366
+calculated the samples stacked moments in 0.000211000442505
+calculated the samples stacked moments in 0.000504016876221
+calculated the samples stacked moments in 0.00091814994812
+calculated the samples stacked moments in 0.00122618675232
+calculated the truth stacked moments in 0.000243902206421
+calculated the truth stacked moments in 0.000509023666382
+calculated the truth stacked moments in 0.000752925872803
+calculated the truth stacked moments in 0.00108003616333
+FINISHED ss10 #2 with 100 in 28.9493198395
+FINISHED ss10 #2 in 117.245790005
+FINISHED ss10 in 117.246392965
+FINISHED ss in 128.52078104
+started mg
+read in data file in 22.7191698551
+started mg10
+started mg10 #2
+randos for debugging: [51107 68537 53635 23399  9697 77903 25869 12059 40991 63275]
+preprocessed data in 0.0304780006409
 made the pool of 4 in 6.60419464111e-05
-made the catalog in 10.0427048206
-made the pool of 4 in 7.00950622559e-05
-made the catalog in 11.8057179451
-made the pool of 4 in 4.91142272949e-05
-made the catalog in 10.7334661484
-made the approximate ensembles
-calculating the individual metrics
-starting quantiles
-finished with quantiles
-starting samples
-finished with samples
-starting histogram
-finished with histogram
-calculated the individual metrics in 54.9884839058
-stacking the ensembles
-stacked the ensembles in 68.5702269077
-calculating the metrics
-calculated the metrics in 0.00174808502197
-finished 100Optical10 in 296.665799141
-started 100Optical30\#1
-performing quantization
-finished quantization at 40.2204730511
-performing histogramization
-finished histogramization at 16.366106987
-performing sampling
-finished sampling at 16.4885060787
-making the approximate ensembles
-made the pool of 4 in 5.19752502441e-05
-made the catalog in 8.64090895653
-made the pool of 4 in 5.10215759277e-05
-made the catalog in 8.86281299591
-made the pool of 4 in 4.88758087158e-05
-made the catalog in 8.78555202484
-made the approximate ensembles
-calculating the individual metrics
-starting quantiles
-finished with quantiles
-starting samples
-finished with samples
-starting histogram
-finished with histogram
-calculated the individual metrics in 51.2652020454
-stacking the ensembles
-stacked the ensembles in 71.2589428425
-calculating the metrics
-calculated the metrics in 0.00397396087646
-finished 100Optical30 in 280.936203003
-started 100Optical100\#1
-performing quantization
-finished quantization at 107.062072039
-performing histogramization
-finished histogramization at 16.9325959682
-performing sampling
-finished sampling at 18.053632021
-making the approximate ensembles
+made the catalog in 0.039901971817
+made the initial ensemble of 10 PDFs in 0.0404348373413
+took 300 samples in 1.94261884689
 made the pool of 4 in 5.79357147217e-05
-made the catalog in 10.7041079998
-made the pool of 4 in 5.00679016113e-05
-made the catalog in 10.593298912
-made the pool of 4 in 0.000109910964966
-made the catalog in 10.6380779743
-made the approximate ensembles
-calculating the individual metrics
-starting quantiles
-finished with quantiles
-starting samples
-finished with samples
-starting histogram
-finished with histogram
-calculated the individual metrics in 56.7630190849
-stacking the ensembles
-stacked the ensembles in 79.3335080147
-calculating the metrics
-calculated the metrics in 0.00180912017822
-finished 100Optical100 in 376.264676094
-finished 100Optical in 2660.02056098
-finished Optical in 2683.37149405
+made the catalog in 0.0352969169617
+made a new ensemble from samples in 0.0355360507965
+fit the GMM to samples in 0.147353172302
+made the pool of 4 in 2.40802764893e-05
+made the catalog in 0.674927949905
+made the final ensemble in 0.675142049789
+calculated 4 moments of original PDFs in 3.03729391098
+started mg10 #2 with 3
+finished making in 0.780121803284
+finished histogramization in 0.774260044098
+finished sampling in 0.808349132538
+made the pool of 4 in 5.48362731934e-05
+made the catalog in 0.423326015472
+made quantiles ensemble in 1.21974611282
+made the pool of 4 in 4.79221343994e-05
+made the catalog in 0.456423997879
+made histogram ensemble in 1.30866789818
+made the pool of 4 in 4.88758087158e-05
+made the catalog in 0.464884996414
+made samples ensemble in 1.31574010849
+calculated the quantiles individual moments, kld moments in 4.12014389038
+calculated the samples individual moments, kld moments in 3.89831805229
+calculated the histogram individual moments, kld moments in 4.01975798607
+stacked quantiles in 1.21437597275
+stacked histogram in 0.838474988937
+stacked samples in 1.42362308502
+stacked truth in 1.11074781418
+calculated the quantiles stacked kld in 0.00150585174561
+calculated the histogram stacked kld in 0.00222396850586
+calculated the samples stacked kld in 0.00371885299683
+calculated the quantiles stacked moments in 0.000260829925537
+calculated the quantiles stacked moments in 0.000839948654175
+calculated the quantiles stacked moments in 0.00129890441895
+calculated the quantiles stacked moments in 0.00178384780884
+calculated the histogram stacked moments in 0.000329971313477
+calculated the histogram stacked moments in 0.000756025314331
+calculated the histogram stacked moments in 0.0011899471283
+calculated the histogram stacked moments in 0.00180697441101
+calculated the samples stacked moments in 0.000293970108032
+calculated the samples stacked moments in 0.000711917877197
+calculated the samples stacked moments in 0.00110793113708
+calculated the samples stacked moments in 0.00156378746033
+calculated the truth stacked moments in 0.000337839126587
+calculated the truth stacked moments in 0.000759840011597
+calculated the truth stacked moments in 0.00116181373596
+calculated the truth stacked moments in 0.00162887573242
+FINISHED mg10 #2 with 3 in 23.3168389797
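
Each block of this log approximates every PDF in three lossy formats built from N_f stored numbers: N_f quantiles, an N_f-bin histogram, and N_f samples (the Ensemble.quantize, histogramize, and sample calls in the analysis.ipynb diff below). The following is only a minimal numpy sketch of what each reduction stores for a single gridded PDF, with all inputs hypothetical:

import numpy as np

# hypothetical gridded PDF p(z) on a regular grid, not the paper's data
z_grid = np.linspace(0., 2., 201)
dz = z_grid[1] - z_grid[0]
pdf = np.exp(-0.5 * ((z_grid - 0.7) / 0.1) ** 2)
pdf /= np.trapz(pdf, z_grid)               # normalize to unit integral

N_f = 10                                   # number of stored parameters

# quantiles: invert the CDF at evenly spaced probability levels
cdf = np.cumsum(pdf) * dz
quant_levels = (np.arange(N_f) + 0.5) / N_f
quantiles = np.interp(quant_levels, cdf, z_grid)

# histogram: integrate the PDF over N_f equal-width bins
bin_edges = np.linspace(z_grid[0], z_grid[-1], N_f + 1)
bin_heights = np.array([
    np.trapz(pdf[(z_grid >= lo) & (z_grid <= hi)],
             z_grid[(z_grid >= lo) & (z_grid <= hi)])
    for lo, hi in zip(bin_edges[:-1], bin_edges[1:])])

# samples: draw N_f points from the gridded PDF
samples = np.random.choice(z_grid, size=N_f, p=pdf / pdf.sum())
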
+started mg10 #2 with 10
+finished making in 1.51137495041
+finished histogramization in 1.39197802544
+finished sampling in 1.23923683167
+made the pool of 4 in 6.41345977783e-05
+made the catalog in 0.524284124374
+made quantiles ensemble in 1.66794490814
+made the pool of 4 in 8.51154327393e-05
+made the catalog in 0.745748996735
+made histogram ensemble in 2.26602888107
+made the pool of 4 in 7.39097595215e-05
+made the catalog in 0.647045850754
+made samples ensemble in 1.62145590782
+calculated the quantiles individual moments, kld moments in 6.71728801727
+calculated the samples individual moments, kld moments in 5.40768003464
+calculated the histogram individual moments, kld moments in 6.78759598732
+stacked quantiles in 1.14174818993
+stacked histogram in 0.903980970383
+stacked samples in 0.96507692337
+stacked truth in 0.863555908203
+calculated the quantiles stacked kld in 0.000929117202759
+calculated the histogram stacked kld in 0.000620126724243
+calculated the samples stacked kld in 0.000907897949219
+calculated the quantiles stacked moments in 0.000237941741943
+calculated the quantiles stacked moments in 0.00062894821167
+calculated the quantiles stacked moments in 0.000795841217041
+calculated the quantiles stacked moments in 0.00125288963318
+calculated the histogram stacked moments in 0.000307083129883
+calculated the histogram stacked moments in 0.000663042068481
+calculated the histogram stacked moments in 0.000990867614746
+calculated the histogram stacked moments in 0.001384973526
+calculated the samples stacked moments in 0.000274181365967
+calculated the samples stacked moments in 0.000648021697998
+calculated the samples stacked moments in 0.000982999801636
+calculated the samples stacked moments in 0.00139117240906
+calculated the truth stacked moments in 0.000205993652344
+calculated the truth stacked moments in 0.00053596496582
+calculated the truth stacked moments in 0.000850915908813
+calculated the truth stacked moments in 0.00121688842773
+FINISHED mg10 #2 with 10 in 32.9919991493
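
The "stacked kld" entries compare each approximated n(z) estimator against the one stacked from the original mixture-model truth. In the notebook this is qp.utils.calculate_kl_divergence; the following is only a numpy approximation of the Kullback-Leibler divergence between two PDFs tabulated on a shared grid, with hypothetical inputs:

import numpy as np

def grid_kld(p, q, dz, eps=1e-12):
    # approximate KLD(p || q) in nats for PDFs tabulated on the same regular grid
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    p = p / (p.sum() * dz)                 # normalize both to unit integral
    q = q / (q.sum() * dz)
    p = np.clip(p, eps, None)              # avoid log(0) and 0/0
    q = np.clip(q, eps, None)
    return np.sum(p * np.log(p / q)) * dz

# hypothetical usage on two toy distributions
z = np.linspace(0., 2., 201)
dz = z[1] - z[0]
truth = np.exp(-0.5 * ((z - 0.7) / 0.1) ** 2)
approx = np.exp(-0.5 * ((z - 0.72) / 0.12) ** 2)
print(grid_kld(truth, approx, dz))
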
+started mg10 #2 with 30
+finished making in 1.60738801956
+finished histogramization in 0.867848157883
+finished sampling in 1.39367294312
+made the pool of 4 in 0.000163078308105
+made the catalog in 0.82582116127
+made quantiles ensemble in 2.11856389046
+made the pool of 4 in 5.72204589844e-05
+made the catalog in 0.520595788956
+made histogram ensemble in 1.46121788025
+made the pool of 4 in 0.000101804733276
+made the catalog in 0.525902032852
+made samples ensemble in 1.46025896072
+calculated the quantiles individual moments, kld moments in 4.10835695267
+calculated the samples individual moments, kld moments in 4.14074587822
+calculated the histogram individual moments, kld moments in 3.88139796257
+stacked quantiles in 0.777491092682
+stacked histogram in 0.864093065262
+stacked samples in 0.810467004776
+stacked truth in 0.804461956024
+calculated the quantiles stacked kld in 0.000840187072754
+calculated the histogram stacked kld in 0.000622987747192
+calculated the samples stacked kld in 0.000609874725342
+calculated the quantiles stacked moments in 0.000240087509155
+calculated the quantiles stacked moments in 0.000703096389771
+calculated the quantiles stacked moments in 0.00105404853821
+calculated the quantiles stacked moments in 0.00142908096313
+calculated the histogram stacked moments in 0.000262022018433
+calculated the histogram stacked moments in 0.000568151473999
+calculated the histogram stacked moments in 0.000870227813721
+calculated the histogram stacked moments in 0.00122117996216
+calculated the samples stacked moments in 0.000231027603149
+calculated the samples stacked moments in 0.000545978546143
+calculated the samples stacked moments in 0.000859975814819
+calculated the samples stacked moments in 0.00120496749878
+calculated the truth stacked moments in 0.000247001647949
+calculated the truth stacked moments in 0.000550031661987
+calculated the truth stacked moments in 0.000839948654175
+calculated the truth stacked moments in 0.00118112564087
+FINISHED mg10 #2 with 30 in 24.9179830551
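
The "stacked moments" lines record the first few moments of each stacked estimator; the four entries per format correspond to orders n = 0 through 3. The notebook uses qp.utils.calculate_moment / Ensemble.moment for this, but the underlying quantity is just E[z^n] approximated as a weighted sum over the grid. A minimal sketch with a hypothetical gridded PDF:

import numpy as np

def grid_moment(z, p, n):
    # approximate the n-th moment E[z^n] of a PDF tabulated on the regular grid z
    dz = z[1] - z[0]
    p = np.asarray(p, dtype=float)
    p = p / (p.sum() * dz)                 # normalize to unit integral
    return np.sum(z ** n * p) * dz

z = np.linspace(0., 2., 201)
p = np.exp(-0.5 * ((z - 0.7) / 0.1) ** 2)
moments = [grid_moment(z, p, n) for n in range(4)]
print(moments)                             # the zeroth moment should be ~1 by construction
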
+started mg10 #2 with 100
+finished making in 4.54299402237
+finished histogramization in 0.877863883972
+finished sampling in 0.789620876312
+made the pool of 4 in 5.48362731934e-05
+made the catalog in 0.45265007019
+made quantiles ensemble in 1.30355715752
+made the pool of 4 in 4.91142272949e-05
+made the catalog in 0.463873147964
+made histogram ensemble in 1.32496881485
+made the pool of 4 in 5.31673431396e-05
+made the catalog in 0.430168867111
+made samples ensemble in 1.3029999733
+calculated the quantiles individual moments, kld moments in 4.18964004517
+calculated the samples individual moments, kld moments in 3.98618006706
+calculated the histogram individual moments, kld moments in 4.04275989532
+stacked quantiles in 0.791258096695
+stacked histogram in 0.834949970245
+stacked samples in 0.800794839859
+stacked truth in 0.832922935486
+calculated the quantiles stacked kld in 0.00085711479187
+calculated the histogram stacked kld in 0.000594854354858
+calculated the samples stacked kld in 0.000599145889282
+calculated the quantiles stacked moments in 0.000240802764893
+calculated the quantiles stacked moments in 0.000664949417114
+calculated the quantiles stacked moments in 0.000977993011475
+calculated the quantiles stacked moments in 0.00132584571838
+calculated the histogram stacked moments in 0.00029993057251
+calculated the histogram stacked moments in 0.000585079193115
+calculated the histogram stacked moments in 0.000874042510986
+calculated the histogram stacked moments in 0.00120496749878
+calculated the samples stacked moments in 0.000239849090576
+calculated the samples stacked moments in 0.000517845153809
+calculated the samples stacked moments in 0.000797033309937
+calculated the samples stacked moments in 0.00112199783325
+calculated the truth stacked moments in 0.000216007232666
+calculated the truth stacked moments in 0.000493049621582
+calculated the truth stacked moments in 0.000772953033447
+calculated the truth stacked moments in 0.0010929107666
+FINISHED mg10 #2 with 100 in 26.3846879005
+FINISHED mg10 #2 in 114.395443916
+FINISHED mg10 in 114.39635396
+FINISHED mg in 137.116107941
+FINISHED everything in 265.638630867
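
Every line of this log follows the same pattern: record timeit.default_timer() before a step and print the elapsed time after it, as the updated notebook cells in the analysis.ipynb diff below show. A minimal sketch of that pattern, with the step itself as a placeholder:

import timeit

start = timeit.default_timer()
# ... a pipeline step would run here, e.g. building an ensemble or a worker pool ...
elapsed = timeit.default_timer() - start
print('made the catalog in ' + str(elapsed))
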
 

@@ -13159,36 +13630,75 @@

-
In [23]:
+
In [26]:
+
+
+
floats = [3, 10, 30, 100]
+sizes = [10]#[10, 100, 1000]
+names = dataset_info.keys()
+instantiations = range(2, 3)#0)
+
+all_randos = [[np.random.choice(size, n_plot, replace=False) for size in sizes] for name in names]
+
+
+
+
+
+
+

+
+
+
In [27]:
-
for name in names:
+
# comment out for NERSC
+# run twice to match axis limits
+
+for name in names:
     for size in sizes:
-        path = os.path.join(name, str(size))
         for i in instantiations:
-            
-            plot = plot_examples(size, name, bonus='_original_('+str(i)+')')
-        
-            plot = plot_examples(size, name, bonus='_post-fit_('+str(i)+')')
-            
-            for n_floats_use in floats:
-            
+            top_bonusdict = {}
+            bo = '_original'+str(i)
+#             plot = plot_examples(size, name, bonus=bo)
+            top_bonusdict[bo] = ['-', 0.25]
+            bp = '_postfit'+str(i)
+#             plot = plot_examples(size, name, bonus=bp)
+            top_bonusdict[bp] = ['-', 0.5]
+            for n in range(len(floats)):
+                bonusdict = top_bonusdict.copy()
+                n_floats_use = floats[n]
                 for f in formats:
-                    fname = '_'+str(n_floats_use)+f+'_('+str(i)+')'
-                    plot = plot_examples(size, name, bonus=fname)
-                plot = plot_individual(size, name, n_floats_use, i)
-            
+                    fname = str(n_floats_use)+f+str(i)
+#                     plot = plot_examples(size, name, bonus=fname)
+                    bonusdict[fname] = [styles[f], 0.5]
+                plot = plot_all_examples(name, size, n_floats_use, i, bonus=bonusdict)
+                plot = plot_individual_kld(size, name, n_floats_use, i)
                 plot = plot_estimators(size, name, n_floats_use, i)
-            
         plot = plot_pz_metrics(name, size)
-        
-        plot = plot_nz_metrics(name, size)
+        plot = plot_pz_delta_moments(name, size)
+        plot = plot_nz_klds(name, size)
+        plot = plot_nz_moments(name, size)
 
+
+
+
+
+
+
+
/home/aimalz/.local/lib/python2.7/site-packages/matplotlib/axes/_axes.py:6198: RuntimeWarning: invalid value encountered in true_divide
+  m = (m.astype(float) / db) / m.sum()
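
This RuntimeWarning is raised inside matplotlib's histogram normalization when the binned counts sum to zero (a 0/0 in the m / m.sum() step shown), which can happen if the array handed to plt.hist is empty or falls entirely outside the plotting bins. A hedged guard, with logdata as a hypothetical stand-in for the logged KLD values:

import numpy as np

# logdata stands in for the safelog of per-PDF KLDs plotted above; empty here on purpose
logdata = np.array([])
plot_bins = np.linspace(-10., 5., 30)

counts, edges = np.histogram(logdata, bins=plot_bins)
if counts.sum() > 0:
    # this is what normed=True computes internally
    density = counts.astype(float) / np.diff(edges) / counts.sum()
else:
    density = np.zeros(len(counts), dtype=float)  # skip the 0/0 that triggers the warning
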
+
+
+
+
+
+
+
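
The driver loop above, and the plotting helpers in the analysis.ipynb diff that follows, assume module-level configuration set earlier in the notebook (the approximation formats, per-format colors and line styles, moment labels and marker shapes, the number of example curves n_plot, a color_cycle, dataset_info, and running accumulators for shared axis limits). A sketch with purely hypothetical values, only to make the loop readable on its own:

# All concrete values here are illustrative assumptions, not the notebook's settings.
formats = ['quantiles', 'histogram', 'samples']
formats_plus = formats + ['truth']
colors = {'quantiles': 'b', 'histogram': 'r', 'samples': 'g'}
styles = {'quantiles': '--', 'histogram': ':', 'samples': '-.'}
stepstyles = dict(styles)
moment_shapes = ['o', '*', '+', 'x']
moment_names = ['moment ' + str(n) for n in range(4)]
n_moments_use = 4
n_plot = 5
color_cycle = ['b', 'g', 'r', 'c', 'm']
dataset_info = {'mg': {'name': 'example dataset'}}  # stub; the real dict is built from the catalogs
names = dataset_info.keys()
# running accumulators used by the plotting helpers for shared axis limits
nz_max, hist_max, dist_min, dist_max = [], [], [], []
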
diff --git a/docs/desc-0000-qp-photo-z_approximation/research/analysis.ipynb b/docs/desc-0000-qp-photo-z_approximation/research/analysis.ipynb index fd712ac9..b48c64e7 100644 --- a/docs/desc-0000-qp-photo-z_approximation/research/analysis.ipynb +++ b/docs/desc-0000-qp-photo-z_approximation/research/analysis.ipynb @@ -2,7 +2,9 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "collapsed": true + }, "source": [ "# The Analysis Pipeline\n", "\n", @@ -36,27 +38,18 @@ "source": [ "from __future__ import print_function\n", " \n", + "import pickle\n", "import hickle\n", "import numpy as np\n", - "from pathos.multiprocessing import ProcessingPool as Pool\n", "import random\n", "import cProfile\n", "import pstats\n", "import StringIO\n", - "import timeit\n", - "import psutil\n", "import sys\n", "import os\n", "import timeit\n", - "\n", - "import pandas as pd\n", - "pd.set_option('display.max_columns', None)\n", - "\n", - "import matplotlib\n", - "import matplotlib.pyplot as plt\n", - "\n", - "#comment out for NERSC\n", - "%matplotlib inline\n", + "import bisect\n", + "import re\n", "\n", "import qp\n", "from qp.utils import calculate_kl_divergence as make_kld\n", @@ -65,6 +58,30 @@ "# random.seed(a=42)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "mpl.rcParams['text.usetex'] = True\n", + "mpl.rcParams['mathtext.rm'] = 'serif'\n", + "mpl.rcParams['font.family'] = 'serif'\n", + "mpl.rcParams['font.serif'] = 'Times New Roman'\n", + "mpl.rcParams['axes.titlesize'] = 16\n", + "mpl.rcParams['axes.labelsize'] = 14\n", + "mpl.rcParams['savefig.dpi'] = 250\n", + "mpl.rcParams['savefig.format'] = 'pdf'\n", + "mpl.rcParams['savefig.bbox'] = 'tight'\n", + "\n", + "#comment out for NERSC\n", + "%matplotlib inline" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -82,42 +99,126 @@ }, "outputs": [], "source": [ - "def setup_dataset(dataset_key):#, n_gals_use):\n", - " \n", + "def setup_dataset(dataset_key, skip_rows, skip_cols):\n", + " start = timeit.default_timer()\n", " with open(dataset_info[dataset_key]['filename'], 'rb') as data_file:\n", " lines = (line.split(None) for line in data_file)\n", - " lines.next()\n", - " pdfs = np.array([[float(line[k]) for k in range(1,len(line))] for line in lines])\n", - " \n", + " for r in range(skip_rows):\n", + " lines.next()\n", + " pdfs = np.array([[float(line[k]) for k in range(skip_cols, len(line))] for line in lines])\n", + " print('read in data file in '+str(timeit.default_timer()-start))\n", " return(pdfs)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(Sometimes it's nice to specify the indices of problematic catalog entries for debugging purposes.)" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false + }, + "outputs": [], + "source": [ + "# indices = '17973 55916 27263 80958 92 105911 8785 33319 65920 40503'\n", + "# indices += ' 10352 80751 34891 13496 25851 132 11069 100722 27470 68111'\n", + "# indices += ' 106503 8712 22089 90708 23051 9397 65576 24531 47199 91570'\n", + "# indices += ' 32487 89396 99692 37710 61251 11427 71400 78880 2908 45962'\n", + "# indices += ' 928 94445 71626 107949 87846 15247 73184 22959 19501 74407'\n", + "# indices += ' 87224 46537 108482 49046 79293 96142 41955 3846 11222 98502'\n", + "# indices += ' 1435 62134 7695 23080 46586 79245 107572 3463 49494 
89434'\n", + "# indices += ' 87071 5758 67784 16225 42432 17900 64149 52005 30800 107278'\n", + "# indices += ' 104994 101667 72678 70245 101053 57744 96752 79002 54929 94621'\n", + "# indices += ' 42103 45776 107058 25019 70930 6414 68584 105571 13166 1207'\n", + "# indices = '27150 11288 54516 101230 61485 19623 50259 48744 6427 62300'\n", + "# indices = '92250 3847 83378 12742 43667 10569 31701 26828 29136 11683'\n", + "# indices += ' 43998 96531 34802 14008 5083 94955 106754 86870 23547 93601'\n", + "# indices += ' 5869 5157 100074 316 96728 75727 2662 41331 41474 93074'\n", + "# indices += ' 784 105537 39558 108553 46954 41754 47130 54528 34920 58321'\n", + "# indices += ' 70453 108822 98370 74756 25879 80431 61434 65169 46466 6126'\n", + "# indices += ' 6466 101890 108524 96272 25660 81478 92854 24288 88348 7223'\n", + "# indices += ' 58928 49020 2141 25304 75384 34641 65491 45164 44332 107756'\n", + "# indices += ' 91896 75871 87481 24340 7056 80483 49792 20459 70865 109372'\n", + "# indices += ' 34026 53985 60089 4565 38033 5947 51576 3856 24570 3438'\n", + "# indices += ' 22431 60534 81397 16680 88137 14027 86049 21710 96081 13413'\n", + "# indices = '9604 43445 88556 50193 1408 76204 104276 48054 104136 58073'\n", + "# indices += ' 10084 32784 101990 59630 78907 27352 13652 56942 27011 101717'\n", + "# indices += ' 105840 73315 41895 21820 105664 18054 94791 29329 99846 56379'\n", + "# indices += ' 13504 45749 32028 45607 56649 2589 24215 9117 97779 27706'\n", + "# indices += ' 75812 14868 59759 41794 87621 99253 83269 23886 83001 67509'\n", + "# indices += ' 37047 28435 72226 64501 57296 26271 13468 50067 26576 5017'\n", + "# indices += ' 827 22780 65501 78088 75632 28483 108573 6032 60818 26916'\n", + "# indices += ' 99955 8065 89647 4756 91047 73095 12845 10803 52331 62513'\n", + "# indices += ' 6845 26550 94541 3467 37175 101384 96101 109303 110300 53161'\n", + "# indices += ' 41110 31736 70330 11116 58618 52321 68545 87421 61994 48439'\n", + "# # indices = [ 14619, 66891, 67914, 20931, 97633, 7202, 46756, 109704, 93110, 59915]\n", + "# # 35851, 65657, 3292, 3838, 10862, 50447, 5316, 49503, 39363, 110951\n", + "# # 12543, 52661, 46216, 53296, 95524, 84574 , 2607 ,56017 , 64794, 7600\n", + "# # 94746 59924 73186 21069 2579 34780 4623 93464 44621 29828\n", + "# # 111140 74609 34411 42554 32981 34904 10264 1667 42037 23986\n", + "# # 51790 98555 94971 58683 99752 87479 67286 89575 36950 84283\n", + "# # 89866 64959 53221 102714 48642 37379 95257 11874 70743 15107\n", + "# # 93651 48304 93829 64956 94703 107021 88900 7849 88808 71397\n", + "# # 26862 74765 89470 2741 56888 94275 40017 85989 94077 66553\n", + "# # 74666 90417 12553 21928 14720 53798 30290 109516 37033 95242]\n", + "# # indices = [ 59935, 44820, 26407, 84617, 98728, 35216, 73968, 105130, 844, 63892]\n", + "# indices = map(int, indices.split())\n", + "# all_randos = [[indices for size in sizes] for name in names]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false }, "outputs": [], "source": [ "def make_instantiation(dataset_key, n_gals_use, pdfs, bonus=None):\n", " \n", + " start = timeit.default_timer()\n", + " \n", " n_gals_tot = len(pdfs)\n", " full_gal_range = range(n_gals_tot)\n", " subset = np.random.choice(full_gal_range, n_gals_use, replace=False)#range(n_gals_use)\n", + "# subset = indices\n", + " print('randos for debugging: '+str(subset))\n", " pdfs_use = pdfs[subset]\n", - "\n", + " \n", + " modality = []\n", + " dpdfs = pdfs_use[:,1:] - 
pdfs_use[:,:-1]\n", + " iqrs = []\n", + " for i in range(n_gals_use):\n", + " modality.append(len(np.where(np.diff(np.signbit(dpdfs[i])))[0]))\n", + " cdf = np.cumsum(qp.utils.normalize_integral((dataset_info[dataset_key]['z_grid'], pdfs_use[i]), vb=False)[1])\n", + " iqr_lo = dataset_info[dataset_key]['z_grid'][bisect.bisect_left(cdf, 0.25)]\n", + " iqr_hi = dataset_info[dataset_key]['z_grid'][bisect.bisect_left(cdf, 0.75)]\n", + " iqrs.append(iqr_hi - iqr_lo)\n", + " modality = np.array(modality)\n", + " \n", + " dataset_info[dataset_key]['N_GMM'] = int(np.median(modality))+1\n", + "# print('n_gmm for '+dataset_info[dataset_key]['name']+' = '+str(dataset_info[dataset_key]['N_GMM']))\n", + " \n", " # using the same grid for output as the native format, but doesn't need to be so\n", " dataset_info[dataset_key]['in_z_grid'] = dataset_info[dataset_key]['z_grid']\n", " dataset_info[dataset_key]['metric_z_grid'] = dataset_info[dataset_key]['z_grid']\n", " \n", + " print('preprocessed data in '+str(timeit.default_timer()-start))\n", + " \n", " path = os.path.join(dataset_key, str(n_gals_use))\n", - " loc = os.path.join(path, 'pzs'+bonus+str(n_gals_use)+dataset_key)\n", + " loc = os.path.join(path, 'pzs'+str(n_gals_use)+dataset_key+bonus)\n", " with open(loc+'.hkl', 'w') as filename:\n", " info = {}\n", " info['randos'] = randos\n", " info['z_grid'] = dataset_info[dataset_key]['in_z_grid']\n", " info['pdfs'] = pdfs_use\n", + " info['modes'] = modality\n", + " info['iqrs'] = iqrs\n", " hickle.dump(info, filename)\n", " \n", " return(pdfs_use)" @@ -127,14 +228,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "def plot_examples(n_gals_use, dataset_key, bonus=None):\n", + "def plot_examples(n_gals_use, dataset_key, bonus=None, norm=False):\n", " \n", " path = os.path.join(dataset_key, str(n_gals_use))\n", - " loc = os.path.join(path, 'pzs'+bonus+str(n_gals_use)+dataset_key)\n", + " loc = os.path.join(path, 'pzs'+str(n_gals_use)+dataset_key+bonus)\n", " with open(loc+'.hkl', 'r') as filename:\n", " info = hickle.load(filename)\n", " randos = info['randos']\n", @@ -146,16 +247,85 @@ " data = (z_grid, pdfs[randos[i]])\n", " data = qp.utils.normalize_integral(qp.utils.normalize_gridded(data))\n", " pz_max.append(np.max(data))\n", - " plt.plot(data[0], data[1], label=dataset_info[dataset_key]['name']+' #'+str(randos[i]))\n", + " plt.plot(data[0], data[1], label=dataset_info[dataset_key]['name']+' \\#'+str(randos[i]), color=color_cycle[i])\n", " plt.xlabel(r'$z$', fontsize=14)\n", " plt.ylabel(r'$p(z)$', fontsize=14)\n", " plt.xlim(min(z_grid), max(z_grid))\n", - " plt.ylim(0., max(pz_max))\n", - " plt.title(dataset_info[dataset_key]['name']+' data', fontsize=16)\n", - " plt.legend()\n", + " plt.title(dataset_info[dataset_key]['name']+' data examples', fontsize=16)\n", + " if norm:\n", + " plt.ylim(0., max(pz_max))\n", + " plt.savefig(loc+'norm.pdf', dpi=250)\n", + " else:\n", + " plt.savefig(loc+'.pdf', dpi=250)\n", + " plt.close()\n", " \n", - " plt.savefig(loc+'.png', dpi=250)\n", - " plt.close()" + " if 'modes' in info.keys():\n", + " modes = info['modes']\n", + " modes_max.append(np.max(modes))\n", + " plt.figure()\n", + " ax = plt.hist(modes, color='k', alpha=1./n_plot, histtype='stepfilled', bins=range(max(modes_max)+1))\n", + " plt.xlabel('modes')\n", + " plt.ylabel('frequency')\n", + " plt.title(dataset_info[dataset_key]['name']+' data modality distribution 
(median='+str(dataset_info[dataset_key]['N_GMM'])+')', fontsize=16)\n", + " plt.savefig(loc+'modality.pdf', dpi=250)\n", + " plt.close()\n", + " \n", + " if 'iqrs' in info.keys():\n", + " iqrs = info['iqrs']\n", + " iqr_min.append(min(iqrs))\n", + " iqr_max.append(max(iqrs))\n", + " plot_bins = np.linspace(min(iqr_min), max(iqr_max), 20)\n", + " plt.figure()\n", + " ax = plt.hist(iqrs, bins=plot_bins, color='k', alpha=1./n_plot, histtype='stepfilled')\n", + " plt.xlabel('IQR')\n", + " plt.ylabel('frequency')\n", + " plt.title(dataset_info[dataset_key]['name']+' data IQR distribution', fontsize=16)\n", + " plt.savefig(loc+'iqrs.pdf', dpi=250)\n", + " plt.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We're going to incrementally save the quantities that are costly to calculate." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def save_one_stat(dataset_name, n_gals_use, N_f, i, stat, stat_name):\n", + " path = os.path.join(dataset_name, str(n_gals_use))\n", + " loc = os.path.join(path, stat_name+str(n_gals_use)+dataset_name+str(N_f)+'_'+str(i))\n", + " with open(loc+'.hkl', 'w') as filename:\n", + " hickle.dump(stat, filename)\n", + " \n", + "def load_one_stat(dataset_name, n_gals_use, N_f, i, stat_name):\n", + " path = os.path.join(dataset_name, str(n_gals_use))\n", + " loc = os.path.join(path, stat_name+str(n_gals_use)+dataset_name+str(N_f)+'_'+str(i))\n", + " with open(loc+'.hkl', 'r') as filename:\n", + " stat = hickle.load(filename)\n", + "# print(stat)\n", + " return stat\n", + "\n", + "def save_moments_wrapper(dataset_name, n_gals_use, N_f, i, stat_name):\n", + " stat = load_one_stat(dataset_name, n_gals_use, N_f, i, stat_name)\n", + " save_moments(dataset_name, n_gals_use, N_f, stat, stat_name)\n", + " \n", + "def save_metrics_wrapper(dataset_name, n_gals_use, N_f, i, stat_name):\n", + " stat = load_one_stat(dataset_name, n_gals_use, N_f, i, stat_name)\n", + " save_nz_metrics(dataset_name, n_gals_use, N_f, stat, stat_name)\n", + " \n", + "def clear_stats(dataset_name, n_gals_use, stat_name):\n", + " path = os.path.join(dataset_name, str(n_gals_use))\n", + " loc = os.path.join(path, stat_name+str(n_gals_use)+dataset_name+'.hkl')\n", + " if os.path.isfile(loc):\n", + " os.remove(loc)" ] }, { @@ -179,38 +349,59 @@ " zlim = (min(z_grid), max(z_grid))\n", " N_pdfs = len(in_pdfs)\n", " \n", - "# plot_examples(N_pdfs, z_grid, pdfs)\n", - " \n", - " print('making the initial ensemble of '+str(N_pdfs)+' PDFs')\n", - " E0 = qp.Ensemble(N_pdfs, gridded=(z_grid, in_pdfs), limits=dataset_info[dataset_key]['z_lim'], vb=True)\n", - " print('made the initial ensemble of '+str(N_pdfs)+' PDFs')\n", + " start = timeit.default_timer()\n", + "# print('making the initial ensemble of '+str(N_pdfs)+' PDFs')\n", + " E0 = qp.Ensemble(N_pdfs, gridded=(z_grid, in_pdfs), limits=dataset_info[dataset_key]['z_lim'], vb=False)\n", + " print('made the initial ensemble of '+str(N_pdfs)+' PDFs in '+str(timeit.default_timer() - start)) \n", " \n", " #fit GMMs to gridded pdfs based on samples (faster than fitting to gridded)\n", - " print('sampling for the GMM fit')\n", + " start = timeit.default_timer()\n", + "# print('sampling for the GMM fit')\n", " samparr = E0.sample(high_res, vb=False)\n", - " print('took '+str(high_res)+' samples')\n", + " print('took '+str(high_res)+' samples in '+str(timeit.default_timer() - start))\n", " \n", - " print('making a new ensemble from samples')\n", + " 
start = timeit.default_timer()\n", + "# print('making a new ensemble from samples')\n", " Ei = qp.Ensemble(N_pdfs, samples=samparr, limits=dataset_info[dataset_key]['z_lim'], vb=False)\n", - " print('made a new ensemble from samples')\n", + " print('made a new ensemble from samples in '+str(timeit.default_timer() - start))\n", " \n", - " print('fitting the GMM to samples')\n", + " start = timeit.default_timer()\n", + "# print('fitting the GMM to samples')\n", " GMMs = Ei.mix_mod_fit(comps=N_comps, vb=False)\n", - " print('fit the GMM to samples')\n", + " print('fit the GMM to samples in '+str(timeit.default_timer() - start))\n", " \n", " #set the GMMS as the truth\n", - " print('making the final ensemble')\n", + " start = timeit.default_timer()\n", + "# print('making the final ensemble')\n", " Ef = qp.Ensemble(N_pdfs, truth=GMMs, limits=dataset_info[dataset_key]['z_lim'], vb=False)\n", - " print('made the final ensemble')\n", + " print('made the final ensemble in '+str(timeit.default_timer() - start))\n", " \n", " path = os.path.join(dataset_key, str(N_pdfs))\n", - " loc = os.path.join(path, 'pzs'+bonus+str(N_pdfs)+dataset_key)\n", + " loc = os.path.join(path, 'pzs'+str(n_gals_use)+dataset_key+bonus)\n", " with open(loc+'.hkl', 'w') as filename:\n", " info = {}\n", " info['randos'] = randos\n", " info['z_grid'] = z_grid\n", " info['pdfs'] = Ef.evaluate(z_grid, using='truth', norm=True, vb=False)[1]\n", " hickle.dump(info, filename)\n", + " \n", + " start = timeit.default_timer()\n", + "# print('calculating '+str(n_moments_use)+' moments of original PDFs')\n", + " in_moments, vals = [], []\n", + " for n in range(n_moments_use):\n", + " in_moments.append(Ef.moment(n, using='truth', limits=zlim, \n", + " dx=delta_z, vb=False))\n", + " vals.append(n)\n", + " moments = np.array(in_moments)\n", + " print('calculated '+str(n_moments_use)+' moments of original PDFs in '+str(timeit.default_timer() - start))\n", + " \n", + " path = os.path.join(dataset_key, str(N_pdfs))\n", + " loc = os.path.join(path, 'pz_moments'+str(n_gals_use)+dataset_key+bonus)\n", + " with open(loc+'.hkl', 'w') as filename:\n", + " info = {}\n", + " info['truth'] = moments\n", + " info['orders'] = vals\n", + " hickle.dump(info, filename)\n", " \n", " return(Ef)" ] @@ -226,14 +417,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "def analyze_individual(E, z_grid, N_floats, dataset_key, N_moments=4, i=None):\n", + "def analyze_individual(E, z_grid, N_floats, dataset_key, N_moments=4, i=None, bonus=None):\n", " zlim = (min(z_grid), max(z_grid))\n", " z_range = zlim[-1] - zlim[0]\n", " delta_z = z_range / len(z_grid)\n", + " path = os.path.join(dataset_key, str(n_gals_use))\n", " \n", " Eq, Eh, Es = E, E, E\n", " inits = {}\n", @@ -243,68 +435,156 @@ " inits[f][ff] = None\n", " \n", " qstart = timeit.default_timer()\n", - " print('performing quantization')\n", " inits['quantiles']['quantiles'] = Eq.quantize(N=N_floats, vb=True)\n", - " print('finished quantization at '+str(timeit.default_timer() - qstart))\n", + " print('finished quantization in '+str(timeit.default_timer() - qstart))\n", " hstart = timeit.default_timer()\n", - " print('performing histogramization')\n", " inits['histogram']['histogram'] = Eh.histogramize(N=N_floats, binrange=zlim, vb=False)\n", - " print('finished histogramization at '+str(timeit.default_timer() - hstart))\n", + " print('finished histogramization in '+str(timeit.default_timer() - hstart))\n", " sstart = 
timeit.default_timer()\n", - " print('performing sampling')\n", " inits['samples']['samples'] = Es.sample(samps=N_floats, vb=False)\n", - " print('finished sampling at '+str(timeit.default_timer() - sstart))\n", + " print('finished sampling in '+str(timeit.default_timer() - sstart))\n", " \n", - " print('making the approximate ensembles')\n", " Eo = {}\n", + " \n", + " metric_start = timeit.default_timer()\n", + " inloc = os.path.join(path, 'pz_moments'+str(n_gals_use)+dataset_key+bonus)\n", + " with open(inloc+'.hkl', 'r') as infilename:\n", + " pz_moments = hickle.load(infilename)\n", + " \n", + " klds, metrics, kld_moments, pz_moment_deltas = {}, {}, {}, {}\n", + " \n", " for f in formats:\n", + " fstart = timeit.default_timer()\n", " Eo[f] = qp.Ensemble(E.n_pdfs, truth=E.truth, \n", " quantiles=inits[f]['quantiles'], \n", " histogram=inits[f]['histogram'],\n", " samples=inits[f]['samples'], \n", " limits=dataset_info[dataset_key]['z_lim'])\n", - " bonus = '_'+str(n_floats_use)+f+'_('+str(i)+')'\n", - " path = os.path.join(dataset_key, str(n_gals_use))\n", - " loc = os.path.join(path, 'pzs'+bonus+str(n_gals_use)+dataset_key)\n", + " \n", + " fbonus = str(N_floats)+f+str(i)\n", + " loc = os.path.join(path, 'pzs'+str(n_gals_use)+dataset_key+fbonus)\n", " with open(loc+'.hkl', 'w') as filename:\n", " info = {}\n", " info['randos'] = randos\n", " info['z_grid'] = z_grid\n", " info['pdfs'] = Eo[f].evaluate(z_grid, using=f, norm=True, vb=False)[1]\n", " hickle.dump(info, filename)\n", - " print('made the approximate ensembles')\n", - " \n", - " print('calculating the individual metrics')\n", - " metric_start = timeit.default_timer()\n", - " klds, metrics, moments = {}, {}, {}\n", - " \n", - " for key in Eo.keys():\n", - " print('starting '+key)\n", - " klds[key] = Eo[key].kld(using=key, limits=zlim, dx=delta_z)\n", + " print('made '+f+' ensemble in '+str(timeit.default_timer()-fstart))\n", + "\n", + " key = f\n", + " \n", + " fstart = timeit.default_timer()\n", + " klds[key] = Eo[key].kld(using=key, limits=zlim, dx=delta_z, vb=False)\n", + " print('calculated the '+key+' individual klds in '+str(timeit.default_timer() - fstart))\n", + " \n", + " fstart = timeit.default_timer()\n", + " kld_moments[key] = []\n", " samp_metric = qp.PDF(samples=klds[key])\n", " gmm_metric = samp_metric.mix_mod_fit(n_components=dataset_info[dataset_key]['N_GMM'], \n", " using='samples', vb=False)\n", " metrics[key] = qp.PDF(truth=gmm_metric)\n", - " moments[key] = []\n", - " for n in range(N_moments+1):\n", - " moments[key].append([qp.utils.calculate_moment(metrics[key], n,\n", + " for n in range(N_moments):\n", + " kld_moments[key].append(qp.utils.calculate_moment(metrics[key], n,\n", " using='truth', \n", " limits=zlim, \n", " dx=delta_z, \n", - " vb=False)])\n", - " print('finished with '+key)\n", - " print('calculated the individual metrics in '+str(timeit.default_timer() - metric_start))\n", - "\n", - " path = os.path.join(dataset_key, str(E.n_pdfs))\n", - " loc = os.path.join(path, str(N_floats)+'kld_hist'+str(n_gals_use)+dataset_key+str(i))\n", + " vb=False))\n", + " save_one_stat(name, size, n_floats_use, i, kld_moments, 'pz_kld_moments')\n", + " print('calculated the '+key+' kld moments in '+str(timeit.default_timer() - fstart))\n", + " \n", + " pz_moment_deltas[key], pz_moments[key] = [], []\n", + " for n in range(N_moments):\n", + " start = timeit.default_timer()\n", + " new_moment = Eo[key].moment(n, using=key, limits=zlim, \n", + " dx=delta_z, vb=False)\n", + " pz_moments[key].append(new_moment)\n", + 
" #NOTE: delta_moment is crazy for clean data!\n", + " delta_moment = (new_moment - pz_moments['truth'][n]) / pz_moments['truth'][n]\n", + " pz_moment_deltas[key].append(delta_moment)\n", + " print('calculated the '+key+' individual moment '+str(n)+' in '+str(timeit.default_timer() - start))\n", + " save_one_stat(name, size, n_floats_use, i, pz_moments, 'pz_moments')\n", + " save_one_stat(name, size, n_floats_use, i, pz_moment_deltas, 'pz_moment_deltas')\n", + " \n", + " loc = os.path.join(path, 'kld_hist'+str(n_gals_use)+dataset_key+str(N_floats)+'_'+str(i))\n", " with open(loc+'.hkl', 'w') as filename:\n", " info = {}\n", " info['z_grid'] = z_grid\n", " info['N_floats'] = N_floats\n", " info['pz_klds'] = klds\n", " hickle.dump(info, filename)\n", + "\n", + " outloc = os.path.join(path, 'pz_moments'+str(n_gals_use)+dataset_key+str(N_floats)+'_'+str(i))\n", + " with open(outloc+'.hkl', 'w') as outfilename:\n", + " hickle.dump(pz_moments, outfilename)\n", + " \n", + "# save_moments(name, size, n_floats_use, kld_moments, 'pz_kld_moments')\n", + "# save_moments(name, size, n_floats_use, pz_moments, 'pz_moments')\n", + "# save_moments(name, size, n_floats_use, pz_moment_deltas, 'pz_moment_deltas')\n", " \n", - " return(Eo, klds, moments)" + " return(Eo)#, klds, kld_moments, pz_moments, pz_moment_deltas)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def plot_all_examples(name, size, N_floats, init, bonus={}):\n", + " path = os.path.join(name, str(size))\n", + " fig, ax = plt.subplots()\n", + "# fig_check, ax_check = plt.subplots()\n", + " lines = []\n", + " loc = os.path.join(path, 'pzs'+str(size)+name+'_postfit'+str(init))\n", + " with open(loc+'.hkl', 'r') as filename:\n", + " info = hickle.load(filename)\n", + " ref_pdfs = info['pdfs'] \n", + "# klds = {}\n", + " for bonus_key in bonus.keys():\n", + " loc = os.path.join(path, 'pzs'+str(size)+name+bonus_key)\n", + " with open(loc+'.hkl', 'r') as filename:\n", + " info = hickle.load(filename)\n", + " randos = info['randos']\n", + " z_grid = info['z_grid']\n", + " pdfs = info['pdfs']\n", + " ls = bonus[bonus_key][0]\n", + " a = bonus[bonus_key][1]\n", + " lab = re.sub(r'[\\_]', '', bonus_key)\n", + " line, = ax.plot([-1., 0.], [0., 0.], linestyle=ls, alpha=a, color='k', label=lab[:-1])\n", + " lines.append(line)\n", + " leg = ax.legend(loc='upper right', handles=lines)\n", + "# klds[bonus_key] = []\n", + " for i in range(n_plot):\n", + " data = (z_grid, pdfs[randos[i]])\n", + " data = qp.utils.normalize_integral(qp.utils.normalize_gridded(data))\n", + " ax.plot(data[0], data[1], linestyle=ls, alpha=a, color=color_cycle[i])\n", + " # ax.legend(loc='upper right')\n", + "# for i in range(size):\n", + "# data = (z_grid, pdfs[i])\n", + "# kld = qp.utils.quick_kl_divergence(ref_pdfs[i], pdfs[i], dx=0.01)\n", + "# klds[bonus_key].append(kld)\n", + "# plot_bins = np.linspace(-3., 3., 20)\n", + "# for bonus_key in bonus.keys()[1:-1]:\n", + "# ax_check.hist(np.log(np.array(klds[bonus_key])), alpha=a, \n", + "# histtype='stepfilled', edgecolor='k', \n", + "# label=bonus_key, normed=True, bins=plot_bins, lw=2)\n", + " ax.set_xlabel(r'$z$', fontsize=14)\n", + " ax.set_ylabel(r'$p(z)$', fontsize=14)\n", + " ax.set_xlim(min(z_grid), max(z_grid))\n", + " ax.set_title(dataset_info[name]['name']+r' examples with $N_{f}=$'+str(N_floats), fontsize=16)\n", + " saveloc = os.path.join(path, 'pzs'+str(size)+name+str(N_floats)+'all'+str(init))\n", + " 
fig.savefig(saveloc+'.pdf', dpi=250)\n", + "# ax_check.legend()\n", + "# ax_check.set_ylabel('frequency', fontsize=14)\n", + "# ax_check.set_xlabel(r'$\\mathrm{KLD}$', fontsize=14)\n", + "# ax_check.set_title(name+r' data $p(\\mathrm{KLD})$ with $N_{f}='+str(N_floats)+r'$', fontsize=16)\n", + "# fig_check.savefig(saveloc+'kld_check.pdf', dpi=250)\n", + " plt.close()\n", + "# with open(saveloc+'.p', 'w') as kldfile:\n", + "# pickle.dump(klds, kldfile)\n", + " " ] }, { @@ -315,10 +595,11 @@ }, "outputs": [], "source": [ - "def plot_individual(n_gals_use, dataset_key, N_floats, i):\n", + "def plot_individual_kld(n_gals_use, dataset_key, N_floats, i):\n", " \n", " path = os.path.join(dataset_key, str(n_gals_use))\n", - " loc = os.path.join(path, str(N_floats)+'kld_hist'+str(n_gals_use)+dataset_key+str(i))\n", + " a = 1./len(formats)\n", + " loc = os.path.join(path, 'kld_hist'+str(n_gals_use)+dataset_key+str(N_floats)+'_'+str(i))\n", " with open(loc+'.hkl', 'r') as filename:\n", " info = hickle.load(filename)\n", " z_grid = info['z_grid']\n", @@ -326,25 +607,66 @@ " pz_klds = info['pz_klds']\n", " \n", " plt.figure()\n", - " plot_bins = np.linspace(-3., 3., 20)\n", - " a = 1./len(formats)\n", + " plot_bins = np.linspace(-10., 5., 30)\n", " for key in pz_klds.keys():\n", " logdata = qp.utils.safelog(pz_klds[key])\n", - " kld_hist = plt.hist(logdata, color=colors[key], alpha=a, histtype='stepfilled', edgecolor='k',\n", - " label=key, normed=True, bins=plot_bins, linestyle=stepstyles[key], ls=stepstyles[key], lw=3)\n", - " hist_max.append(max(kld_hist[0]))\n", " dist_min.append(min(logdata))\n", " dist_max.append(max(logdata))\n", + "# plot_bins = np.linspace(-10., 5., 20)\n", + " kld_hist = plt.hist(logdata, color=colors[key], alpha=a, histtype='stepfilled', edgecolor='k',\n", + " label=key, normed=True, bins=plot_bins, linestyle=stepstyles[key], ls=stepstyles[key], lw=2)\n", + "# kld_hist = plt.hist(pz_klds[key], color=colors[key], alpha=a, histtype='stepfilled', edgecolor='k',\n", + "# label=key, normed=True, bins=plot_bins, linestyle=stepstyles[key], ls=stepstyles[key], lw=2)\n", + " hist_max.append(max(kld_hist[0]))\n", + "# print(loc+': min log[KLD]='+str(logdata)+' at N='+str(np.argmin(logdata)))\n", " plt.legend()\n", " plt.ylabel('frequency', fontsize=14)\n", - " plt.xlabel(r'$\\log[KLD]$', fontsize=14)\n", + "# plt.xlabel(r'$\\log[\\mathrm{KLD}]$', fontsize=14)\n", + " plt.xlabel(r'$\\log[\\mathrm{KLD}]$', fontsize=14)\n", "# plt.xlim(min(dist_min), max(dist_max))\n", - "# plt.ylim(0., max(hist_max))\n", - " plt.title(dataset_info[dataset_key]['name']+r' data $p(KLD)$ with $N_{f}='+str(N_floats)+r'$', fontsize=16)\n", - " plt.savefig(loc+'.png', dpi=250)\n", + " plt.ylim(0., max(hist_max))\n", + " plt.title(dataset_info[dataset_key]['name']+r' data $p(\\log[\\mathrm{KLD}])$ with $N_{f}='+str(N_floats)+r'$', fontsize=16)\n", + " plt.savefig(loc+'.pdf', dpi=250)\n", " plt.close()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def plot_all_kld(size, name, i):\n", + " path = os.path.join(name, str(size))\n", + " fig, ax = plt.subplots()\n", + " fig.canvas.draw()\n", + " for i in instantiations:\n", + " to_plot = {}\n", + " for f in formats:\n", + " to_plot[f] = []\n", + " for Nf in floats:\n", + " place = os.path.join(path, 'kld_hist'+str(size)+name+str(Nf)+'_'+str(i))\n", + " with open(place+'.hkl', 'r') as filename:\n", + " klds = hickle.load(filename)['pz_klds']\n", + " for f in formats:\n", + " 
to_plot[f].append(klds[f])\n", + "# print(name, size, i, Nf, f, klds[f])\n", + " for f in formats:\n", + " to_plot[f] = np.array(to_plot[f])\n", + " delta_info = np.ones((len(floats), size))\n", + " for Nf in floats:\n", + " delta_info[:-1] = to_plot[f][1:] - to_plot[f][:-1]\n", + " delta_info[-1] = -1. * to_plot[f][-1]\n", + " ax.plot(floats, delta_info, color=colors[f])\n", + " ax.set_xlabel()\n", + " ax.set_ylabel()\n", + " ax.semilogx()\n", + " ax.set_xticks(floats)\n", + " ax.set_xticklabels([r'$3\\to 10$', r'$10\\to 30$', r'$30\\to 100$', r'$100\\to \\infty$'])" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -366,53 +688,73 @@ " z_range = zlim[-1] - zlim[0]\n", " delta_z = z_range / len(z_grid)\n", " \n", - " print('stacking the ensembles')\n", - " stack_start = timeit.default_timer()\n", + " n_gals_use = E0.n_pdfs\n", + " \n", + "# print('stacking the ensembles')\n", + "# stack_start = timeit.default_timer()\n", " stacked_pdfs, stacks = {}, {}\n", " for key in formats:\n", + " start = timeit.default_timer()\n", " stacked_pdfs[key] = qp.PDF(gridded=E[key].stack(z_grid, using=key, \n", " vb=False)[key])\n", " stacks[key] = stacked_pdfs[key].evaluate(z_grid, using='gridded', norm=True, vb=False)[1]\n", + " print('stacked '+key+ ' in '+str(timeit.default_timer()-start))\n", " \n", + " stack_start = timeit.default_timer()\n", " stacked_pdfs['truth'] = qp.PDF(gridded=E0.stack(z_grid, using='truth', \n", " vb=False)['truth'])\n", " \n", " stacks['truth'] = stacked_pdfs['truth'].evaluate(z_grid, using='gridded', norm=True, vb=False)[1]\n", - " print('stacked the ensembles in '+str(timeit.default_timer() - stack_start))\n", + " print('stacked truth in '+str(timeit.default_timer() - stack_start))\n", " \n", - " print('calculating the metrics')\n", - " metric_start = timeit.default_timer()\n", " klds = {}\n", " for key in formats:\n", + " kld_start = timeit.default_timer()\n", " klds[key] = qp.utils.calculate_kl_divergence(stacked_pdfs['truth'],\n", " stacked_pdfs[key], \n", " limits=zlim, dx=delta_z)\n", - " print('calculated the metrics in '+str(timeit.default_timer() - metric_start))\n", + " print('calculated the '+key+' stacked kld in '+str(timeit.default_timer() - kld_start))\n", + " save_one_stat(dataset_key, n_gals_use, n_floats_use, i, klds, 'nz_klds')\n", + "# save_nz_metrics(name, size, n_floats_use, klds, 'nz_klds')\n", + " \n", + " moments = {}\n", + " for key in formats_plus:\n", + " moment_start = timeit.default_timer()\n", + " moments[key] = []\n", + " for n in range(n_moments_use):\n", + " moments[key].append(qp.utils.calculate_moment(stacked_pdfs[key], n, \n", + " limits=zlim, \n", + " dx=delta_z, \n", + " vb=False))\n", + " print('calculated the '+key+' stacked moments in '+str(timeit.default_timer() - moment_start))\n", + " save_one_stat(dataset_key, n_gals_use, n_floats_use, i, moments, 'nz_moments')\n", + "# save_moments(name, size, n_floats_use, moments, 'nz_moments') \n", " \n", " path = os.path.join(dataset_key, str(E0.n_pdfs))\n", - " loc = os.path.join(path, str(n_floats_use)+'nz_comp'+str(n_gals_use)+dataset_key+str(i))\n", + " loc = os.path.join(path, 'nz_comp'+str(n_gals_use)+dataset_key+str(n_floats_use)+'_'+str(i))\n", " with open(loc+'.hkl', 'w') as filename:\n", " info = {}\n", " info['z_grid'] = z_grid\n", " info['stacks'] = stacks\n", " info['klds'] = klds\n", + " info['moments'] = moments\n", " hickle.dump(info, filename)\n", " \n", - " return(stacked_pdfs, klds)" + " return(stacked_pdfs)" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ "def plot_estimators(n_gals_use, dataset_key, n_floats_use, i=None):\n", " \n", " path = os.path.join(dataset_key, str(n_gals_use))\n", - " loc = os.path.join(path, str(n_floats_use)+'nz_comp'+str(n_gals_use)+dataset_key+str(i))\n", + " loc = os.path.join(path, 'nz_comp'+str(n_gals_use)+dataset_key+str(n_floats_use)+'_'+str(i))\n", " with open(loc+'.hkl', 'r') as filename:\n", " info = hickle.load(filename)\n", " z_grid = info['z_grid']\n", @@ -420,18 +762,18 @@ " klds = info['klds']\n", " \n", " plt.figure()\n", - " plt.plot(z_grid, stacks['truth'], color='black', lw=4, alpha=0.3, label='truth')\n", + " plt.plot(z_grid, stacks['truth'], color='black', lw=3, alpha=0.3, label='original')\n", " nz_max.append(max(stacks['truth']))\n", " for key in formats:\n", " nz_max.append(max(stacks[key]))\n", - " plt.plot(z_grid, stacks[key], label=key+r' KLD='+str(klds[key]), color=colors[key], linestyle=styles[key])\n", + " plt.plot(z_grid, stacks[key], label=key+r' KLD='+str(klds[key])[:8], color=colors[key], linestyle=styles[key])\n", " plt.xlabel(r'$z$', fontsize=14)\n", " plt.ylabel(r'$\\hat{n}(z)$', fontsize=14)\n", " plt.xlim(min(z_grid), max(z_grid))\n", - "# plt.ylim(0., max(nz_max))\n", + " plt.ylim(0., max(nz_max))\n", " plt.legend()\n", " plt.title(dataset_info[dataset_key]['name']+r' data $\\hat{n}(z)$ with $N_{f}='+str(n_floats_use)+r'$', fontsize=16)\n", - " plt.savefig(loc+'.png', dpi=250)\n", + " plt.savefig(loc+'.pdf', dpi=250)\n", " plt.close()" ] }, @@ -471,37 +813,129 @@ }, "outputs": [], "source": [ - "def save_pz_metrics(dataset_key, n_gals_use, N_f, metric_moments):\n", + "def save_moments(dataset_name, n_gals_use, N_f, stat, stat_name):\n", "\n", - " path = os.path.join(dataset_key, str(n_gals_use))\n", - " loc = os.path.join(path, 'pz_klds'+str(n_gals_use)+dataset_key)\n", + " path = os.path.join(dataset_name, str(n_gals_use))\n", + " loc = os.path.join(path, stat_name+str(n_gals_use)+dataset_name)\n", " \n", " if os.path.exists(loc+'.hkl'):\n", - " with open(loc+'.hkl', 'r') as pz_file:\n", + " with open(loc+'.hkl', 'r') as stat_file:\n", " #read in content of list/dict\n", - " pz_stats = hickle.load(pz_file)\n", + " stats = hickle.load(stat_file)\n", " else:\n", - " pz_stats = {}\n", - " pz_stats['N_f'] = []\n", - " for f in formats:\n", - " pz_stats[f] = []\n", - " for m in range(n_moments_use + 1):\n", - " pz_stats[f].append([])\n", + " stats = {}\n", + " stats['N_f'] = []\n", + " for f in stat.keys():\n", + " stats[f] = []\n", + " for m in range(n_moments_use):\n", + " stats[f].append([])\n", "\n", - " if N_f not in pz_stats['N_f']:\n", - " pz_stats['N_f'].append(N_f)\n", - " for f in formats:\n", - " for m in range(n_moments_use + 1):\n", - " pz_stats[f][m].append([])\n", + " if N_f not in stats['N_f']:\n", + " stats['N_f'].append(N_f)\n", + " for f in stat.keys():\n", + " for m in range(n_moments_use):\n", + " stats[f][m].append([])\n", " \n", - " where_N_f = pz_stats['N_f'].index(N_f)\n", + " where_N_f = stats['N_f'].index(N_f)\n", " \n", - " for f in formats:\n", - " for m in range(n_moments_use + 1):\n", - " pz_stats[f][m][where_N_f].append(metric_moments[f][m])\n", + " for f in stat.keys():\n", + " for m in range(n_moments_use):\n", + " stats[f][m][where_N_f].append(stat[f][m])\n", "\n", - " with open(loc+'.hkl', 'w') as pz_file:\n", - " hickle.dump(pz_stats, pz_file)" + " with open(loc+'.hkl', 'w') as stat_file:\n", + " hickle.dump(stats, stat_file)" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# include second axis with mean KLD values?\n", + "# somehow combining pz_kld_moments with this?\n", + "# something is not right here with limits, need to check after nersc run\n", + "def plot_kld_stats(name, size):\n", + " a = 1./len(formats)\n", + " topdir = os.path.join(name, str(size))\n", + " \n", + " fig_one, ax_one = plt.subplots(figsize=(5, 5))\n", + " fig_one.canvas.draw()\n", + " mean_deltas, std_deltas = {}, {}\n", + " for f in formats:\n", + " mean_deltas[f], std_deltas[f] = [], []\n", + " ax_one.plot([1000., 1000.], [1., 10.], color=colors[f], alpha=a, label=f, linestyle=styles[f])\n", + " for i in instantiations:\n", + " to_plot = {}\n", + " for f in formats:\n", + " to_plot[f] = []\n", + " mean_deltas[f].append([])\n", + " std_deltas[f].append([])\n", + " for Nf in floats:\n", + " loc = os.path.join(topdir, 'kld_hist'+str(size)+name+str(Nf)+'_'+str(i))\n", + " with open(loc+'.hkl', 'r') as filename:\n", + " klds = hickle.load(filename)['pz_klds']\n", + " for f in formats:\n", + " to_plot[f].append(klds[f])\n", + " for f in formats:\n", + " to_plot[f] = np.array(to_plot[f])\n", + " delta_info = np.zeros((len(floats), size))\n", + " delta_info[:-1] = to_plot[f][:-1] - to_plot[f][1:]\n", + " delta_info[-1] = to_plot[f][-1]\n", + "# delta_info[delta_info < qp.utils.epsilon] = qp.utils.epsilon\n", + "# log_delta_info = np.log(delta_info)\n", + "# ax_one.plot(floats, log_delta_info)\n", + " mean_deltas[f][i] = np.mean(delta_info, axis=1)\n", + " std_deltas[f][i] = np.std(delta_info, axis=1)\n", + " indie_delta_kld_min.append(np.min(mean_deltas[f][i] - std_deltas[f][i]))\n", + " indie_delta_kld_max.append(np.max(mean_deltas[f][i] + std_deltas[f][i]))\n", + " ax_one.plot(floats, mean_deltas[f][i], color=colors[f], alpha=a, linestyle=styles[f])\n", + " ax_one.set_ylabel(r'$\\Delta\\mathrm{KLD}$ (nats)')\n", + "# ax_one.semilogy()\n", + " ax_one.set_ylim(0., np.max(indie_delta_kld_max))\n", + " ax_one.set_xlim(min(floats), max(floats))\n", + " ax_one.set_xlabel('change in number of parameters')\n", + " ax_one.semilogx()\n", + " ax_one.set_xticks(floats)\n", + " ax_one.set_xticklabels([r'$3\\to 10$', r'$10\\to 30$', r'$30\\to 100$', r'$100\\to \\infty$'])\n", + " ax_one.legend(loc='upper right')\n", + " ax_one.set_title(dataset_info[name]['name']+r' data per-PDF $\\Delta\\mathrm{KLD}$', fontsize=16)\n", + " place = os.path.join(topdir, 'indie_klds'+str(size)+name)\n", + " fig_one.savefig(place+'_each.pdf', dpi=250)\n", + " plt.close()\n", + " \n", + " fig, ax = plt.subplots(figsize=(5, 5))\n", + " for f in formats:\n", + " mean_deltas[f] = np.array(mean_deltas[f])\n", + " std_deltas[f] = np.array(std_deltas[f])\n", + " global_delta_mean = np.mean(mean_deltas[f], axis=0)\n", + " global_delta_std = np.sqrt(np.sum(mean_deltas[f]**2, axis=0))\n", + " print(global_delta_mean, global_delta_std)\n", + "# x_cor = np.array([floats[:-1], floats[:-1], floats[1:], floats[1:]])\n", + " y_plus = global_delta_mean + global_delta_std\n", + " y_minus = global_delta_mean - global_delta_std\n", + "# y_minus[y_minus < qp.utils.epsilon] = qp.utils.epsilon\n", + " indie_delta_kld_min.append(np.min(y_minus))\n", + " indie_delta_kld_max.append(np.max(y_plus))\n", + "# y_cor = np.array([y_minus[:-1], y_plus[:-1], y_plus[1:], y_minus[1:]])\n", + "# ax.fill(x_cor, y_cor, color=colors[f], alpha=a, linewidth=0.)\n", + " ax.fill_between(floats, y_minus, y_plus, color=colors[f], alpha=a, linewidth=0.)\n", + 
" ax.plot(floats, global_delta_mean, color=colors[f], linestyle=styles[f], label=f)\n", + " ax.set_ylabel(r'$\\Delta\\mathrm{KLD}$ (nats)')\n", + "# ax.semilogy()\n", + " ax.set_ylim(0., np.max(indie_delta_kld_max))\n", + " ax.set_xlim(min(floats), max(floats))\n", + " ax.set_xlabel('change in number of parameters')\n", + " ax.semilogx()\n", + " ax.set_xticks(floats)\n", + " ax.set_xticklabels([r'$3\\to 10$', r'$10\\to 30$', r'$30\\to 100$', r'$100\\to \\infty$'])\n", + " ax.legend(loc='upper right')\n", + " ax.set_title(dataset_info[name]['name']+r' data per-PDF $\\Delta\\mathrm{KLD}$', fontsize=16)\n", + " place = os.path.join(topdir, 'indie_klds'+str(size)+name)\n", + " fig.savefig(place+'_clean.pdf', dpi=250)\n", + " plt.close()" ] }, { @@ -515,16 +949,10 @@ "def plot_pz_metrics(dataset_key, n_gals_use):\n", "\n", " path = os.path.join(dataset_key, str(n_gals_use))\n", - " loc = os.path.join(path, 'pz_klds'+str(n_gals_use)+dataset_key)\n", + " loc = os.path.join(path, 'pz_kld_moments'+str(n_gals_use)+dataset_key)\n", " with open(loc+'.hkl', 'r') as pz_file:\n", " pz_stats = hickle.load(pz_file)\n", - " if len(instantiations) == 10:\n", - " for f in formats:\n", - " for n in range(n_moments_use):\n", - " if not np.shape(pz_stats[f][n]) == (4, 10):\n", - " for s in range(len(pz_stats[f][n])):\n", - " pz_stats[f][n][s] = np.array(np.array(pz_stats[f][n][s])[:10]).flatten()\n", - " \n", + " \n", " flat_floats = np.array(pz_stats['N_f']).flatten()\n", " in_x = np.log(flat_floats)\n", "\n", @@ -534,51 +962,71 @@ " for sp in ax.spines.values():\n", " sp.set_visible(False)\n", "\n", - " shapes = ['o','*','+','x']#,'v','^','<','>']\n", - " marksize = 50\n", + " shapes = moment_shapes\n", + " marksize = 10\n", " a = 1./len(formats)\n", " \n", " fig, ax = plt.subplots()\n", " fig.subplots_adjust(right=1.)\n", " ax_n = ax\n", " for key in formats:\n", - " ax_n.plot([-1], [0], color=colors[key], label=key, linestyle=styles[key], linewidth=1)\n", - " for n in range(1, 4):\n", - " ax.scatter([-1], [0], color='k', marker=shapes[n], s=marksize, label='moment '+str(n))\n", + " ax.plot([-1], [0], color=colors[key], label=key, linewidth=1, linestyle=styles[key], alpha=a)\n", + " for n in range(1, n_moments_use):\n", + " ax.scatter([-1], [0], color='k', alpha=a, marker=shapes[n], facecolors='none', s=2*marksize, label=moment_names[n])\n", + " n_factor = 0.1 * (n - 2)\n", " if n>1:\n", " ax_n = ax.twinx()\n", + " rot_ang = 270\n", + " label_space = 15.\n", + " else:\n", + " rot_ang = 90\n", + " label_space = 0.\n", " if n>2:\n", " ax_n.spines[\"right\"].set_position((\"axes\", 1. 
+ 0.1 * (n-1)))\n", " make_patch_spines_invisible(ax_n)\n", " ax_n.spines[\"right\"].set_visible(True)\n", - " for f in formats:\n", - " data_arr = np.swapaxes(np.array(pz_stats[f][n]), 0, 1)#go from n_floats*instantiations to instantiations*n_floats\n", - " for i in data_arr:\n", - " ax_n.scatter(flat_floats, i, marker=shapes[n], s=marksize, color=colors[f], alpha=a)\n", - " moment_max[n-1].append(max(i))\n", - " ax_n.set_ylabel(moment_names[n], fontsize=14)\n", - " ax_n.set_ylim(0., max(moment_max[n-1]))\n", - " ax.set_xlim(min(flat_floats) - 10**int(np.log10(min(flat_floats))), max(flat_floats) + 10**int(np.log10(max(flat_floats))))\n", - " ax.set_xticks(flat_floats, ['log'+str(ff) for ff in flat_floats])\n", + " for s in range(len(formats)):\n", + " f = formats[s]\n", + " f_factor = 0.05 * (s - 1)\n", + "# print('pz metrics data shape '+str(pz_stats[f][n]))\n", + " data_arr = np.log(np.swapaxes(np.array(pz_stats[f][n]), 0, 1))#go from n_floats*instantiations to instantiations*n_floats\n", + " mean = np.mean(data_arr, axis=0).flatten()\n", + " std = np.std(data_arr, axis=0).flatten()\n", + " y_plus = mean + std\n", + " y_minus = mean - std\n", + "# y_cor = np.array([y_minus[:-1], y_plus[:-1], y_plus[1:], y_minus[1:]])\n", + " ax_n.plot(np.exp(in_x+n_factor), mean, marker=shapes[n], mfc='none', markersize=marksize, linestyle=styles[f], alpha=a, color=colors[f])\n", + " ax_n.vlines(np.exp(in_x+n_factor), y_minus, y_plus, linewidth=3., alpha=a, color=colors[f])\n", + " pz_mean_max[n] = max(pz_mean_max[n], np.max(y_plus))\n", + " pz_mean_min[n] = min(pz_mean_min[n], np.min(y_minus))\n", + " ax_n.set_ylabel(r'$\\log[\\mathrm{'+moment_names[n]+r'}]$', rotation=rot_ang, fontsize=14, labelpad=label_space)\n", + " ax_n.set_ylim((pz_mean_min[n]-1., pz_mean_max[n]+1.))\n", + " ax.set_xscale('log')\n", + " ax.set_xticks(flat_floats)\n", + " ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())\n", + " ax.set_xlim(np.exp(min(in_x)-0.25), np.exp(max(in_x)+0.25))\n", " ax.set_xlabel('number of parameters', fontsize=14)\n", - " ax.set_title('KLD moments on '+str(n_gals_use)+' from '+dataset_info[dataset_key]['name']+' mock catalog', fontsize=16)\n", - " ax.legend(loc='upper right')\n", + " ax.set_title(dataset_info[dataset_key]['name']+r' data $\\log[\\mathrm{KLD}]$ log-moments', fontsize=16)\n", + " ax.legend(loc='lower left')\n", " fig.tight_layout()\n", - " fig.savefig(loc+'.png', dpi=250)\n", + " fig.savefig(loc+'_clean.pdf', dpi=250)\n", " plt.close()\n", " \n", " fig, ax = plt.subplots()\n", " fig.subplots_adjust(right=1.)\n", " ax_n = ax\n", - "\n", " for key in formats:\n", - " ax.plot([-1], [0], color=colors[key], label=key, linewidth=1)\n", - " for n in range(1, 4):\n", - " ax.scatter([-1], [0], color='k', alpha=1., marker=shapes[n], s=marksize, label=moment_names[n])\n", - " for n in range(1, 4):\n", - " n_factor = 0.05 * (n - 2)\n", + " ax_n.plot([-1], [0], color=colors[key], label=key, linestyle=styles[key], alpha=a, linewidth=1)\n", + " for n in range(1, n_moments_use):\n", + " n_factor = 0.1 * (n - 2)\n", + " ax.scatter([-1], [0], color='k', alpha=a, marker=shapes[n], facecolors='none', s=2*marksize, label=moment_names[n])\n", " if n>1:\n", " ax_n = ax.twinx()\n", + " rot_ang = 270\n", + " label_space = 15.\n", + " else:\n", + " rot_ang = 90\n", + " label_space = 0.\n", " if n>2:\n", " ax_n.spines[\"right\"].set_position((\"axes\", 1. 
+ 0.1 * (n-1)))\n", " make_patch_spines_invisible(ax_n)\n", @@ -586,74 +1034,25 @@ " for s in range(len(formats)):\n", " f = formats[s]\n", " f_factor = 0.05 * (s - 1)\n", + "# print('pz metrics data shape '+str(pz_stats[f][n]))\n", " data_arr = np.log(np.swapaxes(np.array(pz_stats[f][n]), 0, 1))#go from n_floats*instantiations to instantiations*n_floats\n", - " mean = np.mean(data_arr, axis=0).flatten()\n", - " std = np.std(data_arr, axis=0).flatten()\n", - " print(dataset_key, f, n, std)\n", - " y_plus = mean + std\n", - " y_minus = mean - std\n", - " y_cor = np.array([y_minus[:-1], y_plus[:-1], y_plus[1:], y_minus[1:]])\n", - " ax_n.scatter(np.exp(in_x+n_factor), mean, marker=shapes[n], s=marksize, alpha=2. * a, color=colors[f])\n", - " ax_n.vlines(np.exp(in_x+n_factor), y_minus, y_plus, linewidth=3., alpha=a, color=colors[f])\n", - " mean_max[n] = max(mean_max[n], np.max(y_plus))\n", - " mean_min[n] = min(mean_min[n], np.min(y_minus))\n", - " ax_n.set_ylabel(moment_names[n], fontsize=14)\n", - " ax_n.set_ylim((mean_min[n]-1., mean_max[n]+1.))\n", + " for i in data_arr:\n", + " ax_n.plot(np.exp(in_x+n_factor), i, linestyle=styles[f], marker=shapes[n], mfc='none', markersize=marksize, color=colors[f], alpha=a)\n", + "# pz_moment_max[n-1].append(max(i))\n", + " ax_n.set_ylabel(r'$\\log[\\mathrm{'+moment_names[n]+r'}]$', rotation=rot_ang, fontsize=14, labelpad=label_space)\n", + " ax_n.set_ylim(pz_mean_min[n]-1., pz_mean_max[n]+1.)\n", " ax.set_xscale('log')\n", " ax.set_xticks(flat_floats)\n", - " ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())\n", + " ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())\n", " ax.set_xlim(np.exp(min(in_x)-0.25), np.exp(max(in_x)+0.25))\n", " ax.set_xlabel('number of parameters', fontsize=14)\n", - " ax.set_title(dataset_info[dataset_key]['name']+r' data $\\log[KLD]$ moments', fontsize=16)\n", + " ax.set_title(dataset_info[dataset_key]['name']+r' data $\\log[\\mathrm{KLD}]$ log-moments', fontsize=16)\n", " ax.legend(loc='lower left')\n", " fig.tight_layout()\n", - " fig.savefig(loc+'_clean.png', dpi=250)\n", + " fig.savefig(loc+'_all.pdf', dpi=250)\n", " plt.close()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We want to plot the KLD on $\\hat{n}(z)$ for all formats as $N_{f}$ changes. We want to repeat this for many subsamples of the catalog to establush error bars on the KLD values." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "def save_nz_metrics(dataset_key, n_gals_use, N_f, nz_klds):\n", - " \n", - " path = os.path.join(dataset_key, str(n_gals_use))\n", - " loc = os.path.join(path, 'nz_kld'+str(n_gals_use)+dataset_key)\n", - " if os.path.exists(loc+'.hkl'):\n", - " with open(loc+'.hkl', 'r') as nz_file:\n", - " #read in content of list/dict\n", - " nz_stats = hickle.load(nz_file)\n", - " else:\n", - " nz_stats = {}\n", - " nz_stats['N_f'] = []\n", - " for f in formats:\n", - " nz_stats[f] = []\n", - " \n", - " if N_f not in nz_stats['N_f']:\n", - " nz_stats['N_f'].append(N_f)\n", - " for f in formats:\n", - " nz_stats[f].append([])\n", - " \n", - " where_N_f = nz_stats['N_f'].index(N_f) \n", - " \n", - " for f in formats:\n", - " nz_stats[f][where_N_f].append(nz_klds[f])\n", - "\n", - " with open(loc+'.hkl', 'w') as nz_file:\n", - " hickle.dump(nz_stats, nz_file)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -662,26 +1061,208 @@ }, "outputs": [], "source": [ - "def plot_nz_metrics(dataset_key, n_gals_use):\n", + "def plot_pz_delta_moments(name, size):\n", + " n_gals_use = size\n", + " extremum = np.zeros(n_moments_use)\n", " \n", - " path = os.path.join(dataset_key, str(n_gals_use))\n", - " loc = os.path.join(path, 'nz_kld'+str(n_gals_use)+dataset_key)\n", - " with open(loc+'.hkl', 'r') as nz_file:\n", - " nz_stats = hickle.load(nz_file)\n", - " if len(instantiations) == 10:\n", - " for f in formats:\n", - " if not np.shape(nz_stats[f]) == (4, 10):\n", - " for s in range(len(floats)):\n", - " nz_stats[f][s] = np.array(np.array(nz_stats[f][s])[:10]).flatten()\n", - "\n", - " flat_floats = np.array(nz_stats['N_f']).flatten()\n", + " # should look like nz_moments\n", + " path = os.path.join(name, str(size))\n", + " loc = os.path.join(path, 'pz_moment_deltas'+str(size)+name)\n", + " with open(loc+'.hkl', 'r') as pz_file:\n", + " pz_stats = hickle.load(pz_file)\n", + " flat_floats = np.array(pz_stats['N_f']).flatten()\n", + " in_x = np.log(flat_floats)\n", + " a = 1./len(formats)\n", + " shapes = moment_shapes\n", + " marksize = 10\n", " \n", - " plt.figure(figsize=(5, 5))\n", - "\n", - " for f in formats:\n", - " data_arr = np.swapaxes(np.array(nz_stats[f]), 0, 1)#turn N_f * instantiations into instantiations * N_f\n", - " n_i = len(data_arr)\n", - " a = 1./len(formats)#1./n_i\n", + " def make_patch_spines_invisible(ax):\n", + " ax.set_frame_on(True)\n", + " ax.patch.set_visible(False)\n", + " for sp in ax.spines.values():\n", + " sp.set_visible(False) \n", + " \n", + " fig, ax = plt.subplots()\n", + " fig.subplots_adjust(right=1.)\n", + " ax_n = ax\n", + " for key in formats:\n", + " ax.plot([-10], [0], color=colors[key], label=key, linestyle=styles[key], alpha=a, linewidth=1)\n", + " for n in range(1, n_moments_use):\n", + " ax.scatter([-10], [0], color='k', alpha=a, marker=shapes[n], facecolors='none', s=2*marksize, label=moment_names[n])\n", + " n_factor = 0.1 * (n - 2)\n", + " if n>1:\n", + " ax_n = ax.twinx()\n", + " rot_ang = 270\n", + " label_space = 15.\n", + " else:\n", + " rot_ang = 90\n", + " label_space = 0.\n", + " if n>2:\n", + " ax_n.spines[\"right\"].set_position((\"axes\", 1. 
+ 0.1 * (n-1)))\n", + " make_patch_spines_invisible(ax_n)\n", + " ax_n.spines[\"right\"].set_visible(True)\n", + " for s in range(len(formats)):\n", + " f = formats[s]\n", + " f_factor = 0.05 * (s - 1)\n", + " old_shape = np.shape(np.array(pz_stats[f][n]))\n", + " new_shape = (old_shape[0], np.prod(old_shape[1:]))\n", + " data_arr = np.abs(np.array(pz_stats[f][n]).reshape(new_shape)) * 100.#go from n_floats*instantiations*n_gals n_floats*(n_gals*n_instantiations)\n", + "# data_arr = np.median(data_arr, axis=2) * 100.\n", + "# data_arr = np.swapaxes(np.array(nz_stats[f][n]), 0, 1)* 100.#np.log(np.swapaxes(np.array(nz_stats[f]), 0, 1)[:][:][n])#go from n_floats*instantiations to instantiations*n_floats\n", + "# mean = np.mean(data_arr, axis=0).flatten()\n", + "# std = np.std(data_arr, axis=0).flatten()\n", + "# mean = np.median(data_arr, axis=-1)\n", + " std = np.log10(np.percentile(data_arr, [25, 50, 75], axis=-1))\n", + " y_plus = std[-1]#mean + std\n", + " y_minus = std[0]#mean - std\n", + " mean = std[1]\n", + "# y_cor = np.array([y_minus, y_plus, y_plus, y_minus])\n", + " ax_n.plot(np.exp(in_x+n_factor), mean, linestyle=styles[f], marker=shapes[n], mfc='none', markersize=marksize, alpha=a, color=colors[f])\n", + " ax_n.vlines(np.exp(in_x+n_factor), y_minus, y_plus, linewidth=3., alpha=a, color=colors[f])\n", + "# print('before '+str((np.shape(data_arr), n, n_delta_max, n_delta_min, y_plus, y_minus)))\n", + " n_delta_max[n] = max(n_delta_max[n], np.max(y_plus))\n", + " n_delta_min[n] = min(n_delta_min[n], np.min(y_minus))\n", + "# old_shape = np.shape(np.array(pz_stats[f][n]))\n", + "# new_shape = (old_shape[0], np.prod(old_shape[1:]))\n", + "# data_arr = np.array(pz_stats[f][n]).reshape(new_shape)#go from n_floats*instantiations to instantiations*n_floats\n", + "# # data_arr = np.array(pz_stats[f][n])\n", + "# # data_arr = np.median(data_arr, axis=2) * 100.\n", + "# mean = np.mean(data_arr, axis=1)\n", + "# std = np.std(data_arr, axis=1)\n", + "# y_plus = (mean + std) * 100.\n", + "# y_minus = (mean - std) * 100.\n", + "# # y_cor = np.array([y_minus, y_plus, y_plus, y_minus])\n", + "# ax_n.plot(np.exp(in_x+n_factor), mean, linestyle=styles[f], marker=shapes[n], mfc='none', markersize=marksize, alpha=a, color=colors[f])\n", + "# ax_n.vlines(np.exp(in_x+n_factor), y_minus, y_plus, linewidth=3., alpha=a, color=colors[f])\n", + "# print('before '+str((np.shape(data_arr), n, n_delta_max, n_delta_min, y_plus, y_minus)))\n", + "# n_delta_max[n] = np.max(n_delta_max[n], np.max(y_plus))\n", + "# n_delta_min[n] = np.min(n_delta_min[n], np.min(y_minus))\n", + "# print('after '+str((n_delta_max, n_delta_min)))\n", + " ax_n.set_ylabel(r'$\\log_{10}$-percent error on '+moment_names[n], rotation=rot_ang, fontsize=14, labelpad=label_space)\n", + " extremum[n] = np.max(np.abs(np.array([n_delta_min[n], n_delta_max[n]]))) + 0.25\n", + " ax_n.set_ylim(-1.*extremum[n], extremum[n])\n", + " ax.set_xscale('log')\n", + " ax.set_xticks(flat_floats)\n", + " ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())\n", + " ax.set_xlim(np.exp(min(in_x)-0.25), np.exp(max(in_x)+0.25))\n", + " ax.set_xlabel('number of parameters', fontsize=14)\n", + " ax.set_title(dataset_info[name]['name']+r' data $\\hat{p}(z)$ moment log-percent errors', fontsize=16)\n", + " ax.legend(loc=dataset_info[name]['legloc_p'])\n", + " fig.tight_layout()\n", + " fig.savefig(loc+'_clean.pdf', dpi=250)\n", + " plt.close()\n", + " \n", + " fig, ax = plt.subplots()\n", + " fig.subplots_adjust(right=1.)\n", + " ax_n = ax\n", + " for key 
in formats:\n", + " ax_n.plot([-10], [0], color=colors[key], label=key, linestyle=styles[key], alpha=a, linewidth=1)\n", + " for n in range(1, n_moments_use):\n", + " n_factor = 0.1 * (n - 2)\n", + " ax.scatter([-10], [0], color='k', alpha=a, marker=shapes[n], facecolors='none', s=2*marksize, label=moment_names[n])\n", + " if n>1:\n", + " ax_n = ax.twinx()\n", + " rot_ang = 270\n", + " label_space = 15.\n", + " else:\n", + " rot_ang = 90\n", + " label_space = 0.\n", + " if n>2:\n", + " ax_n.spines[\"right\"].set_position((\"axes\", 1. + 0.1 * (n-1)))\n", + " make_patch_spines_invisible(ax_n)\n", + " ax_n.spines[\"right\"].set_visible(True)\n", + " for s in range(len(formats)):\n", + " f = formats[s]\n", + " f_factor = 0.05 * (s - 1)\n", + " data_arr = np.swapaxes(np.array(pz_stats[f][n]), 0, 1)\n", + " data_arr = np.median(data_arr, axis=2) * 100.\n", + " for i in data_arr:\n", + " ax_n.plot(np.exp(in_x+n_factor), i, linestyle=styles[f], marker=shapes[n], mfc='none', markersize=marksize, color=colors[f], alpha=a)\n", + " ax_n.set_ylabel(r'median percent error on '+moment_names[n], rotation=rot_ang, fontsize=14, labelpad=label_space)\n", + " ax_n.set_ylim(-10., 10.)#(-1.*extremum[n], extremum[n])\n", + " ax.set_xscale('log')\n", + " ax.set_xticks(flat_floats)\n", + " ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())\n", + " ax.set_xlim(np.exp(min(in_x)-0.25), np.exp(max(in_x)+0.25))\n", + " ax.set_xlabel('number of parameters', fontsize=14)\n", + " ax.set_title(dataset_info[name]['name']+r' data $\\hat{p}(z)$ moment percent errors', fontsize=16)\n", + " ax.legend(loc='upper left')\n", + " fig.tight_layout()\n", + " fig.savefig(loc+'_all.pdf', dpi=250)\n", + " plt.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to plot the KLD on $\\hat{n}(z)$ for all formats as $N_{f}$ changes. We want to repeat this for many subsamples of the catalog to establush error bars on the KLD values." 
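As an aside to the markdown note above: a minimal sketch of how the per-subsample n(z) KLDs it describes could be computed. The helper qp.utils.quick_kl_divergence(p_true, p_approx, dx=...) is the same one called later in this notebook; the function name subsample_nz_klds, its argument names, and the (n_subsamples, n_grid) array convention are illustrative assumptions, not part of the notebook.

    import numpy as np
    import qp

    def subsample_nz_klds(stacked_truth, stacked_approx, dz):
        # stacked_truth, stacked_approx: (n_subsamples, n_grid) evaluations of the
        # true and approximated stacked n(z) on a shared redshift grid of spacing dz
        klds = np.array([qp.utils.quick_kl_divergence(t, a, dx=dz)
                         for t, a in zip(stacked_truth, stacked_approx)])
        # summarize the scatter across subsamples in log space, mirroring how the
        # "clean" KLD plot below reports a mean and standard deviation of log(KLD)
        return np.exp(np.mean(np.log(klds))), np.std(np.log(klds))

The spread returned here is the kind of quantity the error bars in the stacked-estimator KLD plots are meant to convey.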
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def save_nz_metrics(dataset_name, n_gals_use, N_f, nz_klds, stat_name):\n", + " \n", + " path = os.path.join(dataset_name, str(n_gals_use))\n", + " loc = os.path.join(path, stat_name+str(n_gals_use)+dataset_name)\n", + " if os.path.exists(loc+'.hkl'):\n", + " with open(loc+'.hkl', 'r') as nz_file:\n", + " #read in content of list/dict\n", + " nz_stats = hickle.load(nz_file)\n", + " else:\n", + " nz_stats = {}\n", + " nz_stats['N_f'] = []\n", + " for f in formats:\n", + " nz_stats[f] = []\n", + " \n", + " if N_f not in nz_stats['N_f']:\n", + " nz_stats['N_f'].append(N_f)\n", + " for f in formats:\n", + " nz_stats[f].append([])\n", + " \n", + " where_N_f = nz_stats['N_f'].index(N_f) \n", + " \n", + " for f in formats:\n", + " nz_stats[f][where_N_f].append(nz_klds[f])\n", + "\n", + " with open(loc+'.hkl', 'w') as nz_file:\n", + " hickle.dump(nz_stats, nz_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def plot_nz_klds(dataset_key, n_gals_use):\n", + " \n", + " path = os.path.join(dataset_key, str(n_gals_use))\n", + " loc = os.path.join(path, 'nz_klds'+str(n_gals_use)+dataset_key)\n", + " with open(loc+'.hkl', 'r') as nz_file:\n", + " nz_stats = hickle.load(nz_file)\n", + "# if len(instantiations) == 10:\n", + "# for f in formats:\n", + "# if not np.shape(nz_stats[f]) == (4, 10):\n", + "# for s in range(len(floats)):\n", + "# nz_stats[f][s] = np.array(np.array(nz_stats[f][s])[:10]).flatten()\n", + "\n", + " flat_floats = np.array(nz_stats['N_f']).flatten()\n", + " \n", + " plt.figure(figsize=(5, 5))\n", + " for f in formats:\n", + "# print('nz klds data shape '+str(nz_stats[f][n]))\n", + " data_arr = np.swapaxes(np.array(nz_stats[f]), 0, 1)#turn N_f * instantiations into instantiations * N_f\n", + " n_i = len(data_arr)\n", + " a = 1./len(formats)#1./n_i\n", " plt.plot([10. * max(flat_floats), 10. * max(flat_floats)], [1., 10.], color=colors[f], alpha=a, label=f, linestyle=styles[f])\n", " for i in data_arr:\n", " plt.plot(flat_floats, i, color=colors[f], alpha=a, linestyle=styles[f])\n", @@ -690,45 +1271,277 @@ " plt.semilogy()\n", " plt.semilogx()\n", " plt.xticks(flat_floats, [str(ff) for ff in flat_floats])\n", - " plt.ylim(min(kld_min) / 10., 10. * max(kld_max))\n", - " plt.xlim(min(flat_floats) / 3., max(flat_floats) * 3.)\n", + " plt.ylim(min(kld_min), max(kld_max))\n", + " plt.xlim(min(flat_floats), max(flat_floats))\n", " plt.xlabel(r'number of parameters', fontsize=14)\n", " plt.ylabel(r'KLD', fontsize=14)\n", " plt.legend(loc='upper right')\n", " plt.title(r'$\\hat{n}(z)$ KLD on '+str(n_gals_use)+' from '+dataset_info[dataset_key]['name']+' mock catalog', fontsize=16)\n", - "\n", - " plt.savefig(loc+'.png', dpi=250)\n", + " plt.savefig(loc+'_all.pdf', dpi=250)\n", " plt.close()\n", "\n", " plt.figure(figsize=(5, 5))\n", " a = 1./len(formats)\n", " for f in formats:\n", + "# print('nz klds data shape '+str(nz_stats[f][n]))\n", " data_arr = np.swapaxes(np.array(nz_stats[f]), 0, 1)#turn N_f * instantiations into instantiations * N_f\n", " plt.plot([10. * max(flat_floats), 10. 
* max(flat_floats)], [1., 10.], color=colors[f], label=f, linestyle=styles[f])\n", " kld_min.append(np.min(data_arr))\n", " kld_max.append(np.max(data_arr))\n", - " mean = np.mean(data_arr, axis=0)\n", - " std = np.std(data_arr, axis=0)\n", + " mean = np.mean(np.log(data_arr), axis=0)\n", + " std = np.std(np.log(data_arr), axis=0)\n", " x_cor = np.array([flat_floats[:-1], flat_floats[:-1], flat_floats[1:], flat_floats[1:]])\n", - " y_plus = mean + std\n", - " y_minus = mean - std\n", + " y_plus = np.exp(mean + std)\n", + " y_minus = np.exp(mean - std)\n", " y_cor = np.array([y_minus[:-1], y_plus[:-1], y_plus[1:], y_minus[1:]])\n", - " plt.plot(flat_floats, mean, color=colors[f], linestyle=styles[f])\n", + " plt.plot(flat_floats, np.exp(mean), color=colors[f], linestyle=styles[f])\n", " plt.fill(x_cor, y_cor, color=colors[f], alpha=a, linewidth=0.)\n", " plt.semilogy()\n", " plt.semilogx()\n", " plt.xticks(flat_floats, [str(ff) for ff in flat_floats])\n", - " plt.ylim(min(kld_min) / 10., 10. * max(kld_max))\n", + " plt.ylim(min(kld_min), max(kld_max))\n", " plt.xlim(min(flat_floats), max(flat_floats))\n", " plt.xlabel(r'number of parameters', fontsize=14)\n", " plt.ylabel(r'KLD', fontsize=14)\n", " plt.legend(loc='upper right')\n", " plt.title(dataset_info[dataset_key]['name']+r' data $\\hat{n}(z)$ KLD', fontsize=16)\n", + " plt.savefig(loc+'_clean.pdf', dpi=250)\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def plot_nz_moments(dataset_key, n_gals_use):\n", "\n", - " plt.savefig(loc+'_clean.png', dpi=250)\n", + " path = os.path.join(dataset_key, str(n_gals_use))\n", + " loc = os.path.join(path, 'nz_moments'+str(n_gals_use)+dataset_key)\n", + " with open(loc+'.hkl', 'r') as nz_file:\n", + " nz_stats = hickle.load(nz_file)\n", + " flat_floats = np.array(nz_stats['N_f']).flatten()\n", + " in_x = np.log(flat_floats)\n", + " a = 1./len(formats)\n", + " shapes = moment_shapes\n", + " marksize = 10\n", + " \n", + " def make_patch_spines_invisible(ax):\n", + " ax.set_frame_on(True)\n", + " ax.patch.set_visible(False)\n", + " for sp in ax.spines.values():\n", + " sp.set_visible(False) \n", + " \n", + " fig, ax = plt.subplots()\n", + " fig.subplots_adjust(right=1.)\n", + " ax_n = ax\n", + " for key in formats:\n", + " ax.plot([-10], [0], color=colors[key], label=key, linestyle=styles[key], alpha=a, linewidth=1)\n", + " for n in range(1, n_moments_use):\n", + " ax.scatter([-10], [0], color='k', alpha=a, marker=shapes[n], facecolors='none', s=2*marksize, label=moment_names[n])\n", + " n_factor = 0.1 * (n - 2)\n", + " truth = np.swapaxes(np.array(nz_stats['truth'][n]), 0, 1)\n", + " if n>1:\n", + " ax_n = ax.twinx()\n", + " rot_ang = 270\n", + " label_space = 15.\n", + " else:\n", + " rot_ang = 90\n", + " label_space = 0.\n", + " if n>2:\n", + " ax_n.spines[\"right\"].set_position((\"axes\", 1. 
+ 0.1 * (n-1)))\n", + " make_patch_spines_invisible(ax_n)\n", + " ax_n.spines[\"right\"].set_visible(True)\n", + " for s in range(len(formats)):\n", + " f = formats[s]\n", + " f_factor = 0.05 * (s - 1)\n", + " data_arr = (np.swapaxes(np.array(nz_stats[f][n]), 0, 1) - truth) / truth * 100.#np.log(np.swapaxes(np.array(nz_stats[f]), 0, 1)[:][:][n])#go from n_floats*instantiations to instantiations*n_floats\n", + "# data_arr = np.abs(np.array(pz_stats[f][n]).reshape(new_shape)) * 100.#go from n_floats*instantiations*n_gals n_floats*(n_gals*n_instantiations)\n", + "# data_arr = np.median(data_arr, axis=2) * 100.\n", + "# data_arr = np.swapaxes(np.array(nz_stats[f][n]), 0, 1)* 100.#np.log(np.swapaxes(np.array(nz_stats[f]), 0, 1)[:][:][n])#go from n_floats*instantiations to instantiations*n_floats\n", + "# mean = np.mean(data_arr, axis=0).flatten()\n", + "# std = np.std(data_arr, axis=0).flatten()\n", + "# mean = np.median(data_arr, axis=-1)\n", + "# std = np.log10(np.percentile(np.abs(data_arr), [25, 50, 75], axis=0))\n", + " std = np.percentile(data_arr, [25, 50, 75], axis=0)\n", + " y_plus = std[-1]#mean + std\n", + " y_minus = std[0]#mean - std\n", + " mean = std[1]\n", + "# mean = np.mean(data_arr, axis=0).flatten()\n", + "# std = np.std(data_arr, axis=0).flatten()\n", + "# y_plus = mean + std\n", + "# y_minus = mean - std\n", + "# y_cor = np.array([y_minus[:-1], y_plus[:-1], y_plus[1:], y_minus[1:]])\n", + " ax_n.plot(np.exp(in_x+n_factor), mean, linestyle=styles[f], marker=shapes[n], mfc='none', markersize=marksize, alpha=a, color=colors[f])\n", + " ax_n.vlines(np.exp(in_x+n_factor), y_minus, y_plus, linewidth=3., alpha=a, color=colors[f])\n", + " nz_mean_max[n] = max(nz_mean_max[n], np.max(y_plus))\n", + " nz_mean_min[n] = min(nz_mean_min[n], np.min(y_minus))\n", + " ax_n.set_ylabel(r'percent error on '+moment_names[n], rotation=rot_ang, fontsize=14, labelpad=label_space)\n", + " extremum = np.max(np.abs([nz_mean_min[n], nz_mean_max[n]])) + 1.#0.25\n", + " ax_n.set_ylim(-1. * extremum, extremum)\n", + " ax.set_xscale('log')\n", + " ax.set_xticks(flat_floats)\n", + " ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())\n", + " ax.set_xlim(np.exp(min(in_x)-0.25), np.exp(max(in_x)+0.25))\n", + " ax.set_xlabel('number of parameters', fontsize=14)\n", + " ax.set_title(dataset_info[dataset_key]['name']+r' data $\\hat{n}(z)$ moment percent errors', fontsize=16)\n", + " ax.legend(loc=dataset_info[name]['legloc_n'])#FINDME!\n", + " fig.tight_layout()\n", + " fig.savefig(loc+'_clean_unlog.pdf', dpi=250)\n", + " plt.close()\n", + " \n", + " fig, ax = plt.subplots()\n", + " fig.subplots_adjust(right=1.)\n", + " ax_n = ax\n", + " for key in formats:\n", + " ax_n.plot([-10], [0], color=colors[key], label=key, linestyle=styles[key], alpha=a, linewidth=1)\n", + " for n in range(1, n_moments_use):\n", + " n_factor = 0.1 * (n - 2)\n", + " ax.scatter([-10], [0], color='k', alpha=a, marker=shapes[n], facecolors='none', s=2*marksize, label=moment_names[n])\n", + " truth = np.swapaxes(np.array(nz_stats['truth'][n]), 0, 1)\n", + " if n>1:\n", + " ax_n = ax.twinx()\n", + " rot_ang = 270\n", + " label_space = 15.\n", + " else:\n", + " rot_ang = 90\n", + " label_space = 0.\n", + " if n>2:\n", + " ax_n.spines[\"right\"].set_position((\"axes\", 1. 
+ 0.1 * (n-1)))\n", + " make_patch_spines_invisible(ax_n)\n", + " ax_n.spines[\"right\"].set_visible(True)\n", + " for s in range(len(formats)):\n", + " f = formats[s]\n", + " f_factor = 0.05 * (s - 1)\n", + " data_arr = (np.swapaxes(np.array(nz_stats[f][n]), 0, 1) - truth) / truth * 100.\n", + " for i in data_arr:\n", + " ax_n.plot(np.exp(in_x+n_factor), i, linestyle=styles[f], marker=shapes[n], mfc='none', markersize=marksize, color=colors[f], alpha=a)\n", + " ax_n.set_ylabel(r'median percent error on '+moment_names[n], rotation=rot_ang, fontsize=14, labelpad=label_space)\n", + " ax_n.set_ylim(-15., 15.)\n", + " ax.set_xscale('log')\n", + " ax.set_xticks(flat_floats)\n", + " ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())\n", + " ax.set_xlim(np.exp(min(in_x)-0.25), np.exp(max(in_x)+0.25))\n", + " ax.set_xlabel('number of parameters', fontsize=14)\n", + " ax.set_title(dataset_info[dataset_key]['name']+r' data $\\hat{n}(z)$ moment percent errors', fontsize=16)\n", + " ax.legend(loc='lower left')\n", + " fig.tight_layout()\n", + " fig.savefig(loc+'_all.pdf', dpi=250)\n", " plt.close()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def print_nz_moments(dataset_key, n_gals_use):\n", + " path = os.path.join(dataset_key, str(n_gals_use))\n", + " \n", + " dz = dataset_info[dataset_key]['delta_z']\n", + " z_grid = dataset_info[dataset_key]['z_grid']\n", + " full_stack = {}\n", + " all_moments = {}\n", + " for f in formats_plus:\n", + " full_stack[f] = []\n", + " all_moments[f] = []\n", + " for nf in range(len(floats)):\n", + " n_floats_use = floats[nf]\n", + " for f in formats_plus:\n", + " full_stack[f].append(np.zeros(len(z_grid)))\n", + " all_moments[f].append([])\n", + " for i in instantiations:\n", + " loc = os.path.join(path, 'nz_comp'+str(n_gals_use)+dataset_key+str(n_floats_use)+'_'+str(i))\n", + " with open(loc+'.hkl', 'r') as filename:\n", + " info = hickle.load(filename)\n", + "# z_grid = info['z_grid']\n", + " stacks = info['stacks']\n", + "# klds = info['klds']\n", + " for key in formats_plus:\n", + " full_stack[key][nf] += stacks[key]\n", + " for n in range(1, n_moments_use):\n", + " ngrid = z_grid**n\n", + " all_moments['truth'][nf].append(qp.utils.quick_moment(full_stack['truth'][nf], ngrid, dz))\n", + " for key in formats:\n", + " all_moments[key][nf].append((qp.utils.quick_moment(full_stack[key][nf], ngrid, dz) - all_moments['truth'][nf][-1]) / all_moments['truth'][nf][-1])\n", + " for f in formats:\n", + " all_moments[f] = np.array(all_moments[f])\n", + " print(dataset_key, n_gals_use, all_moments)\n", + " \n", + " in_x = np.log(floats)\n", + " a = 1./len(formats)\n", + " shapes = moment_shapes\n", + " marksize = 10\n", + " \n", + " def make_patch_spines_invisible(ax):\n", + " ax.set_frame_on(True)\n", + " ax.patch.set_visible(False)\n", + " for sp in ax.spines.values():\n", + " sp.set_visible(False)\n", + " \n", + " fig, ax = plt.subplots()\n", + " fig.subplots_adjust(right=1.)\n", + " ax_n = ax\n", + " for key in formats:\n", + " ax_n.plot([-10], [0], color=colors[key], label=key, linestyle=styles[key], alpha=a, linewidth=1)\n", + " for n in range(1, n_moments_use):\n", + " n_factor = 0.1 * (n - 2)\n", + " ax.scatter([-10], [0], color='k', alpha=a, marker=shapes[n], facecolors='none', s=2*marksize, label=moment_names[n])\n", + "# truth = np.swapaxes(np.array(nz_stats['truth'][n]), 0, 1)\n", + " if n>1:\n", + " ax_n = ax.twinx()\n", + " rot_ang = 270\n", + " label_space = 
15.\n", + " else:\n", + " rot_ang = 90\n", + " label_space = 0.\n", + " if n>2:\n", + " ax_n.spines[\"right\"].set_position((\"axes\", 1. + 0.1 * (n-1)))\n", + " make_patch_spines_invisible(ax_n)\n", + " ax_n.spines[\"right\"].set_visible(True)\n", + " for s in range(len(formats)):\n", + " f = formats[s]\n", + " f_factor = 0.05 * (s - 1)\n", + " data_arr = np.swapaxes(all_moments[f], 0, 1) * 100.\n", + " ax_n.plot(np.exp(in_x+n_factor), data_arr[n-1], linestyle=styles[f], marker=shapes[n], mfc='none', markersize=marksize, color=colors[f], alpha=a)\n", + " ax_n.set_ylabel(r'percent error on '+moment_names[n], rotation=rot_ang, fontsize=14, labelpad=label_space)\n", + "# ax_n.set_ylim(-1. * extremum, extremum)\n", + " ax.set_xscale('log')\n", + " ax.set_xticks(floats)\n", + " ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())\n", + " ax.set_xlim(np.exp(min(in_x)-0.25), np.exp(max(in_x)+0.25))\n", + " ax.set_xlabel('number of parameters', fontsize=14)\n", + " ax.set_title(dataset_info[dataset_key]['name']+r' data $\\hat{n}(z)$ moments', fontsize=16)\n", + " ax.legend(loc='lower left')\n", + " fig.tight_layout()\n", + " outloc = os.path.join(path, 'nz_moments'+str(n_gals_use)+dataset_key)\n", + " fig.savefig(outloc+'_final.pdf', dpi=250)\n", + " plt.close()\n", + " \n", + " for nf in range(len(floats)):\n", + " n_floats_use = floats[nf]\n", + " plt.figure()\n", + " plt.plot(z_grid, full_stack['truth'][nf], color='black', lw=3, alpha=0.3, label='original')\n", + " for key in formats:\n", + " kld = qp.utils.quick_kl_divergence(full_stack['truth'][nf], full_stack[key][nf], dx=dz)\n", + " plt.plot(z_grid, full_stack[key][nf], color=colors[key], linestyle=styles[key], label=key+r' KLD='+str(kld)[:8])#+r'; '+str(all_moments[f][nf])+' percent error')\n", + " plt.xlabel(r'$z$', fontsize=14)\n", + " plt.ylabel(r'$\\hat{n}(z)$', fontsize=14)\n", + " plt.xlim(min(z_grid), max(z_grid))\n", + " # plt.ylim(0., max(nz_max))\n", + " plt.legend()\n", + " plt.title(dataset_info[dataset_key]['name']+r' data $\\hat{n}(z)$ with $N_{f}='+str(n_floats_use)+r'$', fontsize=16)\n", + " outloc = os.path.join(path, 'global_nz'+str(n_gals_use)+dataset_key+str(n_floats_use))\n", + " plt.savefig(outloc+'.pdf', dpi=250)\n", + " plt.close()" + ] + }, { "cell_type": "markdown", "metadata": { @@ -751,34 +1564,43 @@ "dataset_info = {}\n", "delta = 0.01\n", "\n", - "dataset_keys = ['graham', 'schmidt']#['Optical+IR', 'Optical']\n", + "dataset_keys = ['mg', 'ss']\n", "\n", - "for dataset_key in dataset_keys:\n", - " dataset_info[dataset_key] = {}\n", - " if dataset_key == 'graham':\n", - " datafilename = 'bpz_euclid_test_10_2.probs'\n", + "for name in dataset_keys:\n", + " dataset_info[name] = {}\n", + " if name == 'mg':\n", + " datafilename = 'bpz_euclid_test_10_3.probs'\n", " z_low = 0.01\n", " z_high = 3.51\n", " nc_needed = 3\n", - " plotname = 'high-quality'\n", - " elif dataset_key == 'schmidt':\n", + " plotname = 'brighter'\n", + " skip_rows = 1\n", + " skip_cols = 1\n", + " legloc_p = 'upper right'\n", + " legloc_n = 'upper left'\n", + " elif name == 'ss':\n", " datafilename = 'test_magscat_trainingfile_probs.out'\n", " z_low = 0.005\n", " z_high = 2.11\n", " nc_needed = 5\n", - " plotname = 'low-quality'\n", - " dataset_info[dataset_key]['filename'] = datafilename \n", + " plotname = 'fainter'\n", + " skip_rows = 1\n", + " skip_cols = 1\n", + " legloc_p = 'lower left'\n", + " legloc_n = 'lower right'\n", + " dataset_info[name]['filename'] = datafilename \n", " \n", - " dataset_info[dataset_key]['z_lim'] = 
(z_low, z_high)\n", + " dataset_info[name]['z_lim'] = (z_low, z_high)\n", " z_grid = np.arange(z_low, z_high, delta, dtype='float')#np.arange(z_low, z_high + delta, delta, dtype='float')\n", " z_range = z_high - z_low\n", " delta_z = z_range / len(z_grid)\n", - " dataset_info[dataset_key]['z_grid'] = z_grid\n", - " dataset_info[dataset_key]['delta_z'] = delta_z\n", + " dataset_info[name]['z_grid'] = z_grid\n", + " dataset_info[name]['delta_z'] = delta_z\n", "\n", - " dataset_info[dataset_key]['N_GMM'] = nc_needed\n", - " \n", - " dataset_info[dataset_key]['name'] = plotname" + " dataset_info[name]['N_GMM'] = nc_needed# will be overwritten later\n", + " dataset_info[name]['name'] = plotname\n", + " dataset_info[name]['legloc_p'] = legloc_p\n", + " dataset_info[name]['legloc_n'] = legloc_n" ] }, { @@ -789,27 +1611,26 @@ }, "outputs": [], "source": [ + "formats = ['quantiles', 'histogram', 'samples']\n", + "formats_plus = list(formats)\n", + "formats_plus.append('truth')\n", + "\n", "high_res = 300\n", - "n_plot = 5\n", - "n_moments_use = 4\n", - "moment_names = ['integral', 'mean', 'variance', 'kurtosis']\n", "\n", - "#make this a more clever structure, i.e. a dict\n", - "formats = ['quantiles', 'histogram', 'samples']\n", - "colors = {'quantiles': 'blueviolet', 'histogram': 'darkorange', 'samples': 'forestgreen'}\n", - "styles = {'quantiles': '--', 'histogram': ':', 'samples': '-.'}\n", - "stepstyles = {'quantiles': 'dashed', 'histogram': 'dotted', 'samples': 'dashdot'}\n", + "color_cycle = np.array([(230, 159, 0), (86, 180, 233), (0, 158, 115), (240, 228, 66), (0, 114, 178), (213, 94, 0), (204, 121, 167)])/256.\n", + "color_cycle_names = ['Orange', 'Sky blue', 'Bluish green', 'Yellow', 'Blue', 'Vermilion', 'Reddish purple']\n", + "n_plot = len(color_cycle)\n", "\n", - "pz_max = [1.]\n", - "nz_max = [1.]\n", - "hist_max = [1.]\n", - "dist_min = [0.]\n", - "dist_max = [0.]\n", - "moment_max = [[]] * (n_moments_use - 1)\n", - "mean_max = -10.*np.ones(n_moments_use + 1)\n", - "mean_min = 10.*np.ones(n_moments_use + 1)\n", - "kld_min = [1.]\n", - "kld_max = [1.]" + "n_moments_use = 4\n", + "moment_names = ['integral', 'mean', 'variance', 'kurtosis']\n", + "moment_shapes = ['o', '*', 'P', 'X']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For debugging, specify the randomly selected PDFs." 
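A minimal sketch of one way to make the "randomly selected PDFs" reproducible for debugging, as the note above suggests: fix NumPy's seed before drawing the indices, or hard-code the index list as the commented-out alternative in the following cell appears to do. The seed value and the placeholder settings are arbitrary; sizes, names, and n_plot simply mirror values defined earlier in this notebook.

    import numpy as np

    np.random.seed(42)   # arbitrary fixed seed: np.random.choice now repeats across runs
    sizes, names, n_plot = [100], ['mg', 'ss'], 7   # placeholders mirroring this notebook's settings
    all_randos = [[np.random.choice(size, n_plot, replace=False)
                   for size in sizes] for name in names]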
] }, { @@ -823,11 +1644,12 @@ "#change all for NERSC\n", "\n", "floats = [3, 10, 30, 100]\n", - "sizes = [100]#[100, 1000, 10000]\n", - "names = dataset_info.keys()#['Optical', 'Optical+IR']\n", + "sizes = [100]#[10, 100, 1000]\n", + "names = dataset_info.keys()\n", "instantiations = range(0, 10)\n", "\n", - "all_randos = [[np.random.choice(size, n_plot, replace=False) for size in sizes] for name in names]" + "all_randos = [[np.random.choice(size, n_plot, replace=False) for size in sizes] for name in names]\n", + "# all_randos = [[np.random.choice(indices, n_plot, replace=False) for size in sizes] for name in names]" ] }, { @@ -843,82 +1665,87 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "# the \"pipeline\"\n", - "\n", - "for n in range(len(names)):\n", - " name = names[n]\n", + "# # the \"pipeline\"\n", + "# global_start = timeit.default_timer()\n", + "# for n in range(len(names)):\n", + "# name = names[n]\n", " \n", - " dataset_start = timeit.default_timer()\n", - " print('started '+name)\n", + "# dataset_start = timeit.default_timer()\n", + "# print('started '+name)\n", " \n", - " pdfs = setup_dataset(name)\n", + "# pdfs = setup_dataset(name, skip_rows, skip_cols)\n", " \n", - " for s in range(len(sizes)):\n", - " size=sizes[s]\n", + "# for s in range(len(sizes)):\n", + "# size = sizes[s]\n", " \n", - " size_start = timeit.default_timer()\n", - " print('started '+str(size)+name)\n", + "# size_start = timeit.default_timer()\n", + "# print('started '+name+str(size))\n", " \n", - " path = os.path.join(name, str(size))\n", - " if not os.path.exists(path):\n", - " os.makedirs(path)\n", + "# path = os.path.join(name, str(size))\n", + "# if not os.path.exists(path):\n", + "# os.makedirs(path)\n", " \n", - " n_gals_use = size\n", + "# n_gals_use = size\n", " \n", - " randos = all_randos[n][s]#np.random.choice(size, n_plot, replace=False)\n", + "# randos = all_randos[n][s]\n", " \n", - " for i in instantiations:\n", + "# for i in instantiations:\n", + "# # top_bonusdict = {}\n", + "# i_start = timeit.default_timer()\n", + "# print('started '+name+str(size)+' #'+str(i))\n", " \n", - " original = '_original_('+str(i)+')'\n", - " pdfs_use = make_instantiation(name, size, pdfs, bonus=original)\n", - "# plot = plot_examples(size, name, bonus=original)\n", + "# original = '_original'+str(i)\n", + "# pdfs_use = make_instantiation(name, size, pdfs, bonus=original)\n", + "# # plot = plot_examples(size, name, bonus=original)\n", + "# # top_bonusdict[original] = ['-', 0.25]\n", " \n", - " z_grid = dataset_info[name]['in_z_grid']\n", - " N_comps = dataset_info[name]['N_GMM']\n", + "# z_grid = dataset_info[name]['in_z_grid']\n", + "# N_comps = dataset_info[name]['N_GMM']\n", " \n", - " postfit = '_post-fit_('+str(i)+')'\n", - " catalog = setup_from_grid(name, pdfs_use, z_grid, N_comps, high_res=high_res, bonus=postfit)\n", - "# plot = plot_examples(size, name, bonus=postfit)\n", + "# postfit = '_postfit'+str(i)\n", + "# catalog = setup_from_grid(name, pdfs_use, z_grid, N_comps, high_res=high_res, bonus=postfit)\n", + "# # plot = plot_examples(size, name, bonus=postfit)\n", + "# # top_bonusdict[postfit] = ['-', 0.5]\n", " \n", - " for n_floats_use in floats:\n", - " \n", - " float_start = timeit.default_timer()\n", - " print('started '+str(size)+name+str(n_floats_use)+'\\#'+str(i))\n", + "# for n_floats_use in floats:\n", + "# # bonusdict = top_bonusdict.copy()\n", + "# float_start = timeit.default_timer()\n", + "# 
print('started '+name+str(size)+' #'+str(i)+' with '+str(n_floats_use))\n", " \n", - " (ensembles, pz_klds, metric_moments) = analyze_individual(catalog, \n", - " z_grid,#dataset_info[name]['metric_z_grid'], \n", - " n_floats_use, name, n_moments_use, i=i)\n", - "# for f in formats:\n", - "# fname = '_'+str(n_floats_use)+f+'_('+str(i)+')'\n", - "# plot = plot_examples(size, name, bonus=fname)\n", - "# plot = plot_individual(size, name, n_floats_use, i=i)\n", - " save_pz_metrics(name, size, n_floats_use, metric_moments)\n", + "# ensembles = analyze_individual(catalog, z_grid, n_floats_use, name, n_moments_use, i=i, bonus=postfit)\n", + " \n", + "# # for f in formats:\n", + "# # fname = str(n_floats_use)+f+str(i)\n", + "# # plot = plot_examples(size, name, bonus=fname)\n", + "# # bonusdict[fname] = [styles[f], 0.5]\n", + "# # plot = plot_all_examples(name, size, n_floats_use, i, bonus=bonusdict)\n", + "# # plot = plot_individual_kld(size, name, n_floats_use, i=i)\n", " \n", - " (stack_evals, nz_klds) = analyze_stacked(catalog, ensembles, z_grid,#dataset_info[name]['metric_z_grid'], \n", - " n_floats_use, name, i=i)\n", - "# plot = plot_estimators(size, name, n_floats_use, i=i)\n", - " save_nz_metrics(name, size, n_floats_use, nz_klds)\n", + "# stack_evals = analyze_stacked(catalog, ensembles, z_grid, n_floats_use, name, i=i)\n", + "# # plot = plot_estimators(size, name, n_floats_use, i=i)\n", " \n", - " print('finished '+str(size)+name+str(n_floats_use)+' in '+str(timeit.default_timer() - float_start))\n", - " \n", - "# plot = plot_pz_metrics(name, size)\n", - " \n", - "# plot = plot_nz_metrics(name, size)\n", + "# print('FINISHED '+name+str(size)+' #'+str(i)+' with '+str(n_floats_use)+' in '+str(timeit.default_timer() - float_start))\n", + "# print('FINISHED '+name+str(size)+' #'+str(i)+' in '+str(timeit.default_timer() - i_start))\n", + "# # plot = plot_pz_metrics(name, size)\n", + "# # plot = plot_pz_delta_moments(name, size) \n", + "# # plot = plot_nz_klds(name, size)\n", + "# # plot = plot_nz_moments(name, size)\n", " \n", - " print('finished '+str(size)+name+' in '+str(timeit.default_timer() - size_start))\n", + "# print('FINISHED '+name+str(size)+' in '+str(timeit.default_timer() - size_start))\n", " \n", - " print('finished '+name+' in '+str(timeit.default_timer() - dataset_start))" + "# print('FINISHED '+name+' in '+str(timeit.default_timer() - dataset_start))\n", + "# print('FINISHED everything in '+str(timeit.default_timer() - global_start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Remake the plots to share axes." + "Remake the plots to share axes, enabling combination of runs." ] }, { @@ -929,29 +1756,101 @@ }, "outputs": [], "source": [ - "#comment out for NERSC\n", + "floats = [3, 10, 30, 100]\n", + "sizes = [100]#[10, 100, 1000]\n", + "names = dataset_info.keys()\n", + "instantiations = range(0, 10)\n", + "\n", + "all_randos = [[np.random.choice(size, n_plot, replace=False) \n", + " for size in sizes] for name in names]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#make this a more clever structure, i.e. 
a dict\n", + "colors = {'quantiles': 'darkviolet', 'histogram': 'darkorange', 'samples': 'g'}\n", + "styles = {'quantiles': '--', 'histogram': ':', 'samples': '-.'}\n", + "stepstyles = {'quantiles': 'dashed', 'histogram': 'dotted', 'samples': 'dashdot'}\n", + "\n", + "colors_plus = colors.copy()\n", + "colors_plus['truth'] = 'black'\n", + "styles_plus = styles.copy()\n", + "styles_plus['truth'] = '-'\n", + "\n", + "iqr_min = [3.5]\n", + "iqr_max = [delta]\n", + "modes_max = [0]\n", + "pz_max = [1.]\n", + "nz_max = [1.]\n", + "hist_max = [1.]\n", + "dist_min = [0.]\n", + "dist_max = [0.]\n", + "pz_mean_max = -10.*np.ones(n_moments_use)\n", + "pz_mean_min = 10.*np.ones(n_moments_use)\n", + "kld_min = [1.]\n", + "kld_max = [1.]\n", + "indie_delta_kld_min = [1.]\n", + "indie_delta_kld_max = [-1.]\n", + "nz_mean_max = -10.*np.ones(n_moments_use)\n", + "nz_mean_min = 10.*np.ones(n_moments_use)\n", + "n_delta_max = -10.*np.ones(n_moments_use)\n", + "n_delta_min = 10.*np.ones(n_moments_use)\n", + "\n", + "norm = False#true for shared axes on individual instantiation plots, otherwise false\n", + "\n", + "moments_to_save = ['pz_kld_moments', 'pz_moments', 'pz_moment_deltas', 'nz_moments']\n", + "metrics_to_save = ['nz_klds']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [], + "source": [ + "# comment out for NERSC\n", + "# set norm to True and run twice to match axis limits\n", "\n", "for name in names:\n", " for size in sizes:\n", - " path = os.path.join(name, str(size))\n", + "# for stat_name in moments_to_save + metrics_to_save:\n", + "# clear_stats(name, size, stat_name)\n", " for i in instantiations:\n", - " \n", - " plot = plot_examples(size, name, bonus='_original_('+str(i)+')')\n", - " \n", - " plot = plot_examples(size, name, bonus='_post-fit_('+str(i)+')')\n", - " \n", - " for n_floats_use in floats:\n", - " \n", - " for f in formats:\n", - " fname = '_'+str(n_floats_use)+f+'_('+str(i)+')'\n", - " plot = plot_examples(size, name, bonus=fname)\n", - " plot = plot_individual(size, name, n_floats_use, i)\n", - " \n", - " plot = plot_estimators(size, name, n_floats_use, i)\n", - " \n", - " plot = plot_pz_metrics(name, size)\n", - " \n", - " plot = plot_nz_metrics(name, size)" + " top_bonusdict = {}\n", + " bo = '_original'+str(i)\n", + "# plot = plot_examples(size, name, bonus=bo, norm=norm)\n", + " top_bonusdict[bo] = ['-', 0.25]\n", + " bp = '_postfit'+str(i)\n", + "# plot = plot_examples(size, name, bonus=bp, norm=norm)\n", + " top_bonusdict[bp] = ['-', 0.5]\n", + " for n in range(len(floats)):\n", + "# bonusdict = top_bonusdict.copy()\n", + "# n_floats_use = floats[n]\n", + "# for f in formats:\n", + "# fname = str(n_floats_use)+f+str(i)\n", + "# plot = plot_examples(size, name, bonus=fname, norm=norm)\n", + "# bonusdict[fname] = [styles[f], 0.5]\n", + " plot = plot_all_examples(name, size, n_floats_use, i, bonus=bonusdict)\n", + "# plot = plot_individual_kld(size, name, n_floats_use, i)\n", + "# plot = plot_estimators(size, name, n_floats_use, i)\n", + "# for stat_name in moments_to_save:\n", + "# save_moments_wrapper(name, size, n_floats_use, i, stat_name)\n", + "# for stat_name in metrics_to_save:\n", + "# save_metrics_wrapper(name, size, n_floats_use, i, stat_name)\n", + "# plot = plot_kld_stats(name, size)\n", + "# plot = plot_pz_metrics(name, size)\n", + "# plot = plot_pz_delta_moments(name, size)\n", + "# plot = plot_nz_klds(name, size)\n", + "# plot = plot_nz_moments(name, size)" ] }, { @@ 
-968,7 +1867,248 @@ "collapsed": true }, "outputs": [], - "source": [] + "source": [ + "# def just_modality(dataset_key, n_gals_use, bonus=None):\n", + "# import scipy.signal\n", + "# path = os.path.join(dataset_key, str(n_gals_use))\n", + "# loc = os.path.join(path, 'pzs'+str(n_gals_use)+dataset_key+bonus)\n", + "# with open(loc+'.hkl', 'r') as filename:\n", + "# info = hickle.load(filename)\n", + "# pdfs_use = info['pdfs']\n", + "# modality, iqrs = [], []\n", + "# dpdfs = pdfs_use[:,1:] - pdfs_use[:,:-1]\n", + "# ddpdfs = dpdfs[:, 1:] - dpdfs[:, :-1]\n", + "# for i in range(n_gals_use):\n", + "# modality.append(len(scipy.signal.argrelmax(pdfs_use[i])[0]))#(len(np.where(np.signbit(ddpdfs[i]))[0]))\n", + "# cdf = np.cumsum(qp.utils.normalize_integral((dataset_info[dataset_key]['z_grid'], pdfs_use[i]), vb=False)[1])\n", + "# iqr_lo = dataset_info[dataset_key]['z_grid'][bisect.bisect_left(cdf, 0.25)]\n", + "# iqr_hi = dataset_info[dataset_key]['z_grid'][bisect.bisect_left(cdf, 0.75)]\n", + "# iqrs.append(iqr_hi - iqr_lo)\n", + "# # modality = np.array(modality)\n", + "# # iqrs = np.array(iqrs)\n", + "\n", + "# # loc = os.path.join(path, 'modality'+str(n_gals_use)+dataset_key+bonus)\n", + "# # with open(loc+'.hkl', 'w') as filename:\n", + "# # info = {}\n", + "# # info['modes'] = modality\n", + "# # info['iqrs'] = iqrs\n", + "# # hickle.dump(info, filename)\n", + "# return(modality, iqrs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# all_modes, all_iqrs = {}, {}\n", + "# for name in names:\n", + "# all_modes[name], all_iqrs[name] = {}, {}\n", + "# for size in sizes:\n", + "# all_modes[name][str(size)], all_iqrs[name][str(size)] = [], []\n", + "# for i in instantiations:\n", + "# # print_nz_moments(name, size)\n", + "# original = '_original'+str(i)\n", + "# (modality, iqrs) = just_modality(name, size, bonus=original)\n", + "# all_modes[name][str(size)].append(modality)\n", + "# all_iqrs[name][str(size)].append(iqrs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# for name in names:\n", + "# for size in sizes:\n", + "# modality = np.array(all_modes[name][str(size)]).flatten()\n", + "# modality_cdf = []\n", + "# modegrid = range(np.max(modality))\n", + "# for x in modegrid:\n", + "# modality_cdf.append(len(modality[modality==x]))\n", + "# plt.hist(modality, normed=True)\n", + "# plt.title(name+str(size)+'modality'+str(np.median(modality)))\n", + "# plt.show()\n", + "# plt.close()\n", + "# print(zip(modegrid, modality_cdf))\n", + "# # iqrdist = np.array(all_iqrs[name][str(size)]).flatten()\n", + "# # plt.title(name+str(size)+'iqrdist'+str(np.median(iqrdist)))\n", + "# # plt.hist(iqrdist, normed=True)\n", + "# # plt.show()\n", + "# # plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# thing = load_one_stat('ss', 100, 3, 0, 'pz_moment_deltas')\n", + "# print(np.mean(np.shinape(thing['quantiles']), axis=0))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# save_moments('ss', 100, 3, thing, 'pz_moment_deltas')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# path = os.path.join('ss', str(100))\n", + "# loc = os.path.join(path, 
'pz_moment_deltas'+str(100)+'ss')\n", + "# with open(loc+'.hkl', 'r') as pz_file:\n", + "# pz_stats = hickle.load(pz_file)\n", + " \n", + "# print(np.shape(pz_stats['quantiles'][0]))#N_f * n_m * n_i * n_g" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# modified = np.array(pz_stats['quantiles']).reshape(4, 4, 1000)*100.\n", + "# print(np.shape(modified))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# print(np.shape(np.array(pz_stats[f][0]).reshape(4, 1000)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# more_modified = modified * 100.\n", + "# mean = np.mean(more_modified, axis=-1)\n", + "# print(mean)\n", + "# std = np.std(more_modified, axis=-1)\n", + "# print(std)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# # print(np.shape(modified))\n", + "# # plt.hist(modified[0][3])\n", + "# weird_x = np.log(np.array(floats))\n", + "\n", + "# moment_num = 3\n", + "# for s in range(3):\n", + "# f = formats[s]\n", + "# const = 0.1\n", + "# f_factor = const * (s - 1)\n", + "# new_data = np.array(pz_stats[f][moment_num]).reshape(4, 1000)*100.\n", + "# plt.plot(np.exp(weird_x+f_factor), np.median(new_data, axis=-1), linestyle=styles[f], marker=moment_shapes[moment_num], mfc='none', markersize=5, alpha=0.5, color=colors[f])\n", + "# violin = plt.violinplot(list(new_data), np.exp(weird_x+f_factor), showextrema=False, showmeans=False, showmedians=False, widths=np.exp(weird_x+const/2.)-np.exp(weird_x))\n", + "# # for partname in ['cmedians']:\n", + "# # vp = violin[partname]\n", + "# # vp.set_edgecolor(colors[f])\n", + "# # vp.set_linewidth(3)\n", + "# # Make the violin body blue with a red border:\n", + "# for vp in violin['bodies']:\n", + "# vp.set_facecolor(coplors[f])\n", + "# # vp.set_edgecolor('k')\n", + "# # vp.set_linewidth(0)\n", + "# vp.set_alpha(0.5)\n", + "# plt.semilogx()\n", + "# plt.ylim(-50., 50.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# print(np.shape(new_data))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# plt.boxplot(list(new_data), floats, '')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# print(np.shape(pz_stats['quantiles'][0][0]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# print(violin.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# help(plt.boxplot)" + ] }, { "cell_type": "code", @@ -989,14 +2129,14 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.13" + "pygments_lexer": "ipython3", + "version": "3.6.3" } }, "nbformat": 4, diff --git a/docs/desc-0000-qp-photo-z_approximation/research/data_exploration.ipynb 
b/docs/desc-0000-qp-photo-z_approximation/research/data_exploration.ipynb index a9be1d20..1599f049 100644 --- a/docs/desc-0000-qp-photo-z_approximation/research/data_exploration.ipynb +++ b/docs/desc-0000-qp-photo-z_approximation/research/data_exploration.ipynb @@ -85,8 +85,8 @@ "outputs": [], "source": [ "# choose one of these:\n", - "dataset_key = 'Euclid'# Melissa Graham's data\n", - "# dataset_key = 'LSST'# Sam Schmidt's data\n", + "# dataset_key = 'Euclid'# Melissa Graham's data\n", + "dataset_key = 'LSST'# Sam Schmidt's data\n", "dataname = dataset_key\n", "\n", "dataset_info = {}\n", @@ -192,24 +192,24 @@ }, "outputs": [], "source": [ - "colors = ['red','green','blue','cyan','magenta','yellow']\n", - "n_plot = len(colors)\n", - "\n", - "# if dataset_key == 'mg':\n", - "# indices = [1, 3, 14, 16, 19, 21]\n", - "# elif dataset_key == 'ss':\n", - "n_gals_tot = len(pdfs)\n", - "full_gal_range = range(n_gals_tot)\n", - "indices = np.random.choice(full_gal_range, n_plot)\n", - "\n", - "for i in range(n_plot):\n", - " plt.plot(dataset_info[dataset_key]['z_grid'], pdfs[indices[i]], \n", - " color=colors[i], label=dataset_key+'#'+str(indices[i]))\n", - "plt.xlabel(r'$z$', fontsize=16)\n", - "plt.ylabel(r'$p(z)$', fontsize=16)\n", - "plt.title(dataset_key+' mock catalog')\n", - "plt.legend()\n", - "plt.savefig('pz_placeholder_'+dataset_key+'.png', dpi=250)" + "# colors = ['red','green','blue','cyan','magenta','yellow']\n", + "# n_plot = len(colors)\n", + "\n", + "# # if dataset_key == 'mg':\n", + "# # indices = [1, 3, 14, 16, 19, 21]\n", + "# # elif dataset_key == 'ss':\n", + "# n_gals_tot = len(pdfs)\n", + "# full_gal_range = range(n_gals_tot)\n", + "# indices = np.random.choice(full_gal_range, n_plot)\n", + "\n", + "# for i in range(n_plot):\n", + "# plt.plot(dataset_info[dataset_key]['z_grid'], pdfs[indices[i]], \n", + "# color=colors[i], label=dataset_key+' #'+str(indices[i]))\n", + "# plt.xlabel(r'$z$', fontsize=16)\n", + "# plt.ylabel(r'$p(z)$', fontsize=16)\n", + "# plt.title(dataset_key+' mock catalog')\n", + "# plt.legend()\n", + "# plt.savefig('pz_placeholder_'+dataset_key+'.pdf', dpi=250)" ] }, { @@ -246,8 +246,10 @@ "if dataset_key == 'Euclid':\n", " chosen = 5390\n", "elif dataset_key == 'LSST':\n", - " chosen = 108019\n", - "\n", + "# chosen = 108019 \n", + " indices = [ 12543, 52661, 46216, 53296, 95524, 84574 , 2607 ,56017 , 64794, 7600]\n", + " chosen = indices[9]\n", + " \n", "start_time = timeit.default_timer()\n", "G = qp.PDF(gridded=(dataset_info[dataset_key]['z_grid'], pdfs[chosen]))\n", "print(timeit.default_timer() - start_time)\n", @@ -265,13 +267,13 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ "if dataset_key == 'Euclid':\n", " nc_needed = 3\n", - "elif datanset_key == 'LSST':\n", + "elif dataset_key == 'LSST':\n", " nc_needed = 5\n", " \n", "dataset_info[dataset_key]['N_GMM'] = nc_needed" @@ -312,7 +314,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "scrolled": true }, "outputs": [], "source": [ @@ -332,7 +335,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `qp.composite` object can be used as the `qp.PDF.truth` to initialize a new `qp.PDF` object that doesn't have any information about the gridded or sample approximations but has a qualitatively similar shape and is thus \"realistically complex\" enough to draw conclusions about real data. Now we can approximate it any way we like! 
Consider this example for $N_f=20$ parameters." + "The `qp.composite` object can be used as the `qp.PDF.truth` to initialize a new `qp.PDF` object that doesn't have any information about the gridded or sample approximations but has a qualitatively similar shape and is thus \"realistically complex\" enough to draw conclusions about real data. Now we can approximate it any way we like! Consider this example for $N_f=7$ parameters." ] }, { @@ -340,16 +343,16 @@ "execution_count": null, "metadata": { "collapsed": false, - "scrolled": false + "scrolled": true }, "outputs": [], "source": [ - "N_f = 20\n", + "N_f = 7\n", "M = qp.PDF(truth=G.mix_mod, limits=dataset_info[dataset_key]['z_lim'])\n", "M.quantize(N=N_f, vb=False)\n", "M.histogramize(N=N_f, binrange=dataset_info[dataset_key]['z_lim'], vb=False)\n", "M.sample(N=N_f, using='truth', vb=False)\n", - "M.plot(loc=dataset_key+'_example_pz.png', vb=True)" + "M.plot(loc=dataset_key+'_example_pz.pdf', vb=True)" ] }, { @@ -407,7 +410,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -424,7 +427,7 @@ "\n", "if dataset_key == 'Euclid':\n", " dataset_info[dataset_key]['N_GMM'] = 3\n", - "elif datanset_key == 'LSST':\n", + "elif dataset_key == 'LSST':\n", " dataset_info[dataset_key]['N_GMM'] = 5\n", "fit_components = dataset_info[dataset_key]['N_GMM']\n", "\n", @@ -485,7 +488,7 @@ " N_pdfs =len(pdfs)\n", " randos = np.random.choice(range(N_pdfs), n_plot)\n", " for i in range(n_plot):\n", - " plt.plot(z_grid, pdfs[randos[i]], label=dataset_key+'#'+str(randos[i]))\n", + " plt.plot(z_grid, pdfs[randos[i]], label=dataset_key+r'\\#'+str(randos[i]))\n", " plt.xlabel(r'$z$', fontsize=16)\n", " plt.ylabel(r'$p(z)$', fontsize=16)\n", " plt.title(dataset_key+' mock catalog')\n", @@ -1657,14 +1660,14 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.13" + "pygments_lexer": "ipython3", + "version": "3.6.3" } }, "nbformat": 4, diff --git a/docs/notebooks/demo.ipynb b/docs/notebooks/demo.ipynb index b7f50d26..049b685d 100644 --- a/docs/notebooks/demo.ipynb +++ b/docs/notebooks/demo.ipynb @@ -170,7 +170,7 @@ }, "outputs": [], "source": [ - "print P.approximate([0.314], using='quantiles')" + "print P.approximate(np.array([0.314]), using='quantiles')" ] }, { @@ -227,7 +227,7 @@ "outputs": [], "source": [ "print P.scheme\n", - "print P.approximate([0.314], using='quantiles', scheme='nearest')\n", + "print P.approximate(np.array([0.314]), using='quantiles', scheme='nearest')\n", "print P.scheme" ] }, @@ -373,23 +373,29 @@ }, "outputs": [], "source": [ - "qD1 = qp.utils.calculate_kl_divergence(P, Q, limits=(-5.,5.), vb=True)\n", + "qD1 = qp.utils.calculate_kl_divergence(P, Q, limits=(-1.,1.), vb=True)\n", "qD2 = qp.utils.calculate_kl_divergence(P, Q, limits=(-2.,2.), vb=True)\n", "qD3 = qp.utils.calculate_kl_divergence(P, Q, limits=(-3.,3.), vb=True)\n", - "\n", - "hD1 = qp.utils.calculate_kl_divergence(P, H, limits=(-5.,5.), vb=True)\n", + "print 'Quantile approximation: KLD over 1,2,3 sigma ranges = ', qD1, qD2, qD3\n", + "hD1 = qp.utils.calculate_kl_divergence(P, H, limits=(-1.,1.), vb=True)\n", "hD2 = qp.utils.calculate_kl_divergence(P, H, limits=(-2.,2.), vb=True)\n", "hD3 = qp.utils.calculate_kl_divergence(P, H, limits=(-3.,3.), vb=True)\n", - "\n", - "sD1 = 
qp.utils.calculate_kl_divergence(P, S, limits=(-5.,5.), vb=True)\n", + "print 'Histogram approximation: KLD over 1,2,3 sigma ranges = ', hD1, hD2, hD3\n", + "sD1 = qp.utils.calculate_kl_divergence(P, S, limits=(-1.,1.), vb=True)\n", "sD2 = qp.utils.calculate_kl_divergence(P, S, limits=(-2.,2.), vb=True)\n", "sD3 = qp.utils.calculate_kl_divergence(P, S, limits=(-3.,3.), vb=True)\n", - "\n", - "print 'Quantile approximation: KLD over 1,2,3 sigma ranges = ', qD1, qD2, qD3\n", - "print 'Histogram approximation: KLD over 1,2,3 sigma ranges = ', hD1, hD2, hD3\n", "print 'Sampled approximation: KLD over 1,2,3 sigma ranges = ', sD1, sD2, sD3" ] }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "Holy smokes, does the quantile approximation blow everything else out of the water, thanks to using spline interpolation." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -563,7 +569,7 @@ "outputs": [], "source": [ "Cs = qp.PDF(truth=C_dist, limits = composite_lims)\n", - "Cs.sample(N=20, vb=False)\n", + "Cs.sample(N=20, using='truth', vb=False)\n", "Cs.plot()" ] }, @@ -609,14 +615,14 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.13" + "pygments_lexer": "ipython3", + "version": "3.6.3" } }, "nbformat": 4, diff --git a/docs/notebooks/kld.ipynb b/docs/notebooks/kld.ipynb index 49aa0908..f29cda6e 100644 --- a/docs/notebooks/kld.ipynb +++ b/docs/notebooks/kld.ipynb @@ -218,8 +218,9 @@ }, "outputs": [], "source": [ - "widths = np.logspace(-3.0,3.0,13)\n", + "widths = np.logspace(-2.0,2.0,13)\n", "D = np.empty_like(widths)\n", + "E = np.empty_like(widths)\n", "\n", "x = 0.0\n", "infinity = 1000.0\n", @@ -227,7 +228,7 @@ "for k,sigma in enumerate(widths):\n", " Q = qp.PDF(truth=sps.norm(loc=x, scale=sigma))\n", " D[k] = qp.utils.calculate_kl_divergence(P, Q, limits=(-infinity,infinity), vb=False)\n", - " \n", + " E[k] = qp.utils.calculate_rmse(P, Q, limits=(-infinity,infinity), vb=False)\n", "print zip(widths, D)" ] }, @@ -241,14 +242,38 @@ "source": [ "x = widths\n", "y = np.log(widths*(2.0/np.pi))\n", - "plt.plot(x, y, color='gray', linestyle='-', lw=8.0, alpha=0.5, label='log($2\\sigma/\\pi}$)')\n", - "\n", - "plt.plot(widths, D, color='black', linestyle='-', lw=2.0, alpha=1.0, label='Offset=0.0')\n", - "plt.xscale('log')\n", - "plt.ylim(0.0,32.0)\n", - "plt.xlabel('Width of approximating Gaussian $\\sigma$')\n", - "plt.ylabel('KL divergence (nats)')\n", - "l = plt.legend(loc='upper right')" + "\n", + "# plt.plot(widths, D, color='black', linestyle='-', lw=2.0, alpha=1.0, label='Offset=0.0')\n", + "# plt.xscale('log')\n", + "# plt.ylim(0.0,32.0)\n", + "# plt.xlabel('Width of approximating Gaussian $\\sigma$')\n", + "# plt.ylabel('KL divergence (nats)')\n", + "# l = plt.legend(loc='upper right')\n", + "# plt.show()\n", + "\n", + "# plt.plot(widths, E, color='black', linestyle='-', lw=2.0, alpha=1.0, label='Offset=0.0')\n", + "# plt.xscale('log')\n", + "# plt.xlabel('Width of approximating Gaussian $\\sigma$')\n", + "# plt.ylabel('RMSE')\n", + "# l = plt.legend(loc='upper right')\n", + "# plt.show()\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.plot(x, y, color='gray', linestyle='-', lw=8.0, alpha=0.5, label=r'$\\log[2\\sigma/\\pi\\sigma_{0}]$')\n", + "ax.set_xscale('log')\n", + "ax.set_xlabel(r'root variance ratio $\\sigma/\\sigma_{0}$')\n", + "ax.set_xlim(1e-2, 
np.max(widths))\n", + "ax.set_ylim(0, 20)\n", + "ax.plot(widths, D, color='k', linestyle='-', lw=2.0, alpha=1.0, label='KLD')\n", + "ax.set_ylabel('KLD (nats)')\n", + "l = ax.legend(loc='upper right')\n", + "axr = ax.twinx()\n", + "axr.set_ylim(0, 0.1)\n", + "axr.plot(widths, E, color='k', linestyle=':', lw=2.0, alpha=1.0, label='RMSE')\n", + "axr.set_ylabel('RMSE', rotation=270, labelpad=15)\n", + "l = axr.legend(loc= 'lower left')\n", + "fig.show()\n", + "fig.savefig('precision.pdf', dpi=250)" ] }, { @@ -279,6 +304,7 @@ "source": [ "separations = np.linspace(0.0,15.0,16)\n", "D = np.empty_like(separations)\n", + "E = np.empty_like(separations)\n", "\n", "sigma = 1.0\n", "infinity = 100.0\n", @@ -286,7 +312,7 @@ "for k,x0 in enumerate(separations):\n", " Q = qp.PDF(truth=sps.norm(loc=x0, scale=sigma))\n", " D[k] = qp.utils.calculate_kl_divergence(P, Q, limits=(-infinity,infinity), vb=False)\n", - " \n", + " E[k] = qp.utils.calculate_rmse(P, Q, limits=(-infinity,infinity), vb=False)\n", "print zip(separations, D)" ] }, @@ -298,10 +324,19 @@ }, "outputs": [], "source": [ - "plt.plot(separations, D, color='k', linestyle='-', lw=2.0, alpha=1.0, label='Width=1.0')\n", - "plt.xlabel('Separation between Gaussians')\n", - "plt.ylabel('KL divergence (nats)')\n", - "l = plt.legend(loc='upper left')" + "fig, ax = plt.subplots()\n", + "ax.set_xlabel(r'separation $|\\mu-\\mu_{0}|$')\n", + "ax.set_xlim(0, 10)\n", + "ax.set_xlim(0, np.max(D))\n", + "ax.plot(separations, D, color='k', linestyle='-', lw=2.0, alpha=1.0, label='KLD')\n", + "ax.set_ylabel('KLD (nats)')\n", + "l = ax.legend(loc='lower right')\n", + "axr = ax.twinx()\n", + "axr.plot(separations, E, color='k', linestyle=':', lw=2.0, alpha=1.0, label='RMSE')\n", + "axr.set_ylabel('RMSE', rotation=270, labelpad=15)\n", + "axr.set_xlim(0, 10)\n", + "l = axr.legend(loc='upper left')\n", + "fig.show()" ] }, { @@ -352,6 +387,7 @@ "separations = np.linspace(0.0,7.0,15)\n", "\n", "D = np.zeros([7,len(separations)])\n", + "E = np.zeros([7,len(separations)])\n", "tensions = np.empty_like(D)\n", "\n", "for j,sigma in enumerate(widths):\n", @@ -359,6 +395,7 @@ " for k,x0 in enumerate(separations):\n", " Q = qp.PDF(truth=sps.norm(loc=x0, scale=sigma))\n", " D[j,k] = qp.utils.calculate_kl_divergence(P, Q, limits=(-infinity,infinity), vb=False)\n", + " E[j,k] = qp.utils.calculate_rmse(P, Q, limits=(-infinity,infinity), vb=False)\n", " tensions[j,k] = x0 / np.sqrt(sigma*sigma + 1.0)\n", " " ] @@ -373,24 +410,47 @@ "source": [ "x = tensions[0,:]\n", "y = x**2\n", - "plt.plot(x, y, color='gray', linestyle='-', lw=8.0, alpha=0.5, label='$t^2$')\n", - "\n", - "plt.plot(tensions[0,:], D[0,:], color='black', linestyle='-', lw=2.0, alpha=1.0, label='Width=1.0')\n", - "plt.plot(tensions[1,:], D[1,:], color='violet', linestyle='-', lw=2.0, alpha=1.0, label='Width=1,5')\n", - "plt.plot(tensions[2,:], D[2,:], color='blue', linestyle='-', lw=2.0, alpha=1.0, label='Width=2.0')\n", - "plt.plot(tensions[3,:], D[3,:], color='green', linestyle='-', lw=2.0, alpha=1.0, label='Width=2.5')\n", - "plt.plot(tensions[4,:], D[4,:], color='yellow', linestyle='-', lw=2.0, alpha=1.0, label='Width=3.0')\n", - "plt.plot(tensions[5,:], D[5,:], color='orange', linestyle='-', lw=2.0, alpha=1.0, label='Width=3.5')\n", - "plt.plot(tensions[6,:], D[6,:], color='red', linestyle='-', lw=2.0, alpha=1.0, label='Width=4.0')\n", - "plt.xlabel('Tension between Gaussians, $t$ (sigma)')\n", - "plt.ylabel('KL divergence (nats)')\n", - "l = plt.legend(loc='upper left')" + "\n", + "fig, ax = 
plt.subplots()\n", + "ax.plot(x, y, color='gray', linestyle='-', lw=8.0, alpha=0.5, label='$t^2$')\n", + "ax.set_xlabel('tension $t$ (sigma)')\n", + "ax.set_xlim(0, np.max(tensions))\n", + "ax.plot([-1], [-1], color='black', linestyle='-', lw=2.0, alpha=1.0, label='KLD')\n", + "ax.plot([-1], [-1], color='black', linestyle=':', lw=2.0, alpha=1.0, label='RMSE')\n", + "colors = {'blueviolet':1.0, 'forestgreen':2.0, 'darkorange':3.0}\n", + "for item in colors.keys():\n", + " ax.scatter([0], [0], color=item, label='Width='+str(colors[item])[0]+r'$\\sigma$')\n", + "\n", + "ax.plot(tensions[0,:], D[0,:], color='blueviolet', linestyle='-', lw=2.0, alpha=1.0)#, label='Width=1.0')\n", + "# ax.plot(tensions[1,:], D[1,:], color='violet', linestyle='-', lw=2.0, alpha=1.0, label='Width=1,5')\n", + "ax.plot(tensions[2,:], D[2,:], color='forestgreen', linestyle='-', lw=2.0, alpha=1.0)#, label='Width=2.0')\n", + "# ax.plot(tensions[3,:], D[3,:], color='green', linestyle='-', lw=2.0, alpha=1.0, label='Width=2.5')\n", + "ax.plot(tensions[4,:], D[4,:], color='darkorange', linestyle='-', lw=2.0, alpha=1.0)#, label='Width=3.0')\n", + "# ax.plot(tensions[5,:], D[5,:], color='orange', linestyle='-', lw=2.0, alpha=1.0, label='Width=3.5')\n", + "# ax.plot(tensions[6,:], D[6,:], color='forestgreen', linestyle='-', lw=2.0, alpha=1.0, label='Width=4.0')\n", + "ax.set_ylabel('KLD (nats)')\n", + "l = ax.legend(loc='lower right')\n", + "\n", + "axr = ax.twinx()\n", + "axr.plot(tensions[0,:], E[0,:], color='blueviolet', linestyle=':', lw=2.0, alpha=1.0)#, label='Width=1.0')\n", + "# axr.plot(tensions[1,:], E[1,:], color='violet', linestyle=':', lw=2.0, alpha=1.0, label='Width=1,5')\n", + "axr.plot(tensions[2,:], E[2,:], color='forestgreen', linestyle=':', lw=2.0, alpha=1.0)#, label='Width=2.0')\n", + "# axr.plot(tensions[3,:], E[3,:], color='green', linestyle=':', lw=2.0, alpha=1.0, label='Width=2.5')\n", + "axr.plot(tensions[4,:], E[4,:], color='darkorange', linestyle=':', lw=2.0, alpha=1.0)#, label='Width=3.0')\n", + "# axr.plot(tensions[5,:], E[5,:], color='orange', linestyle=':', lw=2.0, alpha=1.0, label='Width=3.5')\n", + "# axr.plot(tensions[6,:], E[6,:], color='forestgreen', linestyle=':', lw=2.0, alpha=1.0, label='Width=4.0')\n", + "axr.set_ylabel('RMSE', rotation=270, labelpad=15)\n", + "axr.set_xlim(0, np.max(tensions))\n", + "\n", + "fig.show()\n", + "fig.savefig('tension.pdf', dpi=250)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "\n", "## Conclusions\n", "\n", "To summarize, the KL divergence $D$ is an appropriate metric of an approximation to a probability distribution, expressing the loss of information of the approximation from the true distribution. The simple numerical experiments in this notebook suggest the following approximate extrapolations and hypotheses. 
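For concreteness, the two metrics being summarized here are, in their standard forms (a sketch only; the exact normalization used by qp.utils.calculate_rmse should be taken from its implementation rather than from this note):

    D_{\mathrm{KL}}(P \,\|\, Q) = \int P(x)\,\log\!\left[\frac{P(x)}{Q(x)}\right]\mathrm{d}x
    \qquad
    \mathrm{RMSE}(P, Q) = \sqrt{\frac{1}{N}\sum_{i=1}^{N}\bigl[P(x_{i}) - Q(x_{i})\bigr]^{2}}

where the RMSE sum runs over the N points of a shared evaluation grid and the KLD is in nats when the logarithm is natural, as quoted in this notebook.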
\n", diff --git a/qp/composite.py b/qp/composite.py index 383ec637..85bfbd13 100644 --- a/qp/composite.py +++ b/qp/composite.py @@ -82,16 +82,14 @@ def rvs(self, size): xs: numpy.ndarray, float samples from the PDF """ - groups = [0]*self.n_components - for item in range(size): - groups[qp.utils.choice(self.component_range, self.coefficients)] += 1 - samples = [] * size - for c in self.component_range: - for n in range(groups[c]): - samples.append(self.functions[c].rvs()) - return np.array(samples) - - def ppf(self, cdfs, ivals=None, vb=True): + groups = np.random.choice(self.component_range, size, p=self.coefficients) + u, counts = np.unique(groups, return_counts=True) + samples = np.empty(0) + for i in range(len(u)): + samples = np.append(samples, self.functions[u[i]].rvs(counts[i])) + return np.array(samples).flatten() + + def ppf(self, cdfs, ivals=None): """ Evaluates the composite PPF at locations @@ -101,8 +99,6 @@ def ppf(self, cdfs, ivals=None, vb=True): value(s) at which to find quantiles ivals: float or numpy.ndarray, float initial guesses for quantiles - vb: boolean - print progress to stdout? Returns ------- diff --git a/qp/ensemble.py b/qp/ensemble.py index 877aa934..2846c5a6 100644 --- a/qp/ensemble.py +++ b/qp/ensemble.py @@ -1,8 +1,10 @@ import numpy as np +import pathos from pathos.multiprocessing import ProcessingPool as Pool -import psutil +# import psutil import timeit import os +import sys # import sqlalchemy import scipy.interpolate as spi import matplotlib.pyplot as plt @@ -66,7 +68,7 @@ def __init__(self, N, truth=None, quantiles=None, histogram=None, gridded=None, if procs is not None: self.n_procs = procs else: - self.n_procs = psutil.cpu_count() + self.n_procs = pathos.helpers.cpu_count() self.pool = Pool(self.n_procs) print('made the pool of '+str(self.n_procs)+' in '+str(timeit.default_timer() - start_time)) @@ -99,9 +101,9 @@ def __init__(self, N, truth=None, quantiles=None, histogram=None, gridded=None, else: self.histogram = [(histogram[0], histogram[1][i]) for i in self.pdf_range] if gridded is None: - self.gridded = [None] * N + self.gridded = (None, [None] * N) else: - self.gridded = [(gridded[0], gridded[1][i]) for i in self.pdf_range] + self.gridded = (None, [(gridded[0], gridded[1][i]) for i in self.pdf_range]) self.mix_mod = None self.evaluated = None @@ -110,7 +112,7 @@ def __init__(self, N, truth=None, quantiles=None, histogram=None, gridded=None, self.make_pdfs() self.stacked = {} - + self.klds = {} def make_pdfs(self): """ @@ -121,7 +123,7 @@ def make_pdfs_helper(i): # logfile.write('making pdf '+str(i)+'\n') return qp.PDF(truth=self.truth[i], quantiles=self.quantiles[i], histogram=self.histogram[i], - gridded=self.gridded[i], samples=self.samples[i], limits=self.limits, + gridded=self.gridded[-1][i], samples=self.samples[i], limits=self.limits, scheme=self.scheme, vb=False) start_time = timeit.default_timer() @@ -155,9 +157,12 @@ def sample(self, samps=100, infty=default_infty, using=None, vb=True): TODO: change syntax samps --> N """ def sample_helper(i): + try: # with open(self.logfilename, 'wb') as logfile: # logfile.write('sampling pdf '+str(i)+'\n') - return self.pdfs[i].sample(N=samps, infty=infty, using=using, vb=False) + return self.pdfs[i].sample(N=samps, infty=infty, using=using, vb=False) + except Exception: + print('ERROR: sampling failed on '+str(i)+' because '+str(sys.exc_info()[0])) self.samples = self.pool.map(sample_helper, self.pdf_range) @@ -186,10 +191,13 @@ def quantize(self, quants=None, N=None, limits=None, vb=True): array 
of tuples of the CDF values and the quantiles for each PDF """ def quantize_helper(i): + # try: # with open(self.logfilename, 'wb') as logfile: # logfile.write('quantizing pdf '+str(i)+'\n') return self.pdfs[i].quantize(quants=quants, - N=N, limits=None, vb=False) + N=N, limits=None, vb=vb) + # except Exception: + # print('ERROR: quantization failed on '+str(i)+' because '+str(sys.exc_info()[0])) self.quantiles = self.pool.map(quantize_helper, self.pdf_range) self.quantiles = np.swapaxes(np.array(self.quantiles), 0, 1) @@ -219,10 +227,13 @@ def histogramize(self, binends=None, N=10, binrange=None, vb=True): of bins and values in bins """ def histogram_helper(i): + try: # with open(self.logfilename, 'wb') as logfile: # logfile.write('histogramizing pdf '+str(i)+'\n') - return self.pdfs[i].histogramize(binends=binends, N=N, + return self.pdfs[i].histogramize(binends=binends, N=N, binrange=binrange, vb=False) + except Exception: + print('ERROR: histogramization failed on '+str(i)+' because '+str(sys.exc_info()[0])) self.histogram = self.pool.map(histogram_helper, self.pdf_range) self.histogram = np.swapaxes(np.array(self.histogram), 0, 1) @@ -253,9 +264,12 @@ def mix_mod_fit(self, comps=5, using=None, vb=True): Currently only supports mixture of Gaussians """ def mixmod_helper(i): + try: # with open(self.logfilename, 'wb') as logfile: # logfile.write('fitting pdf '+str(i)+'\n') - return self.pdfs[i].mix_mod_fit(n_components=comps, using=using, vb=False) + return self.pdfs[i].mix_mod_fit(n_components=comps, using=using, vb=False) + except Exception: + print('ERROR: mixture model fitting failed on '+str(i)+' because '+str(sys.exc_info()[0])) self.mix_mod = self.pool.map(mixmod_helper, self.pdf_range) @@ -278,19 +292,22 @@ def evaluate(self, loc, using=None, norm=False, vb=True): Returns ------- - vals: ndarray, ndarray, float - the values of the PDFs (or their approximations) at the requested - location(s), of shape (npdfs, nlocs) + self.gridded: tuple(string, tuple(ndarray, ndarray, float)) + tuple of string and tuple of grid and values of the PDFs (or their approximations) at the requested location(s), of shape (npdfs, nlocs) """ def evaluate_helper(i): + try: # with open(self.logfilename, 'wb') as logfile: # logfile.write('evaluating pdf '+str(i)+'\n') - return self.pdfs[i].evaluate(loc=loc, using=using, norm=norm, vb=False) + return self.pdfs[i].evaluate(loc=loc, using=using, norm=norm, vb=vb) + except Exception: + print('REAL ERROR: evaluation with '+using+' failed on '+str(i)+' because '+str(sys.exc_info()[0])) + # return result self.gridded = self.pool.map(evaluate_helper, self.pdf_range) self.gridded = np.swapaxes(np.array(self.gridded), 0, 1) - self.gridded = (self.gridded[0][0], self.gridded[1]) + self.gridded = (using, (self.gridded[0][0], self.gridded[1])) - return self.gridded + return self.gridded[-1] def integrate(self, limits, using, dx=0.001): """ @@ -311,13 +328,57 @@ def integrate(self, limits, using, dx=0.001): value of the integral """ def integrate_helper(i): - return self.pdfs[i].integrate(limits[i], using=using, dx=dx, vb=False) + try: + return self.pdfs[i].integrate(limits[i], using=using, dx=dx, vb=False) + except Exception: + print('ERROR: integration failed on '+str(i)+' because '+str(sys.exc_info()[0])) integrals = self.pool.map(integrate_helper, self.pdf_range) return integrals - def kld(self, using=None, limits=None, dx=0.01): + def moment(self, N, using=None, limits=None, dx=0.01, vb=False): + """ + Calculates a given moment for each PDF in the ensemble + + 
Parameters + ---------- + N: int + number of moment + using: string + which parametrization to use + limits: tuple of floats, optional + endpoints of integration interval in which to calculate moment + dx: float + resolution of integration grid + vb: boolean + print progress to stdout? + + Returns + ------- + moments: numpy.ndarray, float + moment values of each PDF under the using approximation or truth + """ + D = int((limits[-1] - limits[0]) / dx) + grid = np.linspace(limits[0], limits[1], D) + dx = (limits[-1] - limits[0]) / (D - 1) + grid_to_N = grid ** N + + if self.gridded[0] == using and np.array_equal(self.gridded[-1][0], grid): + if vb: print('taking a shortcut') + def moment_helper(i): + return u.quick_moment(self.gridded[-1][-1][i], grid_to_N, dx) + else: + def moment_helper(i): + p_eval = self.pdfs[i].evaluate(grid, using=using, vb=vb)[1] + return u.quick_moment(p_eval, grid_to_N, dx) + + moments = self.pool.map(moment_helper, self.pdf_range) + + moments = np.array(moments) + return moments + + def kld(self, using=None, limits=None, dx=0.01, vb=False): """ Calculates the KLD for each PDF in the ensemble @@ -329,6 +390,8 @@ def kld(self, using=None, limits=None, dx=0.01): endpoints of integration interval in which to calculate KLD dx: float resolution of integration grid + vb: boolean + print progress to stdout? Returns ------- @@ -365,18 +428,34 @@ def Q_func(pdf): print(using + ' not available; try a different parametrization.') return - def kld_helper(i): - P = P_func(self.pdfs[i]) - Q = Q_func(self.pdfs[i]) - return u.calculate_kl_divergence(P, Q, limits=limits, dx=dx) + D = int((limits[-1] - limits[0]) / dx) + grid = np.linspace(limits[0], limits[1], D) + # dx = (limits[-1] - limits[0]) / (D - 1) + + self.klds[using] = np.empty(self.n_pdfs) + if self.gridded[0] == using and np.array_equal(self.gridded[-1][0], grid): + if vb: print('taking a shortcut') + def kld_helper(i): + P_eval = P_func(self.pdfs[i]).evaluate(grid, using='truth', vb=vb, norm=True)[-1] + KL = u.quick_kl_divergence(P_eval, self.gridded[-1][-1][i], dx=dx) + self.pdfs[i].klds[using] = KL + self.klds[using][i] = KL + return KL + else: + def kld_helper(i): + P_eval = P_func(self.pdfs[i]).evaluate(grid, using='truth', vb=vb, norm=True)[-1] + Q_eval = Q_func(self.pdfs[i]).evaluate(grid, vb=vb, using=using, norm=True)[-1] + KL = u.quick_kl_divergence(P_eval, Q_eval, dx=dx) + self.pdfs[i].klds[using] = KL + self.klds[using][i] = KL + return KL klds = self.pool.map(kld_helper, self.pdf_range) - klds = np.array(klds) - return klds + return self.klds - def rmse(self, using=None, limits=None, dx=0.01): + def rmse(self, using=None, limits=None, dx=0.01, vb=False): """ Calculates the RMSE for each PDF in the ensemble @@ -388,6 +467,8 @@ def rmse(self, using=None, limits=None, dx=0.01): endpoints of integration interval in which to calculate RMSE dx: float resolution of integration grid + vb: boolean + print progress to stdout? 
Returns ------- @@ -420,10 +501,19 @@ def Q_func(pdfs): print(using + ' not available; try a different parametrization.') return - def rmse_helper(i): - P = P_func(pdfs[i]) - Q = Q_func(pdfs[i]) - return utils.calculate_rmse(P, Q, limits=limits, dx=dx) + D = int((limits[-1] - limits[0]) / dx) + grid = np.linspace(limits[0], limits[1], D) + dx = (limits[-1] - limits[0]) / (D - 1) + + if self.gridded[0] == using and np.array_equal(self.gridded[-1][0], grid): + if vb: print('taking a shortcut') + def rmse_helper(i): + return u.quick_rmse(self.gridded[-1][-1][i], grid, dx=dx) + else: + def rmse_helper(i): + P_eval = P_func(self.pdfs[i]).evaluate(grid, norm=True, vb=vb)[-1] + Q_eval = Q_func(self.pdfs[i]).evaluate(grid, norm=True, vb=vb)[-1] + return u.quick_rmse(P_eval, Q_eval, dx=dx) rmses = self.pool.map(rmse_helper, self.pdf_range) @@ -453,10 +543,12 @@ def stack(self, loc, using, vb=True): Notes ----- Stacking refers to taking the sum of PDFs evaluated on a shared grid and normalizing it such that it integrates to unity. This is equivalent to calculating an average probability (based on the PDFs in the ensemble) over the grid. This probably should be done in a script and not by qp! The right way to do it would be to call qp.Ensemble.evaluate() and sum those outputs appropriately. + TO DO: make this do something more efficient for mixmod, grid, histogram, samples + TO DO: enable stacking on irregular grid """ loc_range = max(loc) - min(loc) delta = loc_range / len(loc) - evaluated = self.evaluate(loc, using=using, norm=True, vb=False) + evaluated = self.evaluate(loc, using=using, norm=True, vb=vb) stack = np.mean(evaluated[1], axis=0) stack /= np.sum(stack) * delta assert(np.isclose(np.sum(stack) * delta, 1.)) diff --git a/qp/pdf.py b/qp/pdf.py index 6a2a76ad..263dea38 100644 --- a/qp/pdf.py +++ b/qp/pdf.py @@ -3,10 +3,21 @@ import scipy.stats as sps import scipy.interpolate as spi import scipy.optimize as spo -import matplotlib.pyplot as plt import sklearn as skl from sklearn import mixture +import matplotlib as mpl +import matplotlib.pyplot as plt +mpl.rcParams['text.usetex'] = True +mpl.rcParams['mathtext.rm'] = 'serif' +mpl.rcParams['font.family'] = 'serif' +mpl.rcParams['font.serif'] = 'Times New Roman' +mpl.rcParams['axes.titlesize'] = 16 +mpl.rcParams['axes.labelsize'] = 16 +mpl.rcParams['savefig.dpi'] = 250 +mpl.rcParams['savefig.format'] = 'pdf' +mpl.rcParams['savefig.bbox'] = 'tight' + import qp from qp.utils import infty as default_infty from qp.utils import epsilon as default_eps @@ -39,8 +50,8 @@ def __init__(self, truth=None, quantiles=None, histogram=None, Array of length nsamples containing sampled values limits: tuple, float, optional limits past which PDF is considered to be 0. - scheme: string, optional - name of interpolation scheme to use. + scheme: string or int, optional + name of interpolation scheme to use, or order of spline interpolation. vb: boolean report on progress to stdout? 
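# --- illustrative aside, not part of the patch ---------------------------------
# The pdf.py docstring change above now allows `scheme` to be either a string
# interpolation kind or an integer spline order. A minimal, hedged sketch of that
# dual dispatch, assuming only scipy and toy data; the helper name
# `build_interpolator` and the `ext=1` (zero outside the support) choice are the
# editor's assumptions, not qp's API.
import numpy as np
import scipy.interpolate as spi

x = np.linspace(-3., 3., 20)
y = np.exp(-0.5 * x ** 2) / np.sqrt(2. * np.pi)  # toy Gaussian evaluations

def build_interpolator(x, y, scheme):
    if isinstance(scheme, int):
        # integer scheme: order k of an InterpolatedUnivariateSpline
        return spi.InterpolatedUnivariateSpline(x, y, k=scheme, ext=1)
    # string scheme: interpolation kind passed straight to interp1d
    return spi.interp1d(x, y, kind=scheme, bounds_error=False, fill_value=0.)

linear = build_interpolator(x, y, 'linear')
cubic = build_interpolator(x, y, 3)
print(linear(0.5))
print(cubic(0.5))
# --------------------------------------------------------------------------------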
@@ -88,9 +99,8 @@ def __init__(self, truth=None, quantiles=None, histogram=None, # The most recent parametrization used is, at this point, the # first one: self.last = self.first - - # We'll make an interpolator if and when we need it: - self.interpolator = None + self.interpolator = [None, None] + self.klds = {} return @@ -222,54 +232,92 @@ def quantize(self, quants=None, N=9, limits=None, vb=True): if self.truth is not None: if isinstance(self.truth, qp.composite): + if type(self.scheme) != int: + order = 5 + else: + order = self.scheme + extrapoints = np.concatenate((np.array([0.]), quantpoints, np.array([1.]))) min_delta = np.min(extrapoints[1:] - extrapoints[:-1]) - grid = np.linspace(limits[0], limits[-1], N) + + grid = np.linspace(limits[0], limits[-1], N + 1) icdf = self.truth.cdf(grid) + unit_ext = 1. / (order + 1.) low_extended = 0 - while icdf[0] > quantpoints[0] and low_extended < 5: + while icdf[0] >= quantpoints[0]: low_extended += 1 + subgrid = np.linspace(limits[0] - 1., limits[0] - unit_ext, order) + subcdf = self.truth.cdf(subgrid) + grid = np.concatenate((subgrid, grid)) + icdf = np.concatenate((subcdf, icdf)) limits = (limits[0] - 1., limits[-1]) - grid = np.linspace(limits[0], limits[-1], N) - icdf = self.truth.cdf(grid) - if vb: - print('lower limits extended '+str(low_extended)+' times') + if vb: + print('lower limits extended '+str(low_extended)+' times') high_extended = 0 - while icdf[-1] < quantpoints[-1] and high_extended < 5: + while icdf[-1] <= quantpoints[-1]: high_extended += 1 + subgrid = np.linspace(limits[-1] + unit_ext, limits[-1] + 1., order) + subcdf = self.truth.cdf(subgrid) + grid = np.concatenate((grid, subgrid)) + icdf = np.concatenate((icdf, subcdf)) limits = (limits[0], limits[-1] + 1.) - grid = np.linspace(limits[0], limits[-1], N) - icdf = self.truth.cdf(grid) - if vb: - print('upper_limits extended '+str(high_extended)+' times') + if vb: + print('upper_limits extended '+str(high_extended)+' times') new_deltas = icdf[1:] - icdf[:-1] expanded = 0 - while np.max(new_deltas) >= 2. * min_delta and expanded < 10: + while np.max(new_deltas) >= min_delta: expanded += 1 - where_wrong = np.where(new_deltas >= 2. * min_delta)[0] + where_wrong = np.where(new_deltas >= min_delta)[0] flipped = np.flip(where_wrong, axis=0) for i in flipped: - delta_i = new_deltas[i] / (N + 1) - subgrid = np.linspace(grid[i] + delta_i, grid[i+1] - delta_i, 10) - grid = np.insert(grid, i+1, subgrid) - icdf = np.insert(icdf, i+1, self.truth.cdf(subgrid)) + delta_i = new_deltas[i] / (order + 1.) 
+ subgrid = np.linspace(grid[i] + delta_i, grid[i+1] - delta_i, order) + grid = np.sort(np.insert(grid, i, subgrid)) + subcdf = self.truth.cdf(subgrid) + icdf = np.sort(np.insert(icdf, i, subcdf)) new_deltas = icdf[1:] - icdf[:-1] if vb: print('grid expanded '+str(expanded)+' times') - locs = np.array([bisect.bisect_right(icdf[:-1], quantpoints[n]) for n in range(N)]) - - quantiles = self.truth.ppf(quantpoints, ivals=grid[locs]) + # locs = np.array([bisect.bisect_right(icdf[:-1], quantpoints[n]) for n in range(N)]) + i = np.min(np.where(icdf > default_eps**(1./order))) + f = np.max(np.where(1.-icdf > default_eps**(1./order))) + icdf = icdf[i:f+1] + grid = grid[i:f+1] + + # if vb: print('about to interpolate the CDF: '+str((icdf, grid))) + # if vb: print('made the interpolator') + #quantiles self.truth.ppf(quantpoints, ivals=grid[locs]) + + # alternate = spi.interp1d(x, y, kind='linear', bounds_error=False, fill_value=default_eps) + # backup = qp.utils.make_kludge_interpolator((x, y), outside=default_eps) + + quantiles = np.flip(quantpoints, axis=0) + try: + while (order>0) and (not np.array_equal(quantiles, np.sort(quantiles))): + if vb: print('order is '+str(order)) + b = spi.InterpolatedUnivariateSpline(icdf, grid, k=order, ext=1) + quantiles = b(quantpoints) + order -= 1 + assert(not np.any(np.isnan(quantiles))) + assert(type(quantiles) is not dfitpack.error) + except AssertionError: + print('ERROR: splines failed because '+str(AssertionError)+', defaulting to optimization for '+str((icdf, grid))) + locs = np.array([bisect.bisect_right(icdf[:-1], quantpoints[n]) for n in range(N)]) + quantiles = self.truth.ppf(quantpoints, ivals=grid[locs]) + assert(not np.any(np.isnan(quantiles))) + if vb: print('output quantiles = '+str(quantiles)) else: quantiles = self.truth.ppf(quantpoints) else: print('New quantiles can only be computed from a truth distribution in this version.') return - if vb: - print("Resulting "+str(len(quantiles))+" quantiles: "+str(quantiles)) # integrals = self.truth.cdf(quantiles) # assert np.isclose(integrals, quantpoints) + assert(type(quantiles) is np.ndarray) self.quantiles = (quantpoints, quantiles) + if vb: + print("Resulting "+str(len(quantiles))+" quantiles: "+str(self.quantiles)) self.limits = (min(limits[0], np.min(quantiles)), max(limits[-1], np.max(quantiles))) self.last = 'quantiles' return self.quantiles @@ -392,7 +440,7 @@ def gmm(x, *args): stdevs = np.sqrt(estimator.covariances_[:, 0, 0]) if vb: - print(weights, means, stdevs) + print('weights, means, stds = '+str((weights, means, stdevs))) components = [] for i in comp_range: @@ -433,7 +481,7 @@ def sample(self, N=1000, infty=default_infty, using=None, vb=True): Notes ----- - TO DO: have quantiles use linear interpolator to get inverse CDF, then sample uniform in "bins" + TO DO: all formats should use rejection sampling TO DO: change infty to upper and lower bounds to use for quantiles TO DO: check for existence of parametrization before using it """ @@ -448,7 +496,7 @@ def sample(self, N=1000, infty=default_infty, using=None, vb=True): samples = self.mix_mod.rvs(size=N) elif using == 'gridded': - interpolator = self.interpolate(using = 'gridded', vb=vb) + interpolator = self.interpolate(using = 'gridded', vb=vb)[0] # (xlims, ylims) = self.evaluate(self.limits, using='gridded', vb=vb) (xmin, xmax) = (min(self.gridded[0]), max(self.gridded[0])) (ymin, ymax) = (min(self.gridded[1]), max(self.gridded[1])) @@ -466,7 +514,7 @@ def sample(self, N=1000, infty=default_infty, using=None, vb=True): self.quantiles 
= self.quantize(vb=vb) (x, y) = qp.utils.evaluate_quantiles(self.quantiles, vb=vb) - (endpoints, weights) = qp.utils.normalize_quantiles(self.quantiles[0], (x, y), vb=vb) + (endpoints, weights) = qp.utils.normalize_quantiles(self.quantiles, (x, y), vb=vb) # endpoints = np.insert(self.quantiles[1], [0, -1], self.limits) # weights = qp.utils.evaluate_quantiles(self.quantiles)[1]# self.evaluate((endpoints[1:]+endpoints[:-1])/2.) # interpolator = self.interpolate(using='quantiles', vb=False) @@ -508,17 +556,17 @@ def interpolate(self, using=None, vb=True): Returns ------- - self.interpolator + interpolator an interpolator object Notes ----- - The `self.interpolator` object is a function that is used by the + The `interpolator` object is a function that is used by the `approximate` method. It employs [`scipy.interpolate.interp1d`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.interpolate.interp1d.html) - to carry out the interpolation, using the internal - `self.scheme` attribute to choose the interpolation scheme. - TO DO: There's got to be a better to do quantile interpolation! Maybe use inverse CDF? + to carry out the interpolation for the gridded format, using the internal + `self.scheme` attribute to choose the interpolation scheme. For quantile interpolation, it uses a `scipy.interpolate.InterpolatedUnivariateSpline` object, with self.scheme being the integer order of the spline. + TO DO: store the interpolators separately with using tags """ if using is None: using = self.last @@ -532,19 +580,128 @@ def interpolate(self, using=None, vb=True): if self.quantiles is None: self.quantiles = self.quantize(vb=vb) + if type(self.scheme) != int: + order = min(5, len(self.quantiles[0])) + else: + order = self.scheme + + if vb: print('input quantiles are '+str(self.quantiles[1])) (x, y) = qp.utils.evaluate_quantiles(self.quantiles, vb=vb) - (x, y) = qp.utils.normalize_quantiles(self.quantiles[0], (x, y), vb=vb) - self.interpolator = spi.interp1d(x, y, kind=self.scheme, bounds_error=False, fill_value=default_eps) - if vb: - print 'Created a `'+self.scheme+'` interpolator for the '+using+' parametrization.' - return self.interpolator + if vb: print('evaluated quantile PDF: '+str((x, y))) + # [x_crit_lo, x_crit_hi] = [x[0], x[-1]] + # [y_crit_lo, y_crit_hi] = [y[0], y[-1]] + (x, y) = qp.utils.normalize_quantiles(self.quantiles, (x, y), vb=vb) + if vb: print('complete evaluated quantile PDF: '+str((x, y))) + alternate = spi.interp1d(x, y, kind='linear', bounds_error=False, fill_value=default_eps) + backup = qp.utils.make_kludge_interpolator((x, y), outside=default_eps) + + z = np.insert(self.quantiles[1], 0, min(x)) + z = np.append(z, max(x)) + q = np.insert(self.quantiles[0], 0, 0.) + q = np.append(q, 1.) + + # knots, coeffs, degree = spi.splrep(z, q, k=order, s=0) + # + # def inside(xi): + # yi = spi.splev(xi, (knots, coeffs, degree), der=1) + # coeffs[yi<0] + [x_crit_lo, x_crit_hi] = [self.quantiles[1][0], self.quantiles[1][-1]] + [y_crit_lo, y_crit_hi] = [-1., -1.] + + try: + while (order>0) and ((y_crit_lo <= 0.) or (y_crit_hi <= 0.)): + if vb: print('order is '+str(order)) + inside = spi.InterpolatedUnivariateSpline(z, q, k=order, ext=1).derivative() + [y_crit_lo, y_crit_hi] = inside([x_crit_lo, x_crit_hi]) + order -= 1 + assert((y_crit_lo > 0.) 
and (y_crit_hi > 0.)) + except AssertionError: + print('ERROR: spline tangents '+str((y_crit_lo, y_crit_hi))+'<0') + if type(self.scheme) == str: + scheme = self.scheme + else: + scheme = 'linear' + if vb: print('defaulting to '+scheme+' interpolation') + inside_int = spi.interp1d(z, q, kind=scheme, bounds_error=False, fill_value=default_eps) + derivative = (q[1:] - q[:-1]) / (z[1:] - z[:-1]) + derivative = np.insert(derivative, 0, default_eps) + derivative = np.append(derivative, default_eps) + def inside(xf): + nx = len(xf) + yf = np.ones(nx) * default_eps + for n in range(nx): + i = bisect.bisect_left(z, xf[n]) + yf[n] = derivative[i] + return(yf) + [y_crit_lo, y_crit_hi] = inside([x_crit_lo, x_crit_hi]) + assert((y_crit_lo > 0.) and (y_crit_hi > 0.)) + + def quantile_interpolator(xf): + yf = np.ones(np.shape(xf)) * default_eps + in_inds = ((xf >= self.quantiles[1][0]) & (xf <= self.quantiles[1][-1])).nonzero()[0] + lo_inds = ((xf < self.quantiles[1][0]) & (xf >= z[0])).nonzero()[0] + hi_inds = ((xf > self.quantiles[1][-1]) & (xf <= z[-1])).nonzero()[0] + if vb: + print('divided into '+str((lo_inds, in_inds, hi_inds))) + + try: + yf[in_inds] = inside(xf[in_inds]) + assert(np.all(yf >= default_eps)) + if vb: + print 'Created a k=`'+str(order)+'`B-spline interpolator for the '+using+' parametrization.' + except AssertionError: + print('ERROR: spline interpolation failed with '+str((xf[in_inds], yf[in_inds]))) + try: + yf[in_inds] = alternate(xf[in_inds]) + assert(np.all(yf >= default_eps)) + if vb: + print 'Created a linear interpolator for the '+using+' parametrization.' + except AssertionError: + print 'ERROR: linear interpolation failed for the '+using+' parametrization with '+str((xf[in_inds], yf[in_inds])) + yf[in_inds] = backup(xf[in_inds]) + if vb: + print 'Doing linear interpolation by hand for the '+using+' parametrization.' + assert(np.all(yf >= default_eps)) + if vb: + print('evaluated inside '+str((xf[in_inds], yf[in_inds]))) + + try: + tan_lo = y_crit_lo / (x_crit_lo - z[0]) + yf[lo_inds] = tan_lo * (xf[lo_inds] - z[0])# yf[in_inds[0]] / (xf[in_inds[0]] - z[0]) + assert(np.all(yf >= default_eps)) + if vb: + print('evaluated below '+str((xf[lo_inds], yf[lo_inds]))) + except AssertionError: + print('ERROR: linear extrapolation below failed with '+str((xf[lo_inds], yf[lo_inds]))+' via '+str((tan_lo, x_crit_lo, z[0]))) + + try: + tan_hi = y_crit_hi / (z[-1] - x_crit_hi) + yf[hi_inds] = tan_hi * (z[-1] - xf[hi_inds])# yf[in_inds[-1]] * (xf[hi_inds] - z[-1]) / (xf[in_inds[-1]] - z[-1]) + assert(np.all(yf >= default_eps)) + if vb: + print('evaluated above '+str((xf[hi_inds], yf[hi_inds]))) + except AssertionError: + print('ERROR: linear extrapolation above failed with '+str((xf[hi_inds], yf[hi_inds]))+' via '+str((tan_hi, z[-1], x_crit_hi))) + + return(yf) + # if vb: + # print(tck) + + #still not enforcing integration at ends + # def quantile_interpolator(xf): + # yf = np.ones(len(xf)) * default_eps + # subset = ((xf>z[0]) == (xf0.) + if not np.all(dx>0.): + print('broken delta quantile values: '+str(xs)) + assert(np.all(dx>0.)) mx = (xs[1:] + xs[:-1]) / 2. 
y = dq / dx # print(np.dot(y, dx)) @@ -275,9 +246,9 @@ def evaluate_samples(x): y = kde(sx) return ((sx, y)) -def calculate_moment(p, N, using=None, limits=lims, dx=0.01, vb=False): +def calculate_moment(p, N, using=None, limits=None, dx=0.01, vb=False): """ - Calculates moments of a distribution + Calculates a moment of a qp.PDF object Parameters ---------- @@ -289,27 +260,56 @@ def calculate_moment(p, N, using=None, limits=lims, dx=0.01, vb=False): endpoints of integration interval over which to calculate moments dx: float resolution of integration grid + vb: Boolean + print progress to stdout? Returns ------- M: float - values of the moment + value of the moment """ + if limits is None: + limits = p.limits if using is None: using = p.first # Make a grid from the limits and resolution - grid = np.arange(limits[0], limits[1], dx) - grid_to_N = grid ** N + d = int((limits[-1] - limits[0]) / dx) + grid = np.linspace(limits[0], limits[1], d) + dx = (limits[-1] - limits[0]) / (d - 1) # Evaluate the functions on the grid pe = p.evaluate(grid, using=using, vb=vb)[1] # pe = normalize_gridded(pe)[1] # calculate the moment - M = dx * np.dot(grid_to_N, pe) + grid_to_N = grid ** N + M = quick_moment(pe, grid_to_N, dx) + return M + +def quick_moment(p_eval, grid_to_N, dx): + """ + Calculates a moment of an evaluated PDF + + Parameters + ---------- + p_eval: numpy.ndarray, float + the values of a probability distribution + grid: numpy.ndarray, float + the grid upon which p_eval was evaluated + dx: float + the difference between regular grid points + N: int + order of the moment to be calculated + + Returns + ------- + M: float + value of the moment + """ + M = np.dot(grid_to_N, p_eval) * dx return M def calculate_kl_divergence(p, q, limits=lims, dx=0.01, vb=False): """ - Calculates the Kullback-Leibler Divergence between two PDFs. + Calculates the Kullback-Leibler Divergence between two qp.PDF objects. Parameters ---------- @@ -332,9 +332,12 @@ def calculate_kl_divergence(p, q, limits=lims, dx=0.01, vb=False): Notes ----- TO DO: change this to calculate_kld + TO DO: have this take number of points not dx! """ # Make a grid from the limits and resolution - grid = np.arange(limits[0], limits[1], dx) + N = int((limits[-1] - limits[0]) / dx) + grid = np.linspace(limits[0], limits[1], N) + dx = (limits[-1] - limits[0]) / (N - 1) # Evaluate the functions on the grid and normalize pe = p.evaluate(grid, vb=vb, norm=True) pn = pe[1] @@ -346,17 +349,48 @@ def calculate_kl_divergence(p, q, limits=lims, dx=0.01, vb=False): #denominator = max(np.sum(qe), epsilon) # qn = qe / np.sum(qe)#denominator # Compute the log of the normalized PDFs - logquotient = safelog(pn / qn) + # logquotient = safelog(pn / qn) # logp = safelog(pn) # logq = safelog(qn) # Calculate the KLD from q to p - Dpq = np.dot(pn * logquotient, np.ones(len(grid)) * dx) - assert(Dpq >= 0.) + Dpq = quick_kl_divergence(pn, qn, dx=dx)# np.dot(pn * logquotient, np.ones(len(grid)) * dx) + if Dpq < 0.: + print('broken KLD: '+str((Dpq, pn, qn, dx))) + Dpq = epsilon + return Dpq + +def quick_kl_divergence(p_eval, q_eval, dx=0.01): + """ + Calculates the Kullback-Leibler Divergence between two evaluations of PDFs. + + Parameters + ---------- + p_eval: numpy.ndarray, float + evaluations of probability distribution whose distance _from_ `q` will be calculated + q_eval: numpy.ndarray, float + evaluations of probability distribution whose distance _to_ `p` will be calculated. 
+ dx: float + resolution of integration grid + + Returns + ------- + Dpq: float + the value of the Kullback-Leibler Divergence from `q` to `p` + + Notes + ----- + TO DO: change this to quick_kld + """ + logquotient = safelog(p_eval) - safelog(q_eval) + # logp = safelog(pn) + # logq = safelog(qn) + # Calculate the KLD from q to p + Dpq = dx * np.sum(p_eval * logquotient) return Dpq def calculate_rmse(p, q, limits=lims, dx=0.01, vb=False): """ - Calculates the Root Mean Square Error between two PDFs. + Calculates the Root Mean Square Error between two qp.PDF objects. Parameters ---------- @@ -375,13 +409,66 @@ def calculate_rmse(p, q, limits=lims, dx=0.01, vb=False): ------- rms: float the value of the RMS error between `q` and `p` + + Notes + ----- + TO DO: change dx to N """ # Make a grid from the limits and resolution - npoints = int((limits[1] - limits[0]) / dx) - grid = np.linspace(limits[0], limits[1], npoints) + N = int((limits[-1] - limits[0]) / dx) + grid = np.linspace(limits[0], limits[1], N) + dx = (limits[-1] - limits[0]) / (N - 1) # Evaluate the functions on the grid pe = p.evaluate(grid, vb=vb)[1] qe = q.evaluate(grid, vb=vb)[1] # Calculate the RMS between p and q - rms = np.sqrt(np.sum((pe - qe) ** 2) / npoints) + rms = quick_rmse(pe, qe, N)# np.sqrt(dx * np.sum((pe - qe) ** 2)) return rms + +def quick_rmse(p_eval, q_eval, N): + """ + Calculates the Root Mean Square Error between two evaluations of PDFs. + + Parameters + ---------- + p_eval: numpy.ndarray, float + evaluation of probability distribution function whose distance between its truth and the approximation of `q` will be calculated. + q_eval: numpy.ndarray, float + evaluation of probability distribution function whose distance between its approximation and the truth of `p` will be calculated. + N: int + number of points at which PDFs were evaluated + + Returns + ------- + rms: float + the value of the RMS error between `q` and `p` + """ + # Calculate the RMS between p and q + rms = np.sqrt(np.sum((p_eval - q_eval) ** 2) / N) + return rms + +def make_kludge_interpolator((x, y), outside=epsilon): + """ + Linear interpolation by hand for debugging + + Parameters + ---------- + (x, y): tuple, numpy.ndarray, float + where interpolator is fit + outside: float + value to use outside interpolation range + + Returns + ------- + kludge_interpolator: function + evaluates linear interpolant based on input points + """ + dx = x[1:] - x[:-1] + dy = y[1:] - y[:-1] + def kludge_interpolator(xf): + yf = np.ones(np.shape(xf)) * epsilon + for i in range(len(x)): + inside = ((xf >= x[i]) & (xf <= x[i+1])).nonzero()[0] + yf[inside] = y[i] + (y[i+1] - y[i]) * (xf[inside] - x[i]) / dx[i] + return yf + return kludge_interpolator diff --git a/requirements.txt b/requirements.txt index 27075613..ef4c8767 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ -hickle matplotlib numpy pathos -psutil scipy sklearn diff --git a/setup.py b/setup.py index 1fae7858..7f62c5a0 100644 --- a/setup.py +++ b/setup.py @@ -19,5 +19,5 @@ "Operating System :: OS Independent", "Programming Language :: Python", ], - install_requires=["matplotlib", "hickle", "numpy", "pathos", "psutil", "scipy", "sklearn"] + install_requires=["matplotlib", "numpy", "pathos", "scipy", "sklearn"] )
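For reference, below is a minimal standalone sketch of the grid-based metric helpers that this changeset adds to qp/utils.py, assuming only numpy and scipy; the explicit epsilon floor stands in for qp's safelog, and the toy Gaussians in the usage lines are the editor's assumptions rather than anything in the patch. The analytic KLD between two unit-width Gaussians separated by one sigma is 0.5 nats, which the Riemann sum should approximately recover.

import numpy as np
import scipy.stats as sps

epsilon = 1e-8  # assumed floor to keep the logarithm finite (qp uses its own safelog)

def quick_kl_divergence(p_eval, q_eval, dx=0.01):
    # Riemann-sum KLD from q to p for PDFs evaluated on a shared regular grid
    logquotient = np.log(np.maximum(p_eval, epsilon)) - np.log(np.maximum(q_eval, epsilon))
    return dx * np.sum(p_eval * logquotient)

def quick_rmse(p_eval, q_eval, N):
    # root-mean-square difference between two PDF evaluations at N grid points
    return np.sqrt(np.sum((p_eval - q_eval) ** 2) / N)

# usage: two unit-width Gaussians separated by one sigma (expect KLD close to 0.5 nats)
limits, dx = (-10., 10.), 0.01
N = int((limits[-1] - limits[0]) / dx)
grid = np.linspace(limits[0], limits[1], N)
dx = (limits[-1] - limits[0]) / (N - 1)
p_eval = sps.norm(loc=0., scale=1.).pdf(grid)
q_eval = sps.norm(loc=1., scale=1.).pdf(grid)
print(quick_kl_divergence(p_eval, q_eval, dx=dx))
print(quick_rmse(p_eval, q_eval, N))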