From c7061af6fe31bf344c3713f3f82678887de9415f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Wiewi=C3=B3rka?= Date: Mon, 4 Nov 2019 07:54:29 +0100 Subject: [PATCH] sequila-py (#155) * sequila.py * Latest fixes * Adding CI/CD for sequila.py --- Jenkinsfile | 3 + examples/bdg-sequila.ipynb | 247 +++++++++++++++++++++++++++++++++++++ python/README.rst | 0 python/environment.yml | 11 ++ python/sequila/__init__.py | 1 + python/sequila/sequila.py | 48 +++++++ python/setup.py | 22 ++++ 7 files changed, 332 insertions(+) create mode 100644 examples/bdg-sequila.ipynb create mode 100644 python/README.rst create mode 100644 python/environment.yml create mode 100644 python/sequila/__init__.py create mode 100644 python/sequila/sequila.py create mode 100644 python/setup.py diff --git a/Jenkinsfile b/Jenkinsfile index 2b6b851d..778ca89e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -91,6 +91,9 @@ node { populateGlobalVariables() try { + stage('Package sequila-py') { + sh 'bash -c "source /sequila/bin/activate && cd python && python3.6 setup.py sdist bdist_wheel && twine check dist/* && twine upload -r zsibio dist/* && deactivate"' + } stage('Test Scala code') { echo 'Testing Scala code....' 
diff --git a/examples/bdg-sequila.ipynb b/examples/bdg-sequila.ipynb new file mode 100644 index 00000000..c696f618 --- /dev/null +++ b/examples/bdg-sequila.ipynb @@ -0,0 +1,247 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "import os\n", + "os.getcwd()\n", + "import sys\n", + "sys.path.append(\"/Users/marek/.virtualenvs/bdg-sequila/lib/python3.7/site-packages\")\n", + "sys.path.append(\"/Users/marek/git/forks/bdg-sequila/python\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import findspark\n", + "from pyspark.sql import SparkSession, DataFrame, SQLContext\n", + "findspark.init()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "spark = SparkSession \\\n", + ".builder \\\n", + ".config('spark.ui.enabled','false') \\\n", + ".config('spark.driver.memory','2g') \\\n", + ".config('spark.driver.bindAddress', 'localhost') \\\n", + ".config('spark.driver.host', 'localhost') \\\n", + ".appName('SeQuiLa') \\\n", + ".config('spark.jars','/Users/marek/git/forks/bdg-sequila/target/scala-2.11/bdg-sequila-assembly-0.5.6-spark-2.4.3.jar') \\\n", + ".getOrCreate()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#config paths\n", + "bamPath = '/Users/marek/git/forks/bdg-sequila/src/test/resources/NA12878.slice.bam'\n", + "vcfPath = '/Users/marek/git/forks/bdg-sequila/src/test/resources/vcf/test.vcf'" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "#register sequila.py\n", + "from sequila import SequilaSession\n", + "ss = SequilaSession(spark)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DataFrame[]" + ] + }, + "execution_count": 13, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "# create table\n", + "tableNameBAM = 'test'\n", + "ss.sql(f'CREATE TABLE {tableNameBAM} \\\n", + "USING org.biodatageeks.datasources.BAM.BAMDataSource \\\n", + "OPTIONS(path \"{bamPath}\")')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+----------+-----+---+-----+----+--------------------+--------------------+-----+----------+---------+\n", + "|sampleId|contigName|start|end|cigar|mapq| baseq| sequence|flags|materefind|SAMRecord|\n", + "+--------+----------+-----+---+-----+----+--------------------+--------------------+-----+----------+---------+\n", + "| NA12878| chr1| 34|109| 76M| 2|ACCCADEFFECEFGGFD...|CCCTAACCCTAACCCTA...| 99| 1| null|\n", + "+--------+----------+-----+---+-----+----+--------------------+--------------------+-----+----------+---------+\n", + "\n" + ] + } + ], + "source": [ + "# query table\n", + "ss.sql(f\"SELECT * FROM {tableNameBAM} LIMIT 1\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+---------+-----------+\n", + "|database|tableName|isTemporary|\n", + "+--------+---------+-----------+\n", + "| default| test| false|\n", + "+--------+---------+-----------+\n", + "\n", + "+--------+----------+-----+---+-----+----+--------------------+--------------------+-----+----------+---------+\n", + "|sampleId|contigName|start|end|cigar|mapq| baseq| sequence|flags|materefind|SAMRecord|\n", + "+--------+----------+-----+---+-----+----+--------------------+--------------------+-----+----------+---------+\n", + "| NA12878| chr1| 34|109| 76M| 2|ACCCADEFFECEFGGFD...|CCCTAACCCTAACCCTA...| 99| 1| null|\n", + "+--------+----------+-----+---+-----+----+--------------------+--------------------+-----+----------+---------+\n", + "\n" + ] + } + ], + 
"source": [ + "ss.sql(\"show tables\").show()\n", + "ss.sql(f\"SELECT * FROM {tableNameBAM} LIMIT 1\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------+-----+---+--------+\n", + "|contigName|start|end|coverage|\n", + "+----------+-----+---+--------+\n", + "| chr1| 34| 34| 1|\n", + "| chr1| 35| 35| 2|\n", + "| chr1| 36| 37| 3|\n", + "| chr1| 38| 40| 4|\n", + "| chr1| 41| 49| 5|\n", + "+----------+-----+---+--------+\n", + "\n" + ] + } + ], + "source": [ + "ss.sql(f\"SELECT * FROM bdg_coverage('{tableNameBAM}','NA12878', 'blocks') LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DataFrame[]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tableNameVCF = 'test_vcf'\n", + "ss.sql(f'CREATE TABLE {tableNameVCF} \\\n", + "USING org.biodatageeks.datasources.VCF.VCFDataSource \\\n", + "OPTIONS(path \"{vcfPath}\")')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+-----+-----+-----+---------+---+---+----+------+--------------------+---+---+---+-----+---------+\n", + "|contig| pos|start| stop| id|ref|alt|qual|filter| info| gt| gq| dp| hq|sample_id|\n", + "+------+-----+-----+-----+---------+---+---+----+------+--------------------+---+---+---+-----+---------+\n", + "| 20|14370|14369|14370|rs6054257| G| A| 29| PASS|[ns -> 3, db -> D...|0|0| 48| 1|51,51| NA00001|\n", + "| 20|14370|14369|14370|rs6054257| G| A| 29| PASS|[ns -> 3, db -> D...|1|0| 48| 8|51,51| NA00002|\n", + "| 20|14370|14369|14370|rs6054257| G| A| 29| PASS|[ns -> 3, db -> D...|1/1| 43| 5| .,.| NA00003|\n", + "| 20|17330|17329|17330| .| T| A| 3| q10|[ns -> 3, dp -> 1...|0|0| 49| 3|58,50| NA00001|\n", + "| 
20|17330|17329|17330| .| T| A| 3| q10|[ns -> 3, dp -> 1...|0|1| 3| 5| 65,3| NA00002|\n",
+      "+------+-----+-----+-----+---------+---+---+----+------+--------------------+---+---+---+-----+---------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "ss.sql(f\"SELECT * FROM {tableNameVCF} LIMIT 5\").show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ss.stop()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "bdg-sequila",
+   "language": "python",
+   "name": "bdg-sequila"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/README.rst b/python/README.rst
new file mode 100644
index 00000000..e69de29b
diff --git a/python/environment.yml b/python/environment.yml
new file mode 100644
index 00000000..7dbf8608
--- /dev/null
+++ b/python/environment.yml
@@ -0,0 +1,11 @@
+name: sequila
+dependencies:
+- python=3.7
+- pytest
+- pip
+- pip:
+  - findspark
+  - pyspark==2.4.3
+  - setuptools==41.2.0 # Python packaging
+  - typeguard==2.5.0
+  - twine==2.0.0 # Pypi publishing
\ No newline at end of file
diff --git a/python/sequila/__init__.py b/python/sequila/__init__.py
new file mode 100644
index 00000000..e36a22ea
--- /dev/null
+++ b/python/sequila/__init__.py
@@ -0,0 +1 @@
+from sequila.sequila import *
diff --git a/python/sequila/sequila.py b/python/sequila/sequila.py
new file mode 100644
index 00000000..deca3b96
--- /dev/null
+++ b/python/sequila/sequila.py
@@ -0,0 +1,63 @@
+from pyspark.sql import SparkSession
+from typeguard import check_argument_types
+
+
+def register(session: SparkSession):
+    """
+    register(session)
+
+    Register SQL extensions for a Spark session.
+
+    Obtains a Hive-enabled JVM SparkSession through the builder, wraps it in
+    a JVM SequilaSession and passes that to SequilaRegister and UDFRegister.
+
+    :param session: Spark session
+    """
+    assert check_argument_types()
+    sparkSession = session._jvm.org.apache.spark.sql.SparkSession.builder().enableHiveSupport().getOrCreate()
+    ss = session._jvm.org.apache.spark.sql.SequilaSession(sparkSession)
+    session._jvm.org.biodatageeks.utils.SequilaRegister.register(ss)
+    session._jvm.org.biodatageeks.utils.UDFRegister.register(ss)
+    return
+
+
+class SequilaSession(SparkSession):
+    def __init__(self, session: SparkSession, jsparkSession=None):
+        """Creates a new SequilaSession.
+
+        Wraps the JVM session of ``session`` in a JVM SequilaSession, passes
+        it to SequilaRegister and UDFRegister and makes it the default
+        SequilaSession.
+
+        :param session: Spark session to build on
+        :param jsparkSession: JVM session to wrap; when None, the default
+            SequilaSession is used if it is defined and its SparkContext is
+            not stopped, otherwise a new one is created
+        """
+        from pyspark.sql.context import SQLContext
+        ss = session._jvm.org.apache.spark.sql.SequilaSession(session._jsparkSession)
+        session._jvm.org.biodatageeks.utils.SequilaRegister.register(ss)
+        session._jvm.org.biodatageeks.utils.UDFRegister.register(ss)
+        session._jvm.SequilaSession.setDefaultSession(ss)
+        # Use the SparkContext of the session that was passed in. The
+        # class-level _instantiatedSession can be None (it is checked for that
+        # below), so dereferencing it here could raise AttributeError.
+        self._sc = session._sc
+        self._jsc = self._sc._jsc
+        self._jvm = session._jvm
+        if jsparkSession is None:
+            if self._jvm.SequilaSession.getDefaultSession().isDefined() \
+                    and not self._jvm.SequilaSession.getDefaultSession().get() \
+                        .sparkContext().isStopped():
+                jsparkSession = self._jvm.SequilaSession.getDefaultSession().get()
+            else:
+                jsparkSession = self._jvm.SequilaSession(self._jsc.sc())
+        self._jsparkSession = jsparkSession
+        self._jwrapped = self._jsparkSession.sqlContext()
+        self._wrapped = SQLContext(self._sc, self, self._jwrapped)
+        # Publish this session as the instantiated one only when none is
+        # recorded or the recorded one has lost its Java SparkContext.
+        if SequilaSession._instantiatedSession is None \
+                or SequilaSession._instantiatedSession._sc._jsc is None:
+            SequilaSession._instantiatedSession = self
+            self._jvm.SparkSession.setDefaultSession(self._jsparkSession)
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 00000000..ec267a64
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,26 @@
+from setuptools import setup
+
+# Read the long description with a context manager so the file is closed.
+with open('README.rst', encoding='utf-8') as readme:
+    long_description = readme.read()
+
+setup(
+    name='sequila.py',
+    version='0.1.0',
+    packages=['sequila'],
+    install_requires=[
+        'typeguard==2.5.0',
+        'pyspark==2.4.3',
+        'findspark'
+    ],
+    author='biodatageeks.org',
+    description='A SQL-based solution for large-scale genomic analysis',
+    long_description=long_description,
+    long_description_content_type='text/x-rst',
+    license='Apache License 2.0',
+    classifiers=[
+        'Intended Audience :: Developers',
+        'Programming Language :: Python :: 3.6',
+    ],
+    url='https://github.com/ZSI-Bio/bdg-sequila'
+)
\ No newline at end of file