Skip to content
This repository has been archived by the owner on Nov 28, 2020. It is now read-only.

Commit

Permalink
sequila-py (#155)
Browse files Browse the repository at this point in the history
* sequila.py

* Latest fixes

* Adding CI/CD for sequila.py
  • Loading branch information
mwiewior authored Nov 4, 2019
1 parent d83c3f0 commit c7061af
Show file tree
Hide file tree
Showing 7 changed files with 332 additions and 0 deletions.
3 changes: 3 additions & 0 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ node {

populateGlobalVariables()
try {
stage('Package sequila-py') {
// Build and publish the Python package in a single bash invocation:
// activate the virtualenv under /sequila, cd into python/, build the sdist and wheel
// with python3.6, validate the artifacts with `twine check`, upload them to the
// `zsibio` repository, then deactivate the virtualenv.
// The steps are chained with `&&`, so a failing step prevents the later ones from running.
sh 'bash -c "source /sequila/bin/activate && cd python && python3.6 setup.py sdist bdist_wheel && twine check dist/* && twine upload -r zsibio dist/* && deactivate"'
}
stage('Test Scala code') {

echo 'Testing Scala code....'
Expand Down
247 changes: 247 additions & 0 deletions examples/bdg-sequila.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"import os\n",
"os.getcwd()\n",
"import sys\n",
"sys.path.append(\"/Users/marek/.virtualenvs/bdg-sequila/lib/python3.7/site-packages\")\n",
"sys.path.append(\"/Users/marek/git/forks/bdg-sequila/python\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import findspark\n",
"from pyspark.sql import SparkSession, DataFrame, SQLContext\n",
"findspark.init()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession\n",
"spark = SparkSession \\\n",
".builder \\\n",
".config('spark.ui.enabled','false') \\\n",
".config('spark.driver.memory','2g') \\\n",
".config('spark.driver.bindAddress', 'localhost') \\\n",
".config('spark.driver.host', 'localhost') \\\n",
".appName('SeQuiLa') \\\n",
".config('spark.jars','/Users/marek/git/forks/bdg-sequila/target/scala-2.11/bdg-sequila-assembly-0.5.6-spark-2.4.3.jar') \\\n",
".getOrCreate()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#config paths\n",
"bamPath = '/Users/marek/git/forks/bdg-sequila/src/test/resources/NA12878.slice.bam'\n",
"vcfPath = '/Users/marek/git/forks/bdg-sequila/src/test/resources/vcf/test.vcf'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"#register sequila.py\n",
"from sequila import SequilaSession\n",
"ss = SequilaSession(spark)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataFrame[]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# create table\n",
"tableNameBAM = 'test'\n",
"ss.sql(f'CREATE TABLE {tableNameBAM} \\\n",
"USING org.biodatageeks.datasources.BAM.BAMDataSource \\\n",
"OPTIONS(path \"{bamPath}\")')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+----------+-----+---+-----+----+--------------------+--------------------+-----+----------+---------+\n",
"|sampleId|contigName|start|end|cigar|mapq| baseq| sequence|flags|materefind|SAMRecord|\n",
"+--------+----------+-----+---+-----+----+--------------------+--------------------+-----+----------+---------+\n",
"| NA12878| chr1| 34|109| 76M| 2|ACCCADEFFECEFGGFD...|CCCTAACCCTAACCCTA...| 99| 1| null|\n",
"+--------+----------+-----+---+-----+----+--------------------+--------------------+-----+----------+---------+\n",
"\n"
]
}
],
"source": [
"# query table\n",
"ss.sql(f\"SELECT * FROM {tableNameBAM} LIMIT 1\").show()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+---------+-----------+\n",
"|database|tableName|isTemporary|\n",
"+--------+---------+-----------+\n",
"| default| test| false|\n",
"+--------+---------+-----------+\n",
"\n",
"+--------+----------+-----+---+-----+----+--------------------+--------------------+-----+----------+---------+\n",
"|sampleId|contigName|start|end|cigar|mapq| baseq| sequence|flags|materefind|SAMRecord|\n",
"+--------+----------+-----+---+-----+----+--------------------+--------------------+-----+----------+---------+\n",
"| NA12878| chr1| 34|109| 76M| 2|ACCCADEFFECEFGGFD...|CCCTAACCCTAACCCTA...| 99| 1| null|\n",
"+--------+----------+-----+---+-----+----+--------------------+--------------------+-----+----------+---------+\n",
"\n"
]
}
],
"source": [
"ss.sql(\"show tables\").show()\n",
"ss.sql(f\"SELECT * FROM {tableNameBAM} LIMIT 1\").show()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------+-----+---+--------+\n",
"|contigName|start|end|coverage|\n",
"+----------+-----+---+--------+\n",
"| chr1| 34| 34| 1|\n",
"| chr1| 35| 35| 2|\n",
"| chr1| 36| 37| 3|\n",
"| chr1| 38| 40| 4|\n",
"| chr1| 41| 49| 5|\n",
"+----------+-----+---+--------+\n",
"\n"
]
}
],
"source": [
"ss.sql(f\"SELECT * FROM bdg_coverage('{tableNameBAM}','NA12878', 'blocks') LIMIT 5\").show()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataFrame[]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tableNameVCF = 'test_vcf'\n",
"ss.sql(f'CREATE TABLE {tableNameVCF} \\\n",
"USING org.biodatageeks.datasources.VCF.VCFDataSource \\\n",
"OPTIONS(path \"{vcfPath}\")')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------+-----+-----+-----+---------+---+---+----+------+--------------------+---+---+---+-----+---------+\n",
"|contig| pos|start| stop| id|ref|alt|qual|filter| info| gt| gq| dp| hq|sample_id|\n",
"+------+-----+-----+-----+---------+---+---+----+------+--------------------+---+---+---+-----+---------+\n",
"| 20|14370|14369|14370|rs6054257| G| A| 29| PASS|[ns -> 3, db -> D...|0|0| 48| 1|51,51| NA00001|\n",
"| 20|14370|14369|14370|rs6054257| G| A| 29| PASS|[ns -> 3, db -> D...|1|0| 48| 8|51,51| NA00002|\n",
"| 20|14370|14369|14370|rs6054257| G| A| 29| PASS|[ns -> 3, db -> D...|1/1| 43| 5| .,.| NA00003|\n",
"| 20|17330|17329|17330| .| T| A| 3| q10|[ns -> 3, dp -> 1...|0|0| 49| 3|58,50| NA00001|\n",
"| 20|17330|17329|17330| .| T| A| 3| q10|[ns -> 3, dp -> 1...|0|1| 3| 5| 65,3| NA00002|\n",
"+------+-----+-----+-----+---------+---+---+----+------+--------------------+---+---+---+-----+---------+\n",
"\n"
]
}
],
"source": [
"ss.sql(f\"SELECT * FROM {tableNameVCF} LIMIT 5\").show()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"ss.stop()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "bdg-sequila",
"language": "python",
"name": "bdg-sequila"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Empty file added python/README.rst
Empty file.
11 changes: 11 additions & 0 deletions python/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: sequila
dependencies:
- python=3.7
- pytest
- pip
- pip:
- findspark
- pyspark==2.4.3
- setuptools==41.2.0 # Python packaging
- typeguard==2.5.0
- twine==2.0.0 # PyPI publishing
1 change: 1 addition & 0 deletions python/sequila/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from sequila.sequila import *
48 changes: 48 additions & 0 deletions python/sequila/sequila.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from pyspark.sql import SparkSession
from typeguard import check_argument_types


def register(session: SparkSession):
    """Register the SeQuiLa SQL extensions and UDFs through a Spark session.

    A JVM ``SequilaSession`` is created on top of a Hive-enabled JVM
    ``SparkSession`` obtained via ``builder().enableHiveSupport().getOrCreate()``
    and is then passed to ``SequilaRegister.register`` and
    ``UDFRegister.register``.

    :param session: Spark session whose JVM gateway (``session._jvm``) is used
    :return: ``None``
    """
    # typeguard validates the call arguments against the annotations; wrapped
    # in ``assert`` so the check disappears when Python runs with ``-O``.
    assert check_argument_types()

    jvm = session._jvm
    hive_session = (
        jvm.org.apache.spark.sql.SparkSession.builder()
        .enableHiveSupport()
        .getOrCreate()
    )
    jvm_sequila = jvm.org.apache.spark.sql.SequilaSession(hive_session)

    # Strategy/function registration first, UDFs second (same order as before).
    jvm.org.biodatageeks.utils.SequilaRegister.register(jvm_sequila)
    jvm.org.biodatageeks.utils.UDFRegister.register(jvm_sequila)


class SequilaSession(SparkSession):
    """Python session object backed by a JVM ``SequilaSession``.

    Subclasses ``SparkSession`` but populates its private attributes by hand
    in ``__init__`` instead of delegating to ``SparkSession.__init__``.
    """

    def __init__(self, session: SparkSession, jsparkSession=None):
        """Creates a new SequilaSession.

        :param session: existing Spark session; its JVM gateway and its JVM
            session are used to build the JVM ``SequilaSession``, which is
            registered with ``SequilaRegister`` / ``UDFRegister`` and set as
            the JVM default ``SequilaSession``
        :param jsparkSession: optional JVM session object to wrap. When
            ``None``, the JVM default ``SequilaSession`` is used if it is
            defined and its SparkContext is not stopped; otherwise a new JVM
            ``SequilaSession`` is created from the SparkContext.
        """
        from pyspark.sql.context import SQLContext

        jvm = session._jvm

        # Build the JVM-side session, register the extensions on it and make
        # it the JVM default so the lookup further down can find it.
        jvm_sequila = jvm.org.apache.spark.sql.SequilaSession(session._jsparkSession)
        jvm.org.biodatageeks.utils.SequilaRegister.register(jvm_sequila)
        jvm.org.biodatageeks.utils.UDFRegister.register(jvm_sequila)
        jvm.SequilaSession.setDefaultSession(jvm_sequila)

        # The SparkContext is taken from the currently instantiated Python
        # session (class attribute), not from ``session`` itself.
        # NOTE(review): raises AttributeError if ``_instantiatedSession`` is
        # None at this point - confirm that cannot happen for a live session.
        active = SequilaSession._instantiatedSession
        self._sc = active._sc
        self._jsc = self._sc._jsc
        self._jvm = jvm

        if jsparkSession is None:
            default_is_usable = (
                self._jvm.SequilaSession.getDefaultSession().isDefined()
                and not self._jvm.SequilaSession.getDefaultSession().get()
                .sparkContext().isStopped()
            )
            if default_is_usable:
                jsparkSession = self._jvm.SequilaSession.getDefaultSession().get()
            else:
                jsparkSession = self._jvm.SequilaSession(self._jsc.sc())

        self._jsparkSession = jsparkSession
        self._jwrapped = self._jsparkSession.sqlContext()
        self._wrapped = SQLContext(self._sc, self, self._jwrapped)

        # Only take over as the instantiated session when there is none, or
        # when the recorded one no longer has a Java SparkContext attached.
        current = SequilaSession._instantiatedSession
        if current is None or current._sc._jsc is None:
            SequilaSession._instantiatedSession = self
            self._jvm.SparkSession.setDefaultSession(self._jsparkSession)
22 changes: 22 additions & 0 deletions python/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Packaging script for the ``sequila.py`` distribution."""
from pathlib import Path

from setuptools import setup

# Locate README.rst next to this script so the build works regardless of the
# current working directory, and read it with an explicit encoding; read_text
# also closes the file, which the previous bare open().read() did not.
_README_PATH = Path(__file__).resolve().parent / 'README.rst'

setup(
    name='sequila.py',
    version='0.1.0',
    packages=['sequila'],
    install_requires=[
        'typeguard==2.5.0',
        'pyspark==2.4.3',
        'findspark',
    ],
    author='biodatageeks.org',
    description='A SQL-based solution for large-scale genomic analysis',
    long_description=_README_PATH.read_text(encoding='utf-8'),
    long_description_content_type='text/x-rst',
    license='Apache License 2.0',
    classifiers=[
        'Intended Audience :: Developers',
        'Programming Language :: Python :: 3.6',
    ],
    url='https://github.com/ZSI-Bio/bdg-sequila',
)

0 comments on commit c7061af

Please sign in to comment.