Skip to content
This repository was archived by the owner on Oct 23, 2023. It is now read-only.

Commit d894eec

Browse files
authored
Merge pull request #158 from CSCfi/dev
release-1.8.0
2 parents 35d4cc4 + a599cde commit d894eec

File tree

5 files changed

+111
-69
lines changed

5 files changed

+111
-69
lines changed

beacon_api/conf/config.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
title=GA4GHBeacon at CSC
88

99
# Version of the Beacon implementation
10-
version=1.7.2
10+
version=1.8.0
1111

1212
# Author of this software
1313
author=CSC developers

beacon_api/utils/db_load.py

Lines changed: 85 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"""
3838

3939
import os
40+
import sys
4041
import argparse
4142
import json
4243
import itertools
@@ -46,6 +47,7 @@
4647
import asyncpg
4748
from cyvcf2 import VCF
4849

50+
from pathlib import Path
4951
from datetime import datetime
5052

5153
from .logging import LOG
@@ -280,19 +282,19 @@ def _chunks(self, iterable, size):
280282
for first in iterator:
281283
yield itertools.chain([first], itertools.islice(iterator, size - 1))
282284

283-
async def load_datafile(self, vcf, datafile, dataset_id, n=1000):
285+
async def load_datafile(self, vcf, datafile, dataset_id, n=1000, min_ac=1):
284286
"""Parse data from datafile and send it to be inserted."""
285287
LOG.info(f"Read data from {datafile}")
286288
try:
287289
LOG.info("Generate database queue(s)")
288290
data = self._chunks(vcf, n)
289291
for record in data:
290-
await self.insert_variants(dataset_id, list(record))
292+
await self.insert_variants(dataset_id, list(record), min_ac)
291293
LOG.info(f"{datafile} has been processed")
292294
except Exception as e:
293295
LOG.error(f"AN ERROR OCCURRED WHILE GENERATING DB QUEUE -> {e}")
294296

295-
async def insert_variants(self, dataset_id, variants):
297+
async def insert_variants(self, dataset_id, variants, min_ac):
296298
"""Insert variant data to the database."""
297299
LOG.info(f"Received {len(variants)} variants for insertion to {dataset_id}")
298300
try:
@@ -301,67 +303,82 @@ async def insert_variants(self, dataset_id, variants):
301303
LOG.info("Insert variants into the database")
302304
for variant in variants:
303305
# params = (frequency, count, actual variant Type)
304-
# Nothing interesting on the variant with no aaf
305-
# because none of the samples have it
306-
if variant.aaf > 0:
307-
params = self._unpack(variant)
308-
# Coordinates that are read from VCF are 1-based,
309-
# cyvcf2 reads them as 0-based, and they are inserted into the DB as such
310-
311-
# We Process Breakend Records into a different table for now
312-
if params[5] != []:
313-
# await self.insert_mates(dataset_id, variant, params)
314-
# Most likely there will be only one BND per Record
315-
for bnd in params[5]:
306+
params = self._unpack(variant)
307+
# Coordinates that are read from VCF are 1-based,
308+
# cyvcf2 reads them as 0-based, and they are inserted into the DB as such
309+
310+
# params may carry single variants [1] or packed variants [20, 15, 10, 1]
311+
# The first check prunes for single variants, packed variants must be removed afterwards
312+
if params[1][0] >= min_ac:
313+
# Remove packed variants that don't meet the minimum allele count requirements
314+
# Packed variants are always ordered from largest to smallest, this process starts
315+
# popping values from the right (small) side until there are no more small values to pop
316+
while params[1][-1] < min_ac:
317+
params[0].pop() # aaf
318+
params[1].pop() # ac
319+
params[2].pop() # vt
320+
params[3].pop() # alt
321+
if len(params[5]) > 0:
322+
params[5].pop() # bnd
323+
324+
# Nothing interesting on the variant with no aaf
325+
# because none of the samples have it
326+
if variant.aaf > 0:
327+
328+
# We Process Breakend Records into a different table for now
329+
if params[5] != []:
330+
# await self.insert_mates(dataset_id, variant, params)
331+
# Most likely there will be only one BND per Record
332+
for bnd in params[5]:
333+
await self._conn.execute(
334+
"""INSERT INTO beacon_mate_table
335+
(datasetId, chromosome, chromosomeStart, chromosomePos,
336+
mate, mateStart, matePos, reference, alternate, alleleCount,
337+
callCount, frequency, "end")
338+
SELECT ($1), ($2), ($3), ($4),
339+
($5), ($6), ($7), ($8), t.alt, t.ac, ($11), t.freq, ($13)
340+
FROM (SELECT unnest($9::varchar[]) alt, unnest($10::integer[]) ac,
341+
unnest($12::float[]) freq) t
342+
ON CONFLICT (datasetId, chromosome, mate, chromosomePos, matePos)
343+
DO NOTHING""",
344+
dataset_id,
345+
variant.CHROM.replace("chr", ""),
346+
variant.start,
347+
variant.ID,
348+
bnd[0].replace("chr", ""),
349+
bnd[1],
350+
bnd[6],
351+
variant.REF,
352+
params[3],
353+
params[1],
354+
params[4],
355+
params[0],
356+
variant.end,
357+
)
358+
else:
316359
await self._conn.execute(
317-
"""INSERT INTO beacon_mate_table
318-
(datasetId, chromosome, chromosomeStart, chromosomePos,
319-
mate, mateStart, matePos, reference, alternate, alleleCount,
320-
callCount, frequency, "end")
321-
SELECT ($1), ($2), ($3), ($4),
322-
($5), ($6), ($7), ($8), t.alt, t.ac, ($11), t.freq, ($13)
323-
FROM (SELECT unnest($9::varchar[]) alt, unnest($10::integer[]) ac,
324-
unnest($12::float[]) freq) t
325-
ON CONFLICT (datasetId, chromosome, mate, chromosomePos, matePos)
326-
DO NOTHING""",
360+
"""INSERT INTO beacon_data_table
361+
(datasetId, chromosome, start, reference, alternate,
362+
"end", aggregatedVariantType, alleleCount, callCount, frequency, variantType)
363+
SELECT ($1), ($2), ($3), ($4), t.alt, ($6), ($7), t.ac, ($9), t.freq, t.vt
364+
FROM (SELECT unnest($5::varchar[]) alt, unnest($8::integer[]) ac,
365+
unnest($10::float[]) freq, unnest($11::varchar[]) as vt) t
366+
ON CONFLICT (datasetId, chromosome, start, reference, alternate)
367+
DO NOTHING""",
327368
dataset_id,
328369
variant.CHROM.replace("chr", ""),
329370
variant.start,
330-
variant.ID,
331-
bnd[0].replace("chr", ""),
332-
bnd[1],
333-
bnd[6],
334371
variant.REF,
335372
params[3],
373+
variant.end,
374+
variant.var_type.upper(),
336375
params[1],
337376
params[4],
338377
params[0],
339-
variant.end,
378+
params[2],
340379
)
341-
else:
342-
await self._conn.execute(
343-
"""INSERT INTO beacon_data_table
344-
(datasetId, chromosome, start, reference, alternate,
345-
"end", aggregatedVariantType, alleleCount, callCount, frequency, variantType)
346-
SELECT ($1), ($2), ($3), ($4), t.alt, ($6), ($7), t.ac, ($9), t.freq, t.vt
347-
FROM (SELECT unnest($5::varchar[]) alt, unnest($8::integer[]) ac,
348-
unnest($10::float[]) freq, unnest($11::varchar[]) as vt) t
349-
ON CONFLICT (datasetId, chromosome, start, reference, alternate)
350-
DO NOTHING""",
351-
dataset_id,
352-
variant.CHROM.replace("chr", ""),
353-
variant.start,
354-
variant.REF,
355-
params[3],
356-
variant.end,
357-
variant.var_type.upper(),
358-
params[1],
359-
params[4],
360-
params[0],
361-
params[2],
362-
)
363-
364-
LOG.debug("Variants have been inserted")
380+
381+
LOG.debug("Variants have been inserted")
365382
except Exception as e:
366383
LOG.error(f"AN ERROR OCCURRED WHILE ATTEMPTING TO INSERT VARIANTS -> {e}")
367384

@@ -379,6 +396,7 @@ async def init_beacon_db(arguments=None):
379396
"""Run database operations here."""
380397
# Fetch command line arguments
381398
args = parse_arguments(arguments)
399+
validate_arguments(args)
382400

383401
# Initialise the database connection
384402
db = BeaconDB()
@@ -400,12 +418,22 @@ async def init_beacon_db(arguments=None):
400418
dataset_id = await db.load_metadata(vcf, args.metadata, args.datafile)
401419

402420
# Insert data into the database
403-
await db.load_datafile(vcf, args.datafile, dataset_id)
421+
await db.load_datafile(vcf, args.datafile, dataset_id, min_ac=int(args.min_allele_count))
404422

405423
# Close the database connection
406424
await db.close()
407425

408426

427+
def validate_arguments(arguments):
    """Check that given arguments are valid.

    Terminates the program through ``sys.exit`` with an explanatory message
    when a check fails; returns ``None`` when every check passes.

    :param arguments: parsed command line arguments providing the attributes
        ``datafile`` and ``metadata`` (paths) and ``min_allele_count`` (string)
    """
    if not Path(arguments.datafile).is_file():
        sys.exit(f"Could not find datafile: {arguments.datafile}")
    if not Path(arguments.metadata).is_file():
        sys.exit(f"Could not find metadata file: {arguments.metadata}")

    min_allele_count = arguments.min_allele_count
    # str.isdigit() alone is not enough: it is also True for characters such as
    # superscript digits ("²") that int() refuses to parse. Confirm the int()
    # conversion here so a bad value ends in a clean exit message instead of an
    # unhandled ValueError when the value is converted.
    try:
        is_valid = min_allele_count.isdigit()
        if is_valid:
            int(min_allele_count)
    except ValueError:
        is_valid = False
    if not is_valid:
        sys.exit(f"Minimum allele count --min_allele_count must be a positive integer, received: {min_allele_count}")
435+
436+
409437
def parse_arguments(arguments):
410438
"""Parse command line arguments."""
411439
parser = argparse.ArgumentParser(
@@ -415,7 +443,8 @@ def parse_arguments(arguments):
415443
)
416444
parser.add_argument("datafile", help=".vcf file containing variant information")
417445
parser.add_argument("metadata", help=".json file containing metadata associated to datafile")
418-
parser.add_argument("--samples", default=None, help="comma separated string of samples to process")
446+
parser.add_argument("--samples", default=None, help="comma separated string of samples to process. EXPERIMENTAL")
447+
parser.add_argument("--min_allele_count", default="1", help="minimum allele count can be raised to ignore rare variants. Default value is 1")
419448
return parser.parse_args(arguments)
420449

421450

docs/instructions.rst

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -160,10 +160,11 @@ Starting PostgreSQL using Docker:
160160
.. code-block:: console
161161
162162
cd beacon-python
163-
docker run -e POSTGRES_USER=beacon \
163+
docker run -d \
164+
-e POSTGRES_USER=beacon \
164165
-e POSTGRES_PASSWORD=beacon \
165166
-e POSTGRES_DB=beacondb \
166-
-v "$PWD/data":/docker-entrypoint-initdb.d
167+
-v "$PWD/data":/docker-entrypoint-initdb.d \
167168
-p 5432:5432 postgres:11.6
168169
169170
.. hint:: If one has their own database the ``beacon_init`` utility can be skipped,
@@ -182,19 +183,25 @@ For loading datasets to database we provide the ``beacon_init`` utility:
182183

183184
.. code-block:: console
184185
185-
╰─$ beacon_init --help
186-
usage: beacon_init [-h] datafile metadata
186+
$ beacon_init --help
187+
usage: beacon_init [-h] [--samples SAMPLES]
188+
[--min_allele_count MIN_ALLELE_COUNT]
189+
datafile metadata
187190
188191
Load datafiles with associated metadata into the beacon database. See example
189192
data and metadata files in the /data directory.
190193
191194
positional arguments:
192-
datafile .vcf file containing variant information
193-
metadata .json file containing metadata associated to datafile
195+
datafile .vcf file containing variant information
196+
metadata .json file containing metadata associated to datafile
194197
195198
optional arguments:
196-
--samples comma separated string of samples to process
197-
-h, --help show this help message and exit
199+
-h, --help show this help message and exit
200+
--samples SAMPLES comma separated string of samples to process.
201+
EXPERIMENTAL
202+
--min_allele_count MIN_ALLELE_COUNT
203+
minimum allele count can be raised to ignore rare
204+
variants. Default value is 1
198205
199206
As an example, a dataset metadata could be:
200207

@@ -221,12 +228,18 @@ For loading data into the database we can proceed as follows:
221228
222229
$ beacon_init data/ALL.chrMT.phase3_callmom-v0_4.20130502.genotypes.vcf.gz data/example_metadata.json
223230
224-
For loading data into the database from selected samples only we can proceed as follows:
231+
(EXPERIMENTAL) For loading data into the database from selected samples only we can proceed as follows:
225232

226233
.. code-block:: console
227234
228235
$ beacon_init data/ALL.chrMT.phase3_callmom-v0_4.20130502.genotypes.vcf.gz data/example_metadata.json --samples HG0001,HG0002,HG0003
229236
237+
For ignoring rare alleles, set a minimum allele count with ``--min_allele_count``:
238+
239+
.. code-block:: console
240+
241+
$ beacon_init data/ALL.chrMT.phase3_callmom-v0_4.20130502.genotypes.vcf.gz data/example_metadata.json --min_allele_count 20
242+
230243
.. note:: One dataset can have multiple files. To add more files to a dataset, repeat the command above.
231244
The parameters ``callCount`` and ``variantCount`` from the metadata file reflect values of the entire dataset.
232245
These values can be initialised with ``0`` if they are not known and updated in ``beacon_dataset_counts_table`` table.

tests/test_basic.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,15 +59,15 @@ async def create_tables(self, sql_file):
5959
"""Mimic create_tables."""
6060
pass
6161

62-
async def insert_variants(self, dataset_id, variants, len_samples):
62+
async def insert_variants(self, dataset_id, variants, min_ac):
6363
"""Mimic insert_variants."""
6464
pass
6565

6666
async def load_metadata(self, vcf, metafile, datafile):
6767
"""Mimic load_metadata."""
6868
pass
6969

70-
async def load_datafile(self, vcf, datafile, datasetId):
70+
async def load_datafile(self, vcf, datafile, datasetId, n=1000, min_ac=1):
7171
"""Mimic load_datafile."""
7272
return ["datasetId", "variants"]
7373

tests/test_db_load.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,7 @@ async def test_insert_variants(self, db_mock, mock_log):
312312
db_mock.return_value = Connection()
313313
await self._db.connection()
314314
db_mock.assert_called()
315-
await self._db.insert_variants('DATASET1', ['C'])
315+
await self._db.insert_variants('DATASET1', ['C'], 1)
316316
# Should assert logs
317317
mock_log.info.mock_calls = ['Received 1 variants for insertion to DATASET1',
318318
'Insert variants into the database']

0 commit comments

Comments
 (0)