Working on #1 - FDA import process works. 3 hours for first import, 1.25 hours on subsequent updates.

This commit is contained in:
2013-10-27 18:33:30 -04:00
parent e2bc602264
commit e0232aa02d
13 changed files with 575 additions and 71 deletions

View File

@@ -2,7 +2,11 @@ from __future__ import with_statement
from alembic import context
from sqlalchemy import engine_from_config, pool
from logging.config import fileConfig
import mercy.MercyApplication
import mercy.models
import mercy.config
db = mercy.MercyApplication.get_db()
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
@@ -15,7 +19,7 @@ fileConfig(config.config_file_name)
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = None
target_metadata = db.Model.metadata
# other values from the config, defined by the needs of env.py,
# can be acquired:
@@ -47,6 +51,10 @@ def run_migrations_online():
and associate a connection with the context.
"""
alembic_config = config.get_section(config.config_ini_section)
alembic_config['sqlalchemy.url'] = mercy.config.SQLALCHEMY_URI
engine = engine_from_config(
config.get_section(config.config_ini_section),
prefix='sqlalchemy.',

View File

@@ -0,0 +1,158 @@
"""Initial schema
Revision ID: 2b64ad923738
Revises: None
Create Date: 2013-10-27 11:46:11.475707
"""
# revision identifiers, used by Alembic.
# NOTE(review): down_revision is None, and revision 5ac93692b0ab ("Initial
# revision"), added in the same commit, is also a root (down_revision = None)
# that creates tables with the same names. Confirm which of the two is meant
# to be the single base of the migration history.
revision = '2b64ad923738'
down_revision = None
from alembic import op
import sqlalchemy as sa
def upgrade():
    """Create the FDA and DrugBank tables for the initial schema.

    Tables are created parents-first so that every foreign key refers to a
    table that already exists.

    The drugbank_* child tables (synonyms, packager maps, generic names,
    prices, manufacturer maps, category maps) use the surrogate ``id`` column
    as their primary key. The generated draft flagged ``id`` as
    ``primary_key=True`` but also declared ``PrimaryKeyConstraint('drug_id')``;
    the explicit constraint takes precedence, which made ``drug_id`` the key
    and therefore allowed only one row per drug in each of those tables.

    NOTE(review): a database already migrated with the earlier draft keeps the
    old ``drug_id`` keys and needs a follow-up migration to match.
    """
    ### commands auto generated by Alembic - please adjust! ###
    op.create_table('fda_products',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('productid', sa.String(), index=True, unique=True, nullable=False),
        sa.Column('ndc', sa.String(), index=True, nullable=False),
        sa.Column('type', sa.String(), nullable=False),
        sa.Column('proprietaryName', sa.String(), nullable=False),
        sa.Column('proprietaryNameSuffix', sa.String(), nullable=True),
        sa.Column('genericName', sa.String(), nullable=False),
        sa.Column('marketingCategoryName', sa.String(), nullable=False),
        sa.Column('labelerName', sa.String(), nullable=False),
        sa.Column('deaSchedule', sa.String(), nullable=False),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_table('drugbank_packagers',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('name', sa.String(), nullable=False),
        sa.Column('url', sa.String(), nullable=True),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('name')
    )
    op.create_table('drugbank_manufacturers',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('name', sa.String(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('name')
    )
    op.create_table('fda_pharma_classes',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('name', sa.String(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('name')
    )
    op.create_table('fda_product_substances',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('name', sa.String(), nullable=False),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_table('drugbank_categories',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('name', sa.String(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('name')
    )
    # NOTE(review): composite key (id, pharma_id) is left exactly as generated;
    # confirm whether a plain 'id' key was intended here as well.
    op.create_table('fda_pharma_class_maps',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('product_id', sa.Integer(), nullable=False),
        sa.Column('pharma_id', sa.Integer(), nullable=False),
        sa.ForeignKeyConstraint(['pharma_id'], ['fda_pharma_classes.id'], ),
        sa.ForeignKeyConstraint(['product_id'], ['fda_products.id'], ),
        sa.PrimaryKeyConstraint('id', 'pharma_id')
    )
    op.create_table('drugbank_drugs',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('dbid', sa.String(), unique=True, nullable=True),
        sa.Column('name', sa.String(), nullable=False),
        sa.Column('indication', sa.String(), nullable=False),
        sa.Column('fda_product_id', sa.String(), nullable=True),
        sa.Column('wikipedia', sa.String(), nullable=True),
        sa.ForeignKeyConstraint(['fda_product_id'], ['fda_products.productid'], ),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_table('fda_product_substance_map',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('product_id', sa.Integer(), nullable=False),
        sa.Column('substance_id', sa.Integer(), nullable=False),
        sa.Column('quantity', sa.Float(), nullable=False),
        sa.Column('units', sa.String(), nullable=False),
        sa.ForeignKeyConstraint(['product_id'], ['fda_products.id'], ),
        sa.ForeignKeyConstraint(['substance_id'], ['fda_product_substances.id'], ),
        sa.PrimaryKeyConstraint('id')
    )
    # From here on every table is a child of drugbank_drugs and is keyed on
    # its own surrogate 'id' so a drug can own any number of rows.
    op.create_table('drugbank_synonyms',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('drug_id', sa.Integer(), nullable=False),
        sa.Column('name', sa.String(), nullable=False),
        sa.ForeignKeyConstraint(['drug_id'], ['drugbank_drugs.id'], ),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_table('drugbank_packager_maps',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('drug_id', sa.Integer(), nullable=False),
        sa.Column('packager_id', sa.Integer(), nullable=False),
        sa.ForeignKeyConstraint(['drug_id'], ['drugbank_drugs.id'], ),
        sa.ForeignKeyConstraint(['packager_id'], ['drugbank_packagers.id'], ),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_table('drugbank_genericnames',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('drug_id', sa.Integer(), nullable=False),
        sa.Column('name', sa.String(), nullable=False),
        sa.ForeignKeyConstraint(['drug_id'], ['drugbank_drugs.id'], ),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_table('drugbank_prices',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('drug_id', sa.Integer(), nullable=False),
        sa.Column('description', sa.String(), nullable=False),
        sa.Column('currency', sa.String(), nullable=False),
        sa.Column('cost', sa.Float(), nullable=False),
        sa.Column('unit', sa.String(), nullable=False),
        sa.ForeignKeyConstraint(['drug_id'], ['drugbank_drugs.id'], ),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_table('drugbank_manufacturer_maps',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('drug_id', sa.Integer(), nullable=False),
        sa.Column('manufacturer_id', sa.Integer(), nullable=False),
        sa.ForeignKeyConstraint(['drug_id'], ['drugbank_drugs.id'], ),
        sa.ForeignKeyConstraint(['manufacturer_id'], ['drugbank_manufacturers.id'], ),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_table('drugbank_category_maps',
        sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True, nullable=False),
        sa.Column('drug_id', sa.Integer(), nullable=False),
        sa.Column('category_id', sa.Integer(), nullable=False),
        sa.ForeignKeyConstraint(['category_id'], ['drugbank_categories.id'], ),
        sa.ForeignKeyConstraint(['drug_id'], ['drugbank_drugs.id'], ),
        sa.PrimaryKeyConstraint('id')
    )
    ### end Alembic commands ###
def downgrade():
    """Drop every table created by ``upgrade()``.

    The names are listed in drop order: tables that hold foreign keys come
    before the tables those keys point at.
    """
    tables_in_drop_order = (
        'drugbank_category_maps',
        'drugbank_manufacturer_maps',
        'drugbank_prices',
        'drugbank_genericnames',
        'drugbank_packager_maps',
        'drugbank_synonyms',
        'fda_product_substance_map',
        'drugbank_drugs',
        'fda_pharma_class_maps',
        'drugbank_categories',
        'fda_product_substances',
        'fda_pharma_classes',
        'fda_products',
        'drugbank_manufacturers',
        'drugbank_packagers',
    )
    for table_name in tables_in_drop_order:
        op.drop_table(table_name)

View File

@@ -0,0 +1,125 @@
"""Initial revision
Revision ID: 5ac93692b0ab
Revises: None
Create Date: 2013-10-26 13:25:27.853595
"""
# revision identifiers, used by Alembic.
# NOTE(review): down_revision is None, and revision 2b64ad923738 ("Initial
# schema"), added in the same commit, is also a root (down_revision = None)
# that creates tables with the same names. Confirm which of the two is meant
# to be the single base of the migration history.
revision = '5ac93692b0ab'
down_revision = None
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
def upgrade():
    """Create the DrugBank and FDA tables, referenced tables first.

    Lookup tables (manufacturers, packagers, categories) and fda_products are
    created before drugbank_drugs and the per-drug tables whose foreign keys
    point at them.
    """
    ### commands auto generated by Alembic - please adjust! ###
    op.create_table('drugbank_manufacturers',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('name', sa.String(), nullable=False),
    sa.PrimaryKeyConstraint('id')
    )
    op.create_table('drugbank_packagers',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('name', sa.String(), nullable=False),
    sa.Column('url', sa.String(), nullable=True),
    sa.PrimaryKeyConstraint('id')
    )
    op.create_table('drugbank_categories',
    sa.Column('id', sa.Integer(), nullable=False),
    sa.Column('name', sa.String(), nullable=False),
    sa.PrimaryKeyConstraint('id')
    )
    # fda_products uses a string primary key.
    op.create_table('fda_products',
    sa.Column('id', sa.String(), nullable=False),
    sa.Column('ndc', sa.String(), nullable=False),
    sa.Column('type', sa.String(), nullable=False),
    sa.Column('proprietaryName', sa.String(), nullable=False),
    sa.Column('proprietaryNameSuffix', sa.String(), nullable=True),
    sa.Column('genericName', sa.String(), nullable=False),
    sa.Column('marketingCategoryName', sa.String(), nullable=False),
    sa.Column('labelerName', sa.String(), nullable=False),
    sa.Column('deaSchedule', sa.String(), nullable=False),
    sa.PrimaryKeyConstraint('id')
    )
    # NOTE(review): UniqueConstraint('id') repeats the uniqueness the primary
    # key on 'id' already provides.
    op.create_table('drugbank_drugs',
    sa.Column('id', sa.String(), nullable=False),
    sa.Column('name', sa.String(), nullable=False),
    sa.Column('indication', sa.String(), nullable=False),
    sa.Column('ndc_id', sa.String(), nullable=True),
    sa.Column('wikipedia', sa.String(), nullable=True),
    sa.ForeignKeyConstraint(['ndc_id'], ['fda_products.id'], ),
    sa.PrimaryKeyConstraint('id'),
    sa.UniqueConstraint('id')
    )
    # NOTE(review): fda_product_id alone is the primary key, so this table can
    # hold one substance row per product -- confirm that is intended.
    op.create_table('fda_product_substances',
    sa.Column('fda_product_id', sa.String(), nullable=False),
    sa.Column('substanceName', sa.String(), nullable=False),
    sa.Column('strengthNumber', sa.Float(), nullable=False),
    sa.Column('strengthUnit', sa.String(), nullable=False),
    sa.Column('pharmaClasses', postgresql.ARRAY(sa.String()), nullable=False),
    sa.ForeignKeyConstraint(['fda_product_id'], ['fda_products.id'], ),
    sa.PrimaryKeyConstraint('fda_product_id')
    )
    # NOTE(review): each of the remaining tables is keyed on drug_id alone,
    # which limits it to one row per drug -- confirm that is intended.
    op.create_table('drugbank_synonyms',
    sa.Column('drug_id', sa.String(), nullable=False),
    sa.Column('name', sa.String(), nullable=False),
    sa.ForeignKeyConstraint(['drug_id'], ['drugbank_drugs.id'], ),
    sa.PrimaryKeyConstraint('drug_id')
    )
    op.create_table('drugbank_packager_maps',
    sa.Column('drug_id', sa.String(), nullable=False),
    sa.Column('packager_id', sa.Integer(), nullable=False),
    sa.ForeignKeyConstraint(['drug_id'], ['drugbank_drugs.id'], ),
    sa.ForeignKeyConstraint(['packager_id'], ['drugbank_packagers.id'], ),
    sa.PrimaryKeyConstraint('drug_id')
    )
    op.create_table('drugbank_genericnames',
    sa.Column('drug_id', sa.String(), nullable=False),
    sa.Column('name', sa.String(), nullable=False),
    sa.ForeignKeyConstraint(['drug_id'], ['drugbank_drugs.id'], ),
    sa.PrimaryKeyConstraint('drug_id')
    )
    op.create_table('drugbank_prices',
    sa.Column('drug_id', sa.String(), nullable=False),
    sa.Column('description', sa.String(), nullable=False),
    sa.Column('currency', sa.String(), nullable=False),
    sa.Column('cost', sa.Float(), nullable=False),
    sa.Column('unit', sa.String(), nullable=False),
    sa.ForeignKeyConstraint(['drug_id'], ['drugbank_drugs.id'], ),
    sa.PrimaryKeyConstraint('drug_id')
    )
    op.create_table('drugbank_manufacturer_maps',
    sa.Column('drug_id', sa.String(), nullable=False),
    sa.Column('manufacturer_id', sa.Integer(), nullable=False),
    sa.ForeignKeyConstraint(['drug_id'], ['drugbank_drugs.id'], ),
    sa.ForeignKeyConstraint(['manufacturer_id'], ['drugbank_manufacturers.id'], ),
    sa.PrimaryKeyConstraint('drug_id')
    )
    op.create_table('drugbank_category_maps',
    sa.Column('drug_id', sa.String(), nullable=False),
    sa.Column('category_id', sa.Integer(), nullable=False),
    sa.ForeignKeyConstraint(['category_id'], ['drugbank_categories.id'], ),
    sa.ForeignKeyConstraint(['drug_id'], ['drugbank_drugs.id'], ),
    sa.PrimaryKeyConstraint('drug_id')
    )
    ### end Alembic commands ###
def downgrade():
    """Drop every table created by ``upgrade()``.

    The names are listed in drop order: tables that hold foreign keys come
    before the tables those keys point at.
    """
    tables_in_drop_order = (
        'drugbank_category_maps',
        'drugbank_manufacturer_maps',
        'drugbank_prices',
        'drugbank_genericnames',
        'drugbank_packager_maps',
        'drugbank_synonyms',
        'fda_product_substances',
        'drugbank_drugs',
        'fda_products',
        'drugbank_categories',
        'drugbank_packagers',
        'drugbank_manufacturers',
    )
    for table_name in tables_in_drop_order:
        op.drop_table(table_name)

1
mercy/config.py Normal file
View File

@@ -0,0 +1 @@
import os

# Database connection URL used by the application and by the Alembic env.
# Set the MERCY_SQLALCHEMY_URI environment variable to point at a different
# server or to keep real credentials out of version control; when it is not
# set, the historical default below is used unchanged.
SQLALCHEMY_URI = os.environ.get(
    'MERCY_SQLALCHEMY_URI',
    'postgresql://mercy:mercy@postgresql.aklabs.net/mercy',
)

3
mercy/exceptions.py Normal file
View File

@@ -0,0 +1,3 @@
class CorruptTarError(Exception):
    """Exception named for a corrupt tar archive.

    The importer test suite expects ``FDAImporter.read()`` to raise this for
    its corrupt fixture archive. Adds nothing to ``Exception``.
    """

View File

8
mercy/importers/fda.py Normal file
View File

@@ -0,0 +1,8 @@
import mercy.db
class FDAImporter:
    """Importer for an FDA data file (reading is not implemented yet)."""

    def __init__(self, *args, **kwargs):
        """Create the importer and its database handle.

        Positional and keyword arguments are accepted but not used.
        """
        self.__database = mercy.db.Database()

    def read(self, fname, startIdx=0):
        """Import the file *fname*.

        Args:
            fname: path of the file to import.
            startIdx: start index forwarded by the ``mercy-import-fda``
                script; accepted so that call does not fail on the
                signature. Defaults to 0.

        Raises:
            NotImplementedError: always, until the import is implemented.
                This is still an ``Exception``, as before.
        """
        raise NotImplementedError("FDAImporter.read doesn't do anything yet")

3
mercy/models/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
# Import each model module so that importing the package defines all model
# classes. Explicit relative imports always resolve to this package's own
# modules, never to a same-named top-level module, and they also work on
# Python 3, where the implicit form is not supported.
from . import simplemodel
from . import fda
from . import drugbank

69
mercy/models/drugbank.py Normal file
View File

@@ -0,0 +1,69 @@
import sqlalchemy as sa
from mercy.models.simplemodel import SimpleModel
import mercy.MercyApplication
import sqlalchemy.dialects.postgresql as pgdialect
# Database handle obtained from the application module; the model classes in
# this module derive from db.Model.
db = mercy.MercyApplication.get_db()
class Drug(SimpleModel, db.Model):
    """ORM mapping for the ``drugbank_drugs`` table.

    ``ndc_id`` is an optional foreign key to ``fda_products.id``.
    """

    __tablename__ = "drugbank_drugs"

    id = sa.Column(
        sa.String,
        unique=True,
        primary_key=True,
    )
    name = sa.Column(
        sa.String,
        index=True,
        nullable=False,
    )
    indication = sa.Column(sa.String, nullable=False)
    ndc_id = sa.Column(
        sa.String,
        sa.ForeignKey('fda_products.id'),
        nullable=True,
    )
    wikipedia = sa.Column(sa.String, nullable=True)

    # Attribute name -> declared type, read by SimpleModel.__repr__.
    __repr_keys__ = {
        'id': basestring,
        'name': basestring,
        'ndc_id': basestring,
    }
class Price(SimpleModel, db.Model):
    """ORM mapping for the ``drugbank_prices`` table.

    NOTE(review): ``drug_id`` is the only primary-key column, so the table
    holds at most one price row per drug -- confirm that is intended.
    """

    __tablename__ = "drugbank_prices"

    drug_id = sa.Column(
        sa.String,
        sa.ForeignKey(Drug.id),
        nullable=False,
        primary_key=True,
    )
    description = sa.Column(sa.String, nullable=False)
    currency = sa.Column(sa.String, nullable=False)
    cost = sa.Column(
        sa.Float,
        index=True,
        nullable=False,
    )
    unit = sa.Column(sa.String, nullable=False)
class CategoryName(SimpleModel, db.Model):
    """ORM mapping for the ``drugbank_categories`` table (id and name)."""

    __tablename__ = "drugbank_categories"

    id = sa.Column(
        sa.Integer,
        nullable=False,
        autoincrement=True,
        primary_key=True,
    )
    name = sa.Column(sa.String, nullable=False)
class CategoryMap(SimpleModel, db.Model):
    """ORM mapping for ``drugbank_category_maps``: links a drug to a category.

    NOTE(review): ``drug_id`` is the only primary-key column, so a drug can
    be mapped to at most one category -- confirm that is intended.
    """

    __tablename__ = "drugbank_category_maps"

    drug_id = sa.Column(
        sa.String,
        sa.ForeignKey(Drug.id),
        nullable=False,
        primary_key=True,
    )
    category_id = sa.Column(
        sa.Integer,
        sa.ForeignKey(CategoryName.id),
        nullable=False,
    )
class Packager(SimpleModel, db.Model):
    """ORM mapping for the ``drugbank_packagers`` table (name, optional url)."""

    __tablename__ = "drugbank_packagers"

    id = sa.Column(
        sa.Integer,
        nullable=False,
        autoincrement=True,
        primary_key=True,
    )
    name = sa.Column(sa.String, nullable=False)
    url = sa.Column(sa.String, nullable=True)
class PackagerMap(SimpleModel, db.Model):
    """ORM mapping for ``drugbank_packager_maps``: links a drug to a packager.

    NOTE(review): ``drug_id`` is the only primary-key column, so a drug can
    be mapped to at most one packager -- confirm that is intended.
    """

    __tablename__ = "drugbank_packager_maps"

    drug_id = sa.Column(
        sa.String,
        sa.ForeignKey(Drug.id),
        nullable=False,
        primary_key=True,
    )
    packager_id = sa.Column(
        sa.Integer,
        sa.ForeignKey(Packager.id),
        nullable=False,
    )
class Manufacturer(SimpleModel, db.Model):
    """ORM mapping for the ``drugbank_manufacturers`` table (id and name)."""

    __tablename__ = "drugbank_manufacturers"

    id = sa.Column(
        sa.Integer,
        nullable=False,
        autoincrement=True,
        primary_key=True,
    )
    name = sa.Column(sa.String, nullable=False)
class ManufacturerMap(SimpleModel, db.Model):
    """ORM mapping for ``drugbank_manufacturer_maps``: drug-to-manufacturer link.

    NOTE(review): ``drug_id`` is the only primary-key column, so a drug can
    be mapped to at most one manufacturer -- confirm that is intended.
    """

    __tablename__ = "drugbank_manufacturer_maps"

    drug_id = sa.Column(
        sa.String,
        sa.ForeignKey(Drug.id),
        nullable=False,
        primary_key=True,
    )
    manufacturer_id = sa.Column(
        sa.Integer,
        sa.ForeignKey(Manufacturer.id),
        nullable=False,
    )
class GenericName(SimpleModel, db.Model):
    """ORM mapping for the ``drugbank_genericnames`` table.

    NOTE(review): ``drug_id`` is the only primary-key column, so the table
    holds at most one generic name per drug -- confirm that is intended.
    """

    __tablename__ = "drugbank_genericnames"

    drug_id = sa.Column(
        sa.String,
        sa.ForeignKey(Drug.id),
        nullable=False,
        primary_key=True,
    )
    name = sa.Column(sa.String, nullable=False)
class Synonym(SimpleModel, db.Model):
    """ORM mapping for the ``drugbank_synonyms`` table.

    NOTE(review): ``drug_id`` is the only primary-key column, so the table
    holds at most one synonym per drug -- confirm that is intended.
    """

    __tablename__ = "drugbank_synonyms"

    drug_id = sa.Column(
        sa.String,
        sa.ForeignKey(Drug.id),
        nullable=False,
        primary_key=True,
    )
    name = sa.Column(sa.String, nullable=False)

37
mercy/models/fda.py Normal file
View File

@@ -0,0 +1,37 @@
import sqlalchemy as sa
from mercy.models.simplemodel import SimpleModel
import mercy.MercyApplication
import sqlalchemy.dialects.postgresql as pgdialect
# Database handle obtained from the application module; the model classes in
# this module derive from db.Model.
db = mercy.MercyApplication.get_db()
class Product(SimpleModel, db.Model):
    """ORM mapping for the ``fda_products`` table.

    Column attribute names are camelCase; ``proprietaryName`` and
    ``genericName`` are indexed.
    """

    __tablename__ = 'fda_products'

    id = sa.Column(sa.String, primary_key=True)
    ndc = sa.Column(sa.String, nullable=False)
    type = sa.Column(sa.String, nullable=False)
    proprietaryName = sa.Column(
        sa.String,
        index=True,
        nullable=False,
    )
    proprietaryNameSuffix = sa.Column(sa.String)
    genericName = sa.Column(
        sa.String,
        index=True,
        nullable=False,
    )
    marketingCategoryName = sa.Column(sa.String, nullable=False)
    labelerName = sa.Column(sa.String, nullable=False)
    deaSchedule = sa.Column(sa.String, nullable=False)

    # Attribute name -> declared type, read by SimpleModel.__repr__.
    __repr_keys__ = {
        'id': basestring,
        'ndc': basestring,
        'genericName': basestring,
        'proprietaryName': basestring,
        'proprietaryNameSuffix': basestring,
    }
class ProductSubstance(SimpleModel, db.Model):
    """ORM mapping for the ``fda_product_substances`` table.

    ``pharmaClasses`` is a PostgreSQL array of strings.

    NOTE(review): ``fda_product_id`` is the only primary-key column, so the
    table holds at most one substance row per product -- confirm intended.
    """

    __tablename__ = 'fda_product_substances'

    fda_product_id = sa.Column(
        sa.String,
        sa.ForeignKey(Product.id),
        nullable=False,
        primary_key=True,
    )
    substanceName = sa.Column(sa.String, nullable=False)
    strengthNumber = sa.Column(sa.Float, nullable=False)
    strengthUnit = sa.Column(sa.String, nullable=False)
    pharmaClasses = sa.Column(
        pgdialect.ARRAY(sa.String),
        nullable=False,
    )

View File

@@ -0,0 +1,26 @@
import mercy.MercyApplication
# Database handle obtained from the application module; SimpleModel delegates
# to db.Model.__init__ and db.Model.__repr__.
db = mercy.MercyApplication.get_db()
class SimpleModel():
    """Mixin giving models keyword construction and a compact ``repr``.

    A class may define ``__repr_keys__``, a dict mapping attribute names to
    their declared type. When it is present, ``repr()`` shows those
    attributes' values on the instance, quoting the ones declared as text.
    """

    def __init__(self, *args, **kwargs):
        """Initialise via ``db.Model`` and assign each keyword as an attribute.

        Raises:
            AttributeError: if a keyword does not name an existing attribute.
        """
        db.Model.__init__(self, *args, **kwargs)
        for name, value in kwargs.items():
            if not hasattr(self, name):
                raise AttributeError(
                    "Invalid attribute {} => {}".format(name, value))
            setattr(self, name, value)

    def __repr__(self, *args, **kwargs):
        """Return ``<ClassName(v1, v2, ...)>`` built from ``__repr_keys__``.

        Falls back to ``db.Model.__repr__`` when the class does not define
        ``__repr_keys__``.
        """
        try:
            repr_keys = getattr(self.__class__, "__repr_keys__")
        except AttributeError:
            return db.Model.__repr__(self, *args, **kwargs)

        # Python 2 declares text columns as basestring; Python 3 only has str.
        try:
            text_types = (basestring,)
        except NameError:
            text_types = (str,)

        values = []
        for name, otype in repr_keys.items():
            # Read the value from the instance, not the class-level attribute.
            rendered = str(getattr(self, name))
            if otype in text_types:
                rendered = "'{}'".format(rendered)
            # append(): "+=" with a string would add one list item per character.
            values.append(rendered)
        return "<{}({})>".format(self.__class__.__name__, ', '.join(values))

10
scripts/mercy-import-fda Normal file
View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python
import sys
import mercy.importers.fda
def main(argv):
    """Run the FDA import described by *argv* and return the exit status.

    Args:
        argv: full argument vector; ``argv[1]`` is the file to import and the
            optional ``argv[2]`` is an integer passed on as ``startIdx``.

    Returns:
        2 after printing a usage message when the file argument is missing or
        the start index is not an integer; otherwise whatever
        ``FDAImporter.read`` returns.
    """
    program = argv[0] if argv else "mercy-import-fda"
    usage = "usage: {} FILE [START_INDEX]\n".format(program)

    if len(argv) < 2:
        sys.stderr.write(usage)
        return 2

    startIdx = 0
    if len(argv) >= 3:
        try:
            startIdx = int(argv[2])
        except ValueError:
            sys.stderr.write(
                "START_INDEX must be an integer, got {!r}\n".format(argv[2]))
            sys.stderr.write(usage)
            return 2

    return mercy.importers.fda.FDAImporter().read(argv[1], startIdx=startIdx)


if __name__ == "__main__":
    sys.exit(main(sys.argv))

View File

@@ -0,0 +1,56 @@
import os
import nose
from nose.tools import raises
import mercy.models
import mercy.importers.fda
import mercy.exceptions
def _fixture_path(filename):
    """Return the absolute path of *filename* in the sibling ``fixtures`` dir."""
    # Joining ".." onto __file__ and normalising yields this module's directory.
    return os.path.abspath(os.path.join(__file__, "..", "fixtures", filename))


VALID_ROWS = []

# Fixture archives used by the importer tests below.
FIXTUREFILE = _fixture_path("fda_database.tar.gz")
FIXTUREFILE_BAD = _fixture_path("fda_database_bad.tar.gz")
FIXTUREFILE_CORRUPT = _fixture_path("fda_database_corrupt.tar.gz")
@raises(mercy.exceptions.CorruptTarError)
def test_fda_import_fails_on_corrupt_tar():
    """Reading the corrupt fixture archive must raise CorruptTarError."""
    importer = mercy.importers.fda.FDAImporter()
    # Was "impoter.read(...)": the NameError hid the behaviour under test.
    importer.read(FIXTUREFILE_CORRUPT)
def test_fda_import_populates_table():
    """Importing the good fixture must yield the rows listed in VALID_ROWS.

    Each VALID_ROWS entry is a dict of attribute name -> expected value and
    is compared with the Product row at the same position.
    """
    mercy.importers.fda.FDAImporter().read(FIXTUREFILE)
    rows = mercy.models.fda.Product.query.all()

    # TODO: VALID_ROWS is still empty; fill it with the fixture's expected
    # rows, otherwise this only passes when the import stores nothing.
    assert len(rows) == len(VALID_ROWS)

    for row, expected in zip(rows, VALID_ROWS):
        # Rows are model instances, so values are read as attributes; the
        # instances have no len() or item access.
        for key, value in expected.items():
            assert getattr(row, key) == value
@raises(AttributeError, KeyError, ValueError)
def test_fda_import_rejects_bad_records():
    """Reading the fixture with bad records must raise one of the listed errors."""
    # The "def" line had no "()", a SyntaxError for the whole test module.
    importer = mercy.importers.fda.FDAImporter()
    importer.read(FIXTUREFILE_BAD)