SoLoad
This is the main repository for SoLoad -- a package that defines processes for removing, extracting, and loading data to the data warehouse.
SOLOAD ETL Library
SOLOAD is an ETL library, currently supporting pipelines from HAS (OpenMRS), GMS, and Census to our data warehouse. For easy install:
cd soidatawarehouse/soload
pip install -r requirements.txt
python setup.py install
You can also build a whl artifact for deployment.
Interaction and execution of the pipeline is fairly straightforward. Pipelines are defined and run in SoFlow. See below for an example of running a pipeline from SoLoad.
CorePipeline()
from soload.pipelines import *
# msi = True if pipeline is being run on the vm
# msi = False if pipeline is being run on a local machine
pipe = CorePipeline('Dev_v01xx', msi = True)
# DEFAULT
pipe.run(sources_to_clear = None, spawn_new=False, clear_core=False,
load_has=False, update_has=False,
load_gms=False, update_gms=False)
# To update has data in Core:
pipe.run(sources_to_clear = None, spawn_new=False, clear_core=False,
load_has=False, update_has=True,
load_gms=False, update_gms=False)
CensusPipeline()
from soload.pipelines import *
pipe = CensusPipeline('Dev_v01xx', msi = True)
# DEFAULT
pipe.run(spawn_new=False, extract_source=False, remove_views = False, create_views = False)
# To extract Census data and create views:
pipe.run(spawn_new=False, extract_source=True, remove_views = True, create_views = True)
OpenMRSPipeline()
from soload.pipelines import *
pipe = OpenMRSPipeline('Dev_v01xx', msi = True)
# DEFAULT
pipe.run(disciplines = pipe.all_disciplines, spawn_new = False, extract_legacy=False,
extract_source=False, remove_views=False, create_views=False,
create_connected_has_table = False)
# To extract OpenMRS data:
pipe.run(disciplines = pipe.all_disciplines, spawn_new = False, extract_legacy=False,
extract_source=True, remove_views=False, create_views=False,
create_connected_has_table = False)
# To drop HAS tables and reload from legacy & source
pipe.run(disciplines = pipe.all_disciplines, spawn_new = True, extract_legacy=True,
extract_source=True, remove_views=False, create_views=False,
create_connected_has_table = False)
GMSPipeline()
from soload.pipelines import *
pipe = GMSPipeline('Dev_v01xx', msi = True)
# DEFAULT
pipe.run(programs = pipe.all_programs, spawn_new = False, extract_source=False, clear_source=False)
# To extract GMS data:
pipe.run(programs = pipe.all_programs, spawn_new = True, extract_source=True, clear_source=False)
CommonPipeline()
To extract common tables from csv files and load to warehouse:
from soload.pipelines import *
pipe = CommonPipeline('Dev_v01xx', msi = True)
# DEFAULT
pipe.run(spawn_new=False, create_WorldGames = False, create_RegionalGames = False,
create_ProgramsCDC = False, create_CountriesCDC = False,
create_MultipleEventOwners = False, create_CountryISOCodes = False)
# Drop ProgramsCDC and reload
pipe.run(spawn_new=True, create_WorldGames = False, create_RegionalGames = False,
create_ProgramsCDC = True, create_CountriesCDC = False,
create_MultipleEventOwners = False, create_CountryISOCodes = False)
SHEPipeline()
To extract SHE summary tables from csv files and load to warehouse:
from soload.pipelines import *
pipe = SHEPipeline('Dev_v01xx', msi = True)
# DEFAULT
pipe.run(spawn_new = False, create_summary_table=False, create_long_summary_table = False)
SportPartnershipPipeline()
To extract Sport Partnership tables from csv files and load to warehouse: