
SoLoad

This is the main repository for SoLoad -- a package that defines processes for removing, extracting, and loading data to the data warehouse.

SoLoad ETL Library

SoLoad is an ETL library, currently supporting pipelines from HAS (OpenMRS), GMS, and Census to our data warehouse. For easy install:

  1. cd soidatawarehouse/soload
  2. pip install -r requirements.txt
  3. python setup.py install

You can also build a whl artifact for deployment.

Interaction and execution of the pipeline is fairly straightforward. Pipelines are defined and run in SoFlow. See below for an example of running a pipeline from SoLoad.

CorePipeline()

from soload.pipelines import *
# msi = True if pipeline is being run on the vm
# msi = False if pipeline is being run on a local machine

pipe = CorePipeline('Dev_v01xx', msi = True) 

# DEFAULT
pipe.run(sources_to_clear = None, spawn_new=False, clear_core=False, 
                    load_has=False, update_has=False, 
                    load_gms=False, update_gms=False)


# To update HAS data in Core:

pipe.run(sources_to_clear = None, spawn_new=False, clear_core=False, 
                    load_has=False, update_has=True, 
                    load_gms=False, update_gms=False)

CensusPipeline()

from soload.pipelines import *

pipe = CensusPipeline('Dev_v01xx', msi = True)

# DEFAULT
pipe.run(spawn_new=False, extract_source=False, remove_views = False, create_views = False)

# To extract Census data and create views:
pipe.run(spawn_new=False, extract_source=True, remove_views = True, create_views = True)

OpenMRSPipeline()

from soload.pipelines import *

pipe = OpenMRSPipeline('Dev_v01xx', msi = True)

# DEFAULT
pipe.run(disciplines = pipe.all_disciplines, spawn_new = False, extract_legacy=False, 
            extract_source=False, remove_views=False, create_views=False, 
            create_connected_has_table = False)


# To extract OpenMRS data:
pipe.run(disciplines = pipe.all_disciplines, spawn_new = False, extract_legacy=False, 
            extract_source=True, remove_views=False, create_views=False, 
            create_connected_has_table = False)

# To drop HAS tables and reload from legacy & source
pipe.run(disciplines = pipe.all_disciplines, spawn_new = True, extract_legacy=True, 
            extract_source=True, remove_views=False, create_views=False, 
            create_connected_has_table = False)

GMSPipeline()

from soload.pipelines import *

pipe = GMSPipeline('Dev_v01xx', msi = True)

#DEFAULT
pipe.run(programs = pipe.all_programs, spawn_new = False, extract_source=False, clear_source=False)

# To extract GMS data:
pipe.run(programs = pipe.all_programs, spawn_new = True, extract_source=True, clear_source=False)

CommonPipeline()

To extract common tables from csv files and load to warehouse:

from soload.pipelines import *

pipe = CommonPipeline('Dev_v01xx', msi = True)

# DEFAULT
pipe.run(spawn_new=False, create_WorldGames = False, create_RegionalGames = False, 
            create_ProgramsCDC = False, create_CountriesCDC = False,
            create_MultipleEventOwners = False, create_CountryISOCodes = False)

# Drop ProgramsCDC and reload
pipe.run(spawn_new=True, create_WorldGames = False, create_RegionalGames = False, 
            create_ProgramsCDC = True, create_CountriesCDC = False,
            create_MultipleEventOwners = False, create_CountryISOCodes = False)

SHEPipeline()

To extract SHE summary tables from csv files and load to warehouse:

from soload.pipelines import *

pipe = SHEPipeline('Dev_v01xx', msi = True)

# DEFAULT
pipe.run(spawn_new = False, create_summary_table=False, create_long_summary_table = False)

SportPartnershipPipeline()

To extract sport partnership tables from csv files and load to warehouse:

from soload.pipelines import *

pipe = SportPartnershipPipeline('Dev_v01xx', msi = True)

# DEFAULT
pipe.run(spawn_new = False, create_sport_partnership_table=False)