
Part 8: Create PyConfig

!carrot run py --help
Usage: carrot run py [OPTIONS] COMMAND [ARGS]...

  Commands for using python configurations to run the ETL transformation.

Options:
  --help  Show this message and exit.

Commands:
  list      List all the python classes there are available to run
  make      Generate a python class from the OMOP mapping json
  map       Perform OMOP Mapping given a python configuration file.
  register  Register a python class with the tool
  remove    remove a registered class

!carrot run py make --name ExampleDataset ../data/rules.json
Recreating file /Users/calummacdonald/Usher/CO-CONNECT/docs/docs/CaRROT-CDM/notebooks/ExampleDataset.py

This automatically creates a file that looks like this:

# %load ExampleDataset.py
from carrot.cdm import define_person, define_condition_occurrence, define_visit_occurrence, define_measurement, define_observation, define_drug_exposure
from carrot.cdm import CommonDataModel
import json

class ExampleDataset(CommonDataModel):

    def __init__(self,**kwargs):
        """ 
        initialise the inputs and setup indexing 
        """
        super().__init__(**kwargs)


    @define_person
    def person_0(self):
        """
        Create CDM object for person
        """
        self.birth_datetime.series = self.inputs["Demographics.csv"]["Age"]
        self.gender_concept_id.series = self.inputs["Demographics.csv"]["Sex"]
        self.gender_source_concept_id.series = self.inputs["Demographics.csv"]["Sex"]
        self.gender_source_value.series = self.inputs["Demographics.csv"]["Sex"]
        self.person_id.series = self.inputs["Demographics.csv"]["ID"]

        # --- insert field operations --- 
        self.birth_datetime.series = self.tools.get_datetime_from_age(self.birth_datetime.series)

        # --- insert term mapping --- 
        self.gender_concept_id.series = self.gender_concept_id.series.map(
            {
                "Male": 8507
            }
        )
        self.gender_source_concept_id.series = self.gender_source_concept_id.series.map(
            {
                "Male": 8507
            }
        )

    @define_person
    def person_1(self):
        """
        Create CDM object for person
        """
        self.birth_datetime.series = self.inputs["Demographics.csv"]["Age"]
        self.gender_concept_id.series = self.inputs["Demographics.csv"]["Sex"]
        self.gender_source_concept_id.series = self.inputs["Demographics.csv"]["Sex"]
        self.gender_source_value.series = self.inputs["Demographics.csv"]["Sex"]
        self.person_id.series = self.inputs["Demographics.csv"]["ID"]

        # --- insert field operations --- 
        self.birth_datetime.series = self.tools.get_datetime_from_age(self.birth_datetime.series)

        # --- insert term mapping --- 
        self.gender_concept_id.series = self.gender_concept_id.series.map(
            {
                "Female": 8532
            }
        )
        self.gender_source_concept_id.series = self.gender_source_concept_id.series.map(
            {
                "Female": 8532
            }
        )

    @define_observation
    def observation_0(self):
        """
        Create CDM object for observation
        """
        self.observation_concept_id.series = self.inputs["Serology.csv"]["IgG"]
        self.observation_datetime.series = self.inputs["Serology.csv"]["Date"]
        self.observation_source_concept_id.series = self.inputs["Serology.csv"]["IgG"]
        self.observation_source_value.series = self.inputs["Serology.csv"]["IgG"]
        self.person_id.series = self.inputs["Serology.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.observation_concept_id.series = self.tools.make_scalar(self.observation_concept_id.series,4288455)
        self.observation_source_concept_id.series = self.tools.make_scalar(self.observation_source_concept_id.series,4288455)

    @define_observation
    def observation_1(self):
        """
        Create CDM object for observation
        """
        self.observation_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.observation_datetime.series = self.inputs["Hospital_Visit.csv"]["admission_date"]
        self.observation_source_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.observation_source_value.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.person_id.series = self.inputs["Hospital_Visit.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.observation_concept_id.series = self.observation_concept_id.series.map(
            {
                "Heart Attack": 4059317
            }
        )
        self.observation_source_concept_id.series = self.observation_source_concept_id.series.map(
            {
                "Heart Attack": 4059317
            }
        )

    @define_observation
    def observation_2(self):
        """
        Create CDM object for observation
        """
        self.observation_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.observation_datetime.series = self.inputs["Hospital_Visit.csv"]["admission_date"]
        self.observation_source_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.observation_source_value.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.person_id.series = self.inputs["Hospital_Visit.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.observation_concept_id.series = self.observation_concept_id.series.map(
            {
                "COVID-19": 37311065
            }
        )
        self.observation_source_concept_id.series = self.observation_source_concept_id.series.map(
            {
                "COVID-19": 37311065
            }
        )

    @define_observation
    def observation_3(self):
        """
        Create CDM object for observation
        """
        self.observation_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.observation_datetime.series = self.inputs["Hospital_Visit.csv"]["admission_date"]
        self.observation_source_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.observation_source_value.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.person_id.series = self.inputs["Hospital_Visit.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.observation_concept_id.series = self.observation_concept_id.series.map(
            {
                "Cancer": 40757663
            }
        )
        self.observation_source_concept_id.series = self.observation_source_concept_id.series.map(
            {
                "Cancer": 40757663
            }
        )

    @define_condition_occurrence
    def condition_occurrence_0(self):
        """
        Create CDM object for condition_occurrence
        """
        self.condition_concept_id.series = self.inputs["Symptoms.csv"]["Headache"]
        self.condition_end_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.condition_source_concept_id.series = self.inputs["Symptoms.csv"]["Headache"]
        self.condition_source_value.series = self.inputs["Symptoms.csv"]["Headache"]
        self.condition_start_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.person_id.series = self.inputs["Symptoms.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Yes": 378253
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Yes": 378253
            }
        )

    @define_condition_occurrence
    def condition_occurrence_1(self):
        """
        Create CDM object for condition_occurrence
        """
        self.condition_concept_id.series = self.inputs["Symptoms.csv"]["Fatigue"]
        self.condition_end_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.condition_source_concept_id.series = self.inputs["Symptoms.csv"]["Fatigue"]
        self.condition_source_value.series = self.inputs["Symptoms.csv"]["Fatigue"]
        self.condition_start_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.person_id.series = self.inputs["Symptoms.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Yes": 4223659
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Yes": 4223659
            }
        )

    @define_condition_occurrence
    def condition_occurrence_2(self):
        """
        Create CDM object for condition_occurrence
        """
        self.condition_concept_id.series = self.inputs["Symptoms.csv"]["Dizzy"]
        self.condition_end_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.condition_source_concept_id.series = self.inputs["Symptoms.csv"]["Dizzy"]
        self.condition_source_value.series = self.inputs["Symptoms.csv"]["Dizzy"]
        self.condition_start_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.person_id.series = self.inputs["Symptoms.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Yes": 4223938
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Yes": 4223938
            }
        )

    @define_condition_occurrence
    def condition_occurrence_3(self):
        """
        Create CDM object for condition_occurrence
        """
        self.condition_concept_id.series = self.inputs["Symptoms.csv"]["Cough"]
        self.condition_end_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.condition_source_concept_id.series = self.inputs["Symptoms.csv"]["Cough"]
        self.condition_source_value.series = self.inputs["Symptoms.csv"]["Cough"]
        self.condition_start_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.person_id.series = self.inputs["Symptoms.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Yes": 254761
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Yes": 254761
            }
        )

    @define_condition_occurrence
    def condition_occurrence_4(self):
        """
        Create CDM object for condition_occurrence
        """
        self.condition_concept_id.series = self.inputs["Symptoms.csv"]["Fever"]
        self.condition_end_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.condition_source_concept_id.series = self.inputs["Symptoms.csv"]["Fever"]
        self.condition_source_value.series = self.inputs["Symptoms.csv"]["Fever"]
        self.condition_start_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.person_id.series = self.inputs["Symptoms.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Yes": 437663
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Yes": 437663
            }
        )

    @define_condition_occurrence
    def condition_occurrence_5(self):
        """
        Create CDM object for condition_occurrence
        """
        self.condition_concept_id.series = self.inputs["Symptoms.csv"]["Muscle_Pain"]
        self.condition_end_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.condition_source_concept_id.series = self.inputs["Symptoms.csv"]["Muscle_Pain"]
        self.condition_source_value.series = self.inputs["Symptoms.csv"]["Muscle_Pain"]
        self.condition_start_datetime.series = self.inputs["Symptoms.csv"]["date_occurrence"]
        self.person_id.series = self.inputs["Symptoms.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Yes": 442752
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Yes": 442752
            }
        )

    @define_condition_occurrence
    def condition_occurrence_6(self):
        """
        Create CDM object for condition_occurrence
        """
        self.condition_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.condition_end_datetime.series = self.inputs["Hospital_Visit.csv"]["admission_date"]
        self.condition_source_concept_id.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.condition_source_value.series = self.inputs["Hospital_Visit.csv"]["reason"]
        self.condition_start_datetime.series = self.inputs["Hospital_Visit.csv"]["admission_date"]
        self.person_id.series = self.inputs["Hospital_Visit.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Pneumonia": 255848
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Pneumonia": 255848
            }
        )

    @define_condition_occurrence
    def condition_occurrence_7(self):
        """
        Create CDM object for condition_occurrence
        """
        self.condition_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_end_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.condition_source_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_source_value.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_start_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.person_id.series = self.inputs["GP_Records.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Mental Health": 4131548
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Mental Health": 4131548
            }
        )

    @define_condition_occurrence
    def condition_occurrence_8(self):
        """
        Create CDM object for condition_occurrence
        """
        self.condition_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_end_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.condition_source_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_source_value.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_start_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.person_id.series = self.inputs["GP_Records.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Mental Health": 432586
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Mental Health": 432586
            }
        )

    @define_condition_occurrence
    def condition_occurrence_9(self):
        """
        Create CDM object for condition_occurrence
        """
        self.condition_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_end_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.condition_source_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_source_value.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_start_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.person_id.series = self.inputs["GP_Records.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Diabetes Type-II": 201826
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Diabetes Type-II": 201826
            }
        )

    @define_condition_occurrence
    def condition_occurrence_10(self):
        """
        Create CDM object for condition_occurrence
        """
        self.condition_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_end_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.condition_source_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_source_value.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_start_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.person_id.series = self.inputs["GP_Records.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "Heart Condition": 4185932
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "Heart Condition": 4185932
            }
        )

    @define_condition_occurrence
    def condition_occurrence_11(self):
        """
        Create CDM object for condition_occurrence
        """
        self.condition_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_end_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.condition_source_concept_id.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_source_value.series = self.inputs["GP_Records.csv"]["comorbidity"]
        self.condition_start_datetime.series = self.inputs["GP_Records.csv"]["date_of_visit"]
        self.person_id.series = self.inputs["GP_Records.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.condition_concept_id.series = self.condition_concept_id.series.map(
            {
                "High Blood Pressure": 316866
            }
        )
        self.condition_source_concept_id.series = self.condition_source_concept_id.series.map(
            {
                "High Blood Pressure": 316866
            }
        )

    @define_drug_exposure
    def drug_exposure_0(self):
        """
        Create CDM object for drug_exposure
        """
        self.drug_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_exposure_end_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_source_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_source_value.series = self.inputs["Vaccinations.csv"]["type"]
        self.person_id.series = self.inputs["Vaccinations.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.drug_concept_id.series = self.drug_concept_id.series.map(
            {
                "Moderna": 35894915
            }
        )
        self.drug_source_concept_id.series = self.drug_source_concept_id.series.map(
            {
                "Moderna": 35894915
            }
        )

    @define_drug_exposure
    def drug_exposure_1(self):
        """
        Create CDM object for drug_exposure
        """
        self.drug_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_exposure_end_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_source_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_source_value.series = self.inputs["Vaccinations.csv"]["type"]
        self.person_id.series = self.inputs["Vaccinations.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.drug_concept_id.series = self.drug_concept_id.series.map(
            {
                "AstraZenica": 35894915
            }
        )
        self.drug_source_concept_id.series = self.drug_source_concept_id.series.map(
            {
                "AstraZenica": 35894915
            }
        )

    @define_drug_exposure
    def drug_exposure_2(self):
        """
        Create CDM object for drug_exposure
        """
        self.drug_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_exposure_end_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_source_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_source_value.series = self.inputs["Vaccinations.csv"]["type"]
        self.person_id.series = self.inputs["Vaccinations.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.drug_concept_id.series = self.drug_concept_id.series.map(
            {
                "Pfizer": 35894915
            }
        )
        self.drug_source_concept_id.series = self.drug_source_concept_id.series.map(
            {
                "Pfizer": 35894915
            }
        )

    @define_drug_exposure
    def drug_exposure_3(self):
        """
        Create CDM object for drug_exposure
        """
        self.drug_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_exposure_end_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_source_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_source_value.series = self.inputs["Vaccinations.csv"]["type"]
        self.person_id.series = self.inputs["Vaccinations.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.drug_concept_id.series = self.drug_concept_id.series.map(
            {
                "Moderna": 37003518
            }
        )
        self.drug_source_concept_id.series = self.drug_source_concept_id.series.map(
            {
                "Moderna": 37003518
            }
        )

    @define_drug_exposure
    def drug_exposure_4(self):
        """
        Create CDM object for drug_exposure
        """
        self.drug_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_exposure_end_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = self.inputs["Vaccinations.csv"]["date_of_vaccination"]
        self.drug_source_concept_id.series = self.inputs["Vaccinations.csv"]["type"]
        self.drug_source_value.series = self.inputs["Vaccinations.csv"]["type"]
        self.person_id.series = self.inputs["Vaccinations.csv"]["ID"]

        # --- insert field operations --- 

        # --- insert term mapping --- 
        self.drug_concept_id.series = self.drug_concept_id.series.map(
            {
                "Pfizer": 37003436
            }
        )
        self.drug_source_concept_id.series = self.drug_source_concept_id.series.map(
            {
                "Pfizer": 37003436
            }
        )
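
A note on the term mappings in the generated class: they are plain pandas Series.map calls with a dictionary, so any source value not listed in the dictionary becomes NaN, and those rows are later dropped because the *_concept_id fields must be non-null (visible as the "Requiring non-null values ..." warnings in the processing log further down). This is also why the generator emits one object per mapped value, e.g. person_0 for "Male" and person_1 for "Female". A minimal standalone pandas sketch of this behaviour (not using carrot):

import pandas as pd

# stand-in for self.inputs["Demographics.csv"]["Sex"]
sex = pd.Series(["Male", "Female", "Male", None])

# the same style of mapping used in person_0: anything not in the dict becomes NaN
gender_concept_id = sex.map({"Male": 8507})
print(gender_concept_id.tolist())  # [8507.0, nan, 8507.0, nan]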

Loading some inputs:

import carrot
import glob
inputs = carrot.tools.load_csv(glob.glob('../data/part1/*'))
inputs
2022-06-17 15:17:54 - LocalDataCollection - INFO - DataCollection Object Created
2022-06-17 15:17:54 - LocalDataCollection - INFO - Registering  Blood_Test.csv [<carrot.io.common.DataBrick object at 0x111f1be50>]
2022-06-17 15:17:54 - LocalDataCollection - INFO - Registering  Demographics.csv [<carrot.io.common.DataBrick object at 0x111fd15b0>]
2022-06-17 15:17:54 - LocalDataCollection - INFO - Registering  GP_Records.csv [<carrot.io.common.DataBrick object at 0x111fd12e0>]
2022-06-17 15:17:54 - LocalDataCollection - INFO - Registering  Hospital_Visit.csv [<carrot.io.common.DataBrick object at 0x111f56b80>]
2022-06-17 15:17:54 - LocalDataCollection - INFO - Registering  Serology.csv [<carrot.io.common.DataBrick object at 0x111f56730>]
2022-06-17 15:17:54 - LocalDataCollection - INFO - Registering  Symptoms.csv [<carrot.io.common.DataBrick object at 0x11207f070>]
2022-06-17 15:17:54 - LocalDataCollection - INFO - Registering  Vaccinations.csv [<carrot.io.common.DataBrick object at 0x11207f340>]
2022-06-17 15:17:54 - LocalDataCollection - INFO - Registering  pks.csv [<carrot.io.common.DataBrick object at 0x115f20340>]

<carrot.io.plugins.local.LocalDataCollection at 0x111fd1f40>

A new instance can be created from the generated python class:

instance = ExampleDataset(inputs=inputs)
instance
2022-06-17 15:17:54 - ExampleDataset - INFO - CommonDataModel (5.3.1) created with co-connect-tools version 0.0.0
2022-06-17 15:17:54 - ExampleDataset - INFO - Running with an DataCollection object
2022-06-17 15:17:54 - ExampleDataset - INFO - Turning on automatic cdm column filling
2022-06-17 15:17:54 - ExampleDataset - INFO - Added condition_occurrence_0 of type condition_occurrence
2022-06-17 15:17:54 - ExampleDataset - INFO - Added condition_occurrence_1 of type condition_occurrence
2022-06-17 15:17:54 - ExampleDataset - INFO - Added condition_occurrence_10 of type condition_occurrence
2022-06-17 15:17:54 - ExampleDataset - INFO - Added condition_occurrence_11 of type condition_occurrence
2022-06-17 15:17:54 - ExampleDataset - INFO - Added condition_occurrence_2 of type condition_occurrence
2022-06-17 15:17:54 - ExampleDataset - INFO - Added condition_occurrence_3 of type condition_occurrence
2022-06-17 15:17:54 - ExampleDataset - INFO - Added condition_occurrence_4 of type condition_occurrence
2022-06-17 15:17:54 - ExampleDataset - INFO - Added condition_occurrence_5 of type condition_occurrence
2022-06-17 15:17:54 - ExampleDataset - INFO - Added condition_occurrence_6 of type condition_occurrence
2022-06-17 15:17:54 - ExampleDataset - INFO - Added condition_occurrence_7 of type condition_occurrence
2022-06-17 15:17:54 - ExampleDataset - INFO - Added condition_occurrence_8 of type condition_occurrence
2022-06-17 15:17:54 - ExampleDataset - INFO - Added condition_occurrence_9 of type condition_occurrence
2022-06-17 15:17:54 - ExampleDataset - INFO - Added drug_exposure_0 of type drug_exposure
2022-06-17 15:17:54 - ExampleDataset - INFO - Added drug_exposure_1 of type drug_exposure
2022-06-17 15:17:54 - ExampleDataset - INFO - Added drug_exposure_2 of type drug_exposure
2022-06-17 15:17:54 - ExampleDataset - INFO - Added drug_exposure_3 of type drug_exposure
2022-06-17 15:17:54 - ExampleDataset - INFO - Added drug_exposure_4 of type drug_exposure
2022-06-17 15:17:54 - ExampleDataset - INFO - Added observation_0 of type observation
2022-06-17 15:17:54 - ExampleDataset - INFO - Added observation_1 of type observation
2022-06-17 15:17:54 - ExampleDataset - INFO - Added observation_2 of type observation
2022-06-17 15:17:54 - ExampleDataset - INFO - Added observation_3 of type observation
2022-06-17 15:17:54 - ExampleDataset - INFO - Added person_0 of type person
2022-06-17 15:17:54 - ExampleDataset - INFO - Added person_1 of type person

<__main__.ExampleDataset at 0x115f20f40>
instance.process()
2022-06-17 15:17:54 - ExampleDataset - INFO - Starting processing in order: ['person', 'condition_occurrence', 'drug_exposure', 'observation']
2022-06-17 15:17:54 - ExampleDataset - INFO - Number of objects to process for each table...
{
      "condition_occurrence": 12,
      "drug_exposure": 5,
      "observation": 4,
      "person": 2
}
2022-06-17 15:17:54 - ExampleDataset - INFO - for person: found 2 objects
2022-06-17 15:17:54 - ExampleDataset - INFO - working on person
2022-06-17 15:17:54 - ExampleDataset - INFO - starting on person_0
2022-06-17 15:17:55 - LocalDataCollection - INFO - Retrieving initial dataframe for 'Demographics.csv' for the first time
2022-06-17 15:17:55 - Person - WARNING - Requiring non-null values in gender_concept_id removed 438 rows, leaving 562 rows.
2022-06-17 15:17:55 - Person - WARNING - Requiring non-null values in birth_datetime removed 1 rows, leaving 561 rows.
2022-06-17 15:17:55 - Person - INFO - Automatically formatting data columns.
2022-06-17 15:17:55 - Person - INFO - created df (0x115fa5310)[person_0]
2022-06-17 15:17:55 - ExampleDataset - INFO - finished person_0 (0x115fa5310) ... 1/2 completed, 561 rows
2022-06-17 15:17:55 - ExampleDataset - INFO - starting on person_1
2022-06-17 15:17:55 - Person - WARNING - Requiring non-null values in gender_concept_id removed 565 rows, leaving 435 rows.
2022-06-17 15:17:55 - Person - INFO - Automatically formatting data columns.
2022-06-17 15:17:55 - Person - INFO - created df (0x115fec4f0)[person_1]
2022-06-17 15:17:55 - ExampleDataset - INFO - finished person_1 (0x115fec4f0) ... 2/2 completed, 435 rows
2022-06-17 15:17:55 - ExampleDataset - INFO - called save_dateframe but outputs are not defined. save_files: True

could not convert string to float: 'na'
could not convert string to float: 'na'

2022-06-17 15:17:55 - ExampleDataset - INFO - finalised person on iteration 0 producing 996 rows from 2 tables
2022-06-17 15:17:55 - LocalDataCollection - INFO - Getting next chunk of data
2022-06-17 15:17:55 - LocalDataCollection - INFO - All input files for this object have now been used.
2022-06-17 15:17:55 - LocalDataCollection - INFO - resetting used bricks
2022-06-17 15:17:55 - ExampleDataset - INFO - for condition_occurrence: found 12 objects
2022-06-17 15:17:55 - ExampleDataset - INFO - working on condition_occurrence
2022-06-17 15:17:55 - ExampleDataset - INFO - starting on condition_occurrence_0
2022-06-17 15:17:55 - LocalDataCollection - INFO - Retrieving initial dataframe for 'Symptoms.csv' for the first time
2022-06-17 15:17:55 - ConditionOccurrence - WARNING - Requiring non-null values in condition_concept_id removed 55 rows, leaving 275 rows.
2022-06-17 15:17:55 - ConditionOccurrence - WARNING - Requiring non-null values in condition_start_datetime removed 1 rows, leaving 274 rows.
2022-06-17 15:17:55 - ConditionOccurrence - INFO - Automatically formatting data columns.
2022-06-17 15:17:55 - ConditionOccurrence - INFO - created df (0x115fec340)[condition_occurrence_0]
2022-06-17 15:17:55 - ExampleDataset - INFO - finished condition_occurrence_0 (0x115fec340) ... 1/12 completed, 274 rows
2022-06-17 15:17:55 - ExampleDataset - INFO - starting on condition_occurrence_1
2022-06-17 15:17:55 - ConditionOccurrence - WARNING - Requiring non-null values in condition_concept_id removed 95 rows, leaving 235 rows.
2022-06-17 15:17:55 - ConditionOccurrence - WARNING - Requiring non-null values in condition_start_datetime removed 1 rows, leaving 234 rows.
2022-06-17 15:17:55 - ConditionOccurrence - INFO - Automatically formatting data columns.
2022-06-17 15:17:55 - ConditionOccurrence - INFO - created df (0x11603e5b0)[condition_occurrence_1]
2022-06-17 15:17:55 - ExampleDataset - INFO - finished condition_occurrence_1 (0x11603e5b0) ... 2/12 completed, 234 rows
2022-06-17 15:17:55 - ExampleDataset - INFO - starting on condition_occurrence_10
2022-06-17 15:17:55 - LocalDataCollection - INFO - Retrieving initial dataframe for 'GP_Records.csv' for the first time
2022-06-17 15:17:55 - ConditionOccurrence - WARNING - Requiring non-null values in condition_concept_id removed 1738 rows, leaving 214 rows.
2022-06-17 15:17:55 - ConditionOccurrence - INFO - Automatically formatting data columns.
2022-06-17 15:17:55 - ConditionOccurrence - INFO - created df (0x11628fd30)[condition_occurrence_10]
2022-06-17 15:17:55 - ExampleDataset - INFO - finished condition_occurrence_10 (0x11628fd30) ... 3/12 completed, 214 rows
2022-06-17 15:17:55 - ExampleDataset - ERROR - There are person_ids in this table that are not in the output person table!
2022-06-17 15:17:55 - ExampleDataset - ERROR - Either they are not in the original data, or while creating the person table, 
2022-06-17 15:17:55 - ExampleDataset - ERROR - studies have been removed due to lack of required fields, such as birthdate.
2022-06-17 15:17:55 - ExampleDataset - ERROR - 213/214 were good, 1 studies are removed.
2022-06-17 15:17:55 - ExampleDataset - INFO - starting on condition_occurrence_11
2022-06-17 15:17:55 - ConditionOccurrence - WARNING - Requiring non-null values in condition_concept_id removed 1822 rows, leaving 130 rows.
2022-06-17 15:17:55 - ConditionOccurrence - INFO - Automatically formatting data columns.
2022-06-17 15:17:55 - ConditionOccurrence - INFO - created df (0x1162a1c70)[condition_occurrence_11]
2022-06-17 15:17:55 - ExampleDataset - INFO - finished condition_occurrence_11 (0x1162a1c70) ... 4/12 completed, 130 rows
2022-06-17 15:17:55 - ExampleDataset - INFO - starting on condition_occurrence_2
2022-06-17 15:17:55 - ConditionOccurrence - WARNING - Requiring non-null values in condition_concept_id removed 195 rows, leaving 135 rows.
2022-06-17 15:17:55 - ConditionOccurrence - WARNING - Requiring non-null values in condition_start_datetime removed 1 rows, leaving 134 rows.
2022-06-17 15:17:55 - ConditionOccurrence - INFO - Automatically formatting data columns.
2022-06-17 15:17:55 - ConditionOccurrence - INFO - created df (0x11601b5b0)[condition_occurrence_2]
2022-06-17 15:17:55 - ExampleDataset - INFO - finished condition_occurrence_2 (0x11601b5b0) ... 5/12 completed, 134 rows
2022-06-17 15:17:55 - ExampleDataset - INFO - starting on condition_occurrence_3
2022-06-17 15:17:55 - ConditionOccurrence - WARNING - Requiring non-null values in condition_concept_id removed 100 rows, leaving 230 rows.
2022-06-17 15:17:55 - ConditionOccurrence - WARNING - Requiring non-null values in condition_start_datetime removed 1 rows, leaving 229 rows.
2022-06-17 15:17:55 - ConditionOccurrence - INFO - Automatically formatting data columns.
2022-06-17 15:17:55 - ConditionOccurrence - INFO - created df (0x1162b97f0)[condition_occurrence_3]
2022-06-17 15:17:55 - ExampleDataset - INFO - finished condition_occurrence_3 (0x1162b97f0) ... 6/12 completed, 229 rows
2022-06-17 15:17:56 - ExampleDataset - INFO - starting on condition_occurrence_4
2022-06-17 15:17:56 - ConditionOccurrence - WARNING - Requiring non-null values in condition_concept_id removed 265 rows, leaving 65 rows.
2022-06-17 15:17:56 - ConditionOccurrence - INFO - Automatically formatting data columns.
2022-06-17 15:17:56 - ConditionOccurrence - INFO - created df (0x1162a11f0)[condition_occurrence_4]
2022-06-17 15:17:56 - ExampleDataset - INFO - finished condition_occurrence_4 (0x1162a11f0) ... 7/12 completed, 65 rows
2022-06-17 15:17:56 - ExampleDataset - INFO - starting on condition_occurrence_5
2022-06-17 15:17:56 - ConditionOccurrence - WARNING - Requiring non-null values in condition_concept_id removed 295 rows, leaving 35 rows.
2022-06-17 15:17:56 - ConditionOccurrence - WARNING - Requiring non-null values in condition_start_datetime removed 1 rows, leaving 34 rows.
2022-06-17 15:17:56 - ConditionOccurrence - INFO - Automatically formatting data columns.
2022-06-17 15:17:56 - ConditionOccurrence - INFO - created df (0x1162f20a0)[condition_occurrence_5]
2022-06-17 15:17:56 - ExampleDataset - INFO - finished condition_occurrence_5 (0x1162f20a0) ... 8/12 completed, 34 rows
2022-06-17 15:17:56 - ExampleDataset - INFO - starting on condition_occurrence_6
2022-06-17 15:17:56 - LocalDataCollection - INFO - Retrieving initial dataframe for 'Hospital_Visit.csv' for the first time

2022-06-17 15:17:56 - ConditionOccurrence - WARNING - Requiring non-null values in condition_concept_id removed 1029 rows, leaving 171 rows.
2022-06-17 15:17:56 - ConditionOccurrence - INFO - Automatically formatting data columns.
2022-06-17 15:17:56 - ConditionOccurrence - INFO - created df (0x1162f2850)[condition_occurrence_6]
2022-06-17 15:17:56 - ExampleDataset - INFO - finished condition_occurrence_6 (0x1162f2850) ... 9/12 completed, 171 rows
2022-06-17 15:17:56 - ExampleDataset - INFO - starting on condition_occurrence_7
2022-06-17 15:17:56 - ConditionOccurrence - WARNING - Requiring non-null values in condition_concept_id removed 1508 rows, leaving 444 rows.
2022-06-17 15:17:56 - ConditionOccurrence - INFO - Automatically formatting data columns.
2022-06-17 15:17:56 - ConditionOccurrence - INFO - created df (0x1162f2c70)[condition_occurrence_7]
2022-06-17 15:17:56 - ExampleDataset - INFO - finished condition_occurrence_7 (0x1162f2c70) ... 10/12 completed, 444 rows
2022-06-17 15:17:56 - ExampleDataset - ERROR - There are person_ids in this table that are not in the output person table!
2022-06-17 15:17:56 - ExampleDataset - ERROR - Either they are not in the original data, or while creating the person table, 
2022-06-17 15:17:56 - ExampleDataset - ERROR - studies have been removed due to lack of required fields, such as birthdate.
2022-06-17 15:17:56 - ExampleDataset - ERROR - 441/444 were good, 3 studies are removed.
2022-06-17 15:17:56 - ExampleDataset - INFO - starting on condition_occurrence_8
2022-06-17 15:17:56 - ConditionOccurrence - WARNING - Requiring non-null values in condition_concept_id removed 1508 rows, leaving 444 rows.
2022-06-17 15:17:56 - ConditionOccurrence - INFO - Automatically formatting data columns.
2022-06-17 15:17:56 - ConditionOccurrence - INFO - created df (0x1162d3400)[condition_occurrence_8]
2022-06-17 15:17:56 - ExampleDataset - INFO - finished condition_occurrence_8 (0x1162d3400) ... 11/12 completed, 444 rows
2022-06-17 15:17:56 - ExampleDataset - ERROR - There are person_ids in this table that are not in the output person table!
2022-06-17 15:17:56 - ExampleDataset - ERROR - Either they are not in the original data, or while creating the person table, 
2022-06-17 15:17:56 - ExampleDataset - ERROR - studies have been removed due to lack of required fields, such as birthdate.
2022-06-17 15:17:56 - ExampleDataset - ERROR - 441/444 were good, 3 studies are removed.
2022-06-17 15:17:56 - ExampleDataset - INFO - starting on condition_occurrence_9
2022-06-17 15:17:56 - ConditionOccurrence - WARNING - Requiring non-null values in condition_concept_id removed 1688 rows, leaving 264 rows.
2022-06-17 15:17:56 - ConditionOccurrence - INFO - Automatically formatting data columns.
2022-06-17 15:17:56 - ConditionOccurrence - INFO - created df (0x116322ca0)[condition_occurrence_9]
2022-06-17 15:17:56 - ExampleDataset - INFO - finished condition_occurrence_9 (0x116322ca0) ... 12/12 completed, 264 rows
2022-06-17 15:17:56 - ExampleDataset - ERROR - Removed 2 row(s) due to duplicates found when merging condition_occurrence
2022-06-17 15:17:56 - ExampleDataset - WARNING - Example duplicates...
2022-06-17 15:17:56 - ExampleDataset - WARNING -                          person_id  condition_concept_id condition_start_date  \
condition_occurrence_id                                                         
38                           125.0                378253           2020-04-11   
40                           125.0                378253           2020-04-11   
308                          125.0               4223659           2020-04-11   
310                          125.0               4223659           2020-04-11   

                           condition_start_datetime condition_end_date  \
condition_occurrence_id                                                  
38                       2020-04-11 00:00:00.000000         2020-04-11   
40                       2020-04-11 00:00:00.000000         2020-04-11   
308                      2020-04-11 00:00:00.000000         2020-04-11   
310                      2020-04-11 00:00:00.000000         2020-04-11   

                             condition_end_datetime condition_source_value  \
condition_occurrence_id                                                      
38                       2020-04-11 00:00:00.000000                    Yes   
40                       2020-04-11 00:00:00.000000                    Yes   
308                      2020-04-11 00:00:00.000000                    Yes   
310                      2020-04-11 00:00:00.000000                    Yes   

                         condition_source_concept_id  
condition_occurrence_id                               
38                                            378253  
40                                            378253  
308                                          4223659  
310                                          4223659  
2022-06-17 15:17:56 - ExampleDataset - INFO - called save_dateframe but outputs are not defined. save_files: True
2022-06-17 15:17:56 - ExampleDataset - INFO - finalised condition_occurrence on iteration 0 producing 2630 rows from 12 tables
2022-06-17 15:17:56 - LocalDataCollection - INFO - Getting next chunk of data
2022-06-17 15:17:56 - LocalDataCollection - INFO - All input files for this object have now been used.
2022-06-17 15:17:56 - LocalDataCollection - INFO - resetting used bricks
2022-06-17 15:17:56 - ExampleDataset - INFO - for drug_exposure: found 5 objects
2022-06-17 15:17:56 - ExampleDataset - INFO - working on drug_exposure
2022-06-17 15:17:56 - ExampleDataset - INFO - starting on drug_exposure_0
2022-06-17 15:17:56 - LocalDataCollection - INFO - Retrieving initial dataframe for 'Vaccinations.csv' for the first time
2022-06-17 15:17:56 - DrugExposure - WARNING - Requiring non-null values in drug_concept_id removed 475 rows, leaving 245 rows.
2022-06-17 15:17:56 - DrugExposure - INFO - Automatically formatting data columns.
2022-06-17 15:17:56 - DrugExposure - INFO - created df (0x1163222b0)[drug_exposure_0]
2022-06-17 15:17:56 - ExampleDataset - INFO - finished drug_exposure_0 (0x1163222b0) ... 1/5 completed, 245 rows
2022-06-17 15:17:56 - ExampleDataset - INFO - starting on drug_exposure_1
2022-06-17 15:17:56 - DrugExposure - WARNING - Requiring non-null values in drug_concept_id removed 494 rows, leaving 226 rows.
2022-06-17 15:17:56 - DrugExposure - WARNING - Requiring non-null values in drug_exposure_start_datetime removed 1 rows, leaving 225 rows.
2022-06-17 15:17:56 - DrugExposure - INFO - Automatically formatting data columns.
2022-06-17 15:17:56 - DrugExposure - INFO - created df (0x11634ab20)[drug_exposure_1]

2022-06-17 15:17:56 - ExampleDataset - INFO - finished drug_exposure_1 (0x11634ab20) ... 2/5 completed, 225 rows
2022-06-17 15:17:56 - ExampleDataset - ERROR - There are person_ids in this table that are not in the output person table!
2022-06-17 15:17:56 - ExampleDataset - ERROR - Either they are not in the original data, or while creating the person table, 
2022-06-17 15:17:56 - ExampleDataset - ERROR - studies have been removed due to lack of required fields, such as birthdate.
2022-06-17 15:17:56 - ExampleDataset - ERROR - 224/225 were good, 1 studies are removed.
2022-06-17 15:17:57 - ExampleDataset - INFO - starting on drug_exposure_2
2022-06-17 15:17:57 - DrugExposure - WARNING - Requiring non-null values in drug_concept_id removed 471 rows, leaving 249 rows.
2022-06-17 15:17:57 - DrugExposure - INFO - Automatically formatting data columns.
2022-06-17 15:17:57 - DrugExposure - INFO - created df (0x1163752e0)[drug_exposure_2]
2022-06-17 15:17:57 - ExampleDataset - INFO - finished drug_exposure_2 (0x1163752e0) ... 3/5 completed, 249 rows
2022-06-17 15:17:57 - ExampleDataset - ERROR - There are person_ids in this table that are not in the output person table!
2022-06-17 15:17:57 - ExampleDataset - ERROR - Either they are not in the original data, or while creating the person table, 
2022-06-17 15:17:57 - ExampleDataset - ERROR - studies have been removed due to lack of required fields, such as birthdate.
2022-06-17 15:17:57 - ExampleDataset - ERROR - 248/249 were good, 1 studies are removed.
2022-06-17 15:17:57 - ExampleDataset - INFO - starting on drug_exposure_3
2022-06-17 15:17:57 - DrugExposure - WARNING - Requiring non-null values in drug_concept_id removed 475 rows, leaving 245 rows.
2022-06-17 15:17:57 - DrugExposure - INFO - Automatically formatting data columns.
2022-06-17 15:17:57 - DrugExposure - INFO - created df (0x116375700)[drug_exposure_3]
2022-06-17 15:17:57 - ExampleDataset - INFO - finished drug_exposure_3 (0x116375700) ... 4/5 completed, 245 rows
2022-06-17 15:17:57 - ExampleDataset - INFO - starting on drug_exposure_4
2022-06-17 15:17:57 - DrugExposure - WARNING - Requiring non-null values in drug_concept_id removed 471 rows, leaving 249 rows.
2022-06-17 15:17:57 - DrugExposure - INFO - Automatically formatting data columns.
2022-06-17 15:17:57 - DrugExposure - INFO - created df (0x1163b6610)[drug_exposure_4]
2022-06-17 15:17:57 - ExampleDataset - INFO - finished drug_exposure_4 (0x1163b6610) ... 5/5 completed, 249 rows
2022-06-17 15:17:57 - ExampleDataset - ERROR - There are person_ids in this table that are not in the output person table!
2022-06-17 15:17:57 - ExampleDataset - ERROR - Either they are not in the original data, or while creating the person table, 
2022-06-17 15:17:57 - ExampleDataset - ERROR - studies have been removed due to lack of required fields, such as birthdate.
2022-06-17 15:17:57 - ExampleDataset - ERROR - 248/249 were good, 1 studies are removed.
2022-06-17 15:17:57 - ExampleDataset - INFO - called save_dateframe but outputs are not defined. save_files: True
2022-06-17 15:17:57 - ExampleDataset - INFO - finalised drug_exposure on iteration 0 producing 1210 rows from 5 tables
2022-06-17 15:17:57 - LocalDataCollection - INFO - Getting next chunk of data
2022-06-17 15:17:57 - LocalDataCollection - INFO - All input files for this object have now been used.
2022-06-17 15:17:57 - LocalDataCollection - INFO - resetting used bricks
2022-06-17 15:17:57 - ExampleDataset - INFO - for observation: found 4 objects
2022-06-17 15:17:57 - ExampleDataset - INFO - working on observation
2022-06-17 15:17:57 - ExampleDataset - INFO - starting on observation_0
2022-06-17 15:17:57 - LocalDataCollection - INFO - Retrieving initial dataframe for 'Serology.csv' for the first time
2022-06-17 15:17:57 - Observation - INFO - Automatically formatting data columns.
2022-06-17 15:17:57 - Observation - INFO - created df (0x1163e8ee0)[observation_0]
2022-06-17 15:17:57 - ExampleDataset - INFO - finished observation_0 (0x1163e8ee0) ... 1/4 completed, 413 rows
2022-06-17 15:17:57 - ExampleDataset - ERROR - There are person_ids in this table that are not in the output person table!
2022-06-17 15:17:57 - ExampleDataset - ERROR - Either they are not in the original data, or while creating the person table, 
2022-06-17 15:17:57 - ExampleDataset - ERROR - studies have been removed due to lack of required fields, such as birthdate.
2022-06-17 15:17:57 - ExampleDataset - ERROR - 410/413 were good, 3 studies are removed.
2022-06-17 15:17:57 - ExampleDataset - INFO - starting on observation_1
2022-06-17 15:17:57 - LocalDataCollection - INFO - Retrieving initial dataframe for 'Hospital_Visit.csv' for the first time
2022-06-17 15:17:57 - Observation - WARNING - Requiring non-null values in observation_concept_id removed 937 rows, leaving 263 rows.
2022-06-17 15:17:57 - Observation - INFO - Automatically formatting data columns.
2022-06-17 15:17:57 - Observation - INFO - created df (0x1163e8e20)[observation_1]
2022-06-17 15:17:57 - ExampleDataset - INFO - finished observation_1 (0x1163e8e20) ... 2/4 completed, 263 rows
2022-06-17 15:17:57 - ExampleDataset - ERROR - There are person_ids in this table that are not in the output person table!
2022-06-17 15:17:57 - ExampleDataset - ERROR - Either they are not in the original data, or while creating the person table, 
2022-06-17 15:17:57 - ExampleDataset - ERROR - studies have been removed due to lack of required fields, such as birthdate.
2022-06-17 15:17:57 - ExampleDataset - ERROR - 262/263 were good, 1 studies are removed.
2022-06-17 15:17:57 - ExampleDataset - INFO - starting on observation_2
2022-06-17 15:17:57 - Observation - WARNING - Requiring non-null values in observation_concept_id removed 1023 rows, leaving 177 rows.
2022-06-17 15:17:57 - Observation - INFO - Automatically formatting data columns.
2022-06-17 15:17:57 - Observation - INFO - created df (0x116429dc0)[observation_2]
2022-06-17 15:17:57 - ExampleDataset - INFO - finished observation_2 (0x116429dc0) ... 3/4 completed, 177 rows
2022-06-17 15:17:57 - ExampleDataset - ERROR - There are person_ids in this table that are not in the output person table!
2022-06-17 15:17:57 - ExampleDataset - ERROR - Either they are not in the original data, or while creating the person table, 

2022-06-17 15:17:57 - ExampleDataset - ERROR - studies have been removed due to lack of required fields, such as birthdate.
2022-06-17 15:17:57 - ExampleDataset - ERROR - 176/177 were good, 1 studies are removed.
2022-06-17 15:17:57 - ExampleDataset - INFO - starting on observation_3
2022-06-17 15:17:57 - Observation - WARNING - Requiring non-null values in observation_concept_id removed 851 rows, leaving 349 rows.
2022-06-17 15:17:57 - Observation - INFO - Automatically formatting data columns.
2022-06-17 15:17:57 - Observation - INFO - created df (0x111fd1d00)[observation_3]
2022-06-17 15:17:57 - ExampleDataset - INFO - finished observation_3 (0x111fd1d00) ... 4/4 completed, 349 rows
2022-06-17 15:17:57 - ExampleDataset - ERROR - Removed 1 row(s) due to duplicates found when merging observation
2022-06-17 15:17:57 - ExampleDataset - WARNING - Example duplicates...
2022-06-17 15:17:57 - ExampleDataset - WARNING -                 person_id  observation_concept_id observation_date  \
observation_id                                                       
440                 110.0                 4059317       2019-07-07   
441                 110.0                 4059317       2019-07-07   

                      observation_datetime observation_source_value  \
observation_id                                                        
440             2019-07-07 00:00:00.000000             Heart Attack   
441             2019-07-07 00:00:00.000000             Heart Attack   

                observation_source_concept_id  
observation_id                                 
440                                   4059317  
441                                   4059317  
2022-06-17 15:17:57 - ExampleDataset - INFO - called save_dateframe but outputs are not defined. save_files: True
2022-06-17 15:17:57 - ExampleDataset - INFO - finalised observation on iteration 0 producing 1197 rows from 4 tables
2022-06-17 15:17:57 - LocalDataCollection - INFO - Getting next chunk of data
2022-06-17 15:17:57 - LocalDataCollection - INFO - All input files for this object have now been used.

instance.keys()
dict_keys(['person', 'condition_occurrence', 'drug_exposure', 'observation'])
instance['observation'].dropna(axis=1)
observation_id  person_id  observation_concept_id  observation_date  observation_datetime        observation_source_value  observation_source_concept_id
1               357        4288455                 2020-10-03        2020-10-03 00:00:00.000000  17.172114692899758        4288455
2               258        4288455                 2020-11-02        2020-11-02 00:00:00.000000  201.93861878809216        4288455
4               556        4288455                 2021-07-26        2021-07-26 00:00:00.000000  11.506250956970998        4288455
5               380        4288455                 2021-10-29        2021-10-29 00:00:00.000000  2.6594057121417487        4288455
6               415        4288455                 2021-09-07        2021-09-07 00:00:00.000000  40.844873593089126        4288455
...             ...        ...                     ...               ...                         ...                       ...
1193            988        40757663                2020-07-21        2020-07-21 00:00:00.000000  Cancer                    40757663
1194            555        40757663                2020-10-03        2020-10-03 00:00:00.000000  Cancer                    40757663
1195            992        40757663                2021-06-20        2021-06-20 00:00:00.000000  Cancer                    40757663
1196            992        40757663                2019-05-13        2019-05-13 00:00:00.000000  Cancer                    40757663
1197            992        40757663                2019-08-25        2019-08-25 00:00:00.000000  Cancer                    40757663

1196 rows × 6 columns
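
Because the tables retrieved this way behave as ordinary pandas DataFrames (the dropna(axis=1) call above relies on exactly that), they can also be dumped straight to disk for a quick look. A minimal sketch, where the filename is just an example (the tool's own output mechanism, create_csv_store, is shown in the next section):

# quick manual export of the processed observation table
instance['observation'].dropna(axis=1).to_csv('observation_preview.tsv', sep='\t')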

Manually edited

By generating a python class from the rules file, you can manually edit the python file to set up the i/o as well as make edits to the various tables. Once done, it can simply be run as a python script:

python  ExampleDatasetModified.py

# %load ExampleDatasetModified.py
from carrot.cdm import define_person, define_condition_occurrence, define_visit_occurrence, define_measurement, define_observation, define_drug_exposure
from carrot.cdm import CommonDataModel
from carrot.tools import load_csv,create_csv_store
import json
import glob
import pandas as pd

class ExampleDatasetModified(CommonDataModel):

    def __init__(self,**kwargs):
        """ 
        initialise the inputs and setup indexing 
        """
        inputs = load_csv(glob.glob('../data/part1/*'))
        outputs = create_csv_store(output_folder="./data_tests/",
                                                   sep="\t",
                                                   write_separate=True,
                                                   write_mode='w')

        super().__init__(inputs=inputs,outputs=outputs,**kwargs)
        self.process()

    @define_person
    def person_0(self):
        """
        Create CDM object for person
        """
        self.birth_datetime.series = self.inputs["Demographics.csv"]["Age"]
        self.gender_concept_id.series = self.inputs["Demographics.csv"]["Sex"]
        self.gender_source_concept_id.series = self.inputs["Demographics.csv"]["Sex"]
        self.gender_source_value.series = self.inputs["Demographics.csv"]["Sex"]
        self.person_id.series = self.inputs["Demographics.csv"]["ID"]

        # --- insert field operations --- 
        self.birth_datetime.series = self.tools.get_datetime_from_age(self.birth_datetime.series)

        # --- insert term mapping --- 
        self.gender_concept_id.series = self.gender_concept_id.series.map(
            {
                "Male": 8507,
                "Female": 8532
            }
        )

    @define_observation
    def observation_0(self):
        """
        Create CDM object for observation
        """

        def convert_igg(x):
            """
            A custom function to convert the IgG into g/L
            """
            try:
                igg = float(x['IgG'])
            except:
                return None
            #example of a dataset where the assay has been recalibrated after a certain date
            #therefore you might need to do some conversion based upon the date
            factor = 1.2 if x['Date'].year < 2021 else 1

            #apply a factor to convert to g/L
            factor = factor * 10

            #return the modified IgG value
            return igg*factor

        #save the source value of the IgG
        self.observation_source_value.series = self.inputs["Serology.csv"]["IgG"]

        #convert the date into a datetime object
        self.inputs["Serology.csv"]["Date"] =  pd.to_datetime(self.inputs["Serology.csv"]["Date"],
                                                             errors='coerce')

        #recalculate the IgG based upon a custom function
        self.inputs["Serology.csv"]["IgG"] = self.inputs["Serology.csv"].apply(
                                                            lambda x: convert_igg(x),axis=1)
        #set the output units
        self.inputs["Serology.csv"]["Units"] = 'g/L'

        #set additional columns we did not have before...
        self.unit_source_value.series = self.inputs["Serology.csv"]["Units"]
        self.value_as_number.series = self.inputs["Serology.csv"]["IgG"]


        self.observation_concept_id.series = self.inputs["Serology.csv"]["IgG"]
        self.observation_datetime.series = self.inputs["Serology.csv"]["Date"]
        self.observation_source_concept_id.series = self.inputs["Serology.csv"]["IgG"]
        self.person_id.series = self.inputs["Serology.csv"]["ID"]


        # --- insert term mapping --- 
        self.observation_concept_id.series = self.tools.make_scalar(self.observation_concept_id.series,4288455)
        self.observation_source_concept_id.series = self.tools.make_scalar(self.observation_source_concept_id.series,4288455)
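
The file above only defines the class; to run it directly from the command line with python ExampleDatasetModified.py, one option is to append a small entry point to the bottom of the file. This is a minimal sketch, assuming the class and i/o setup shown above:

if __name__ == "__main__":
    #creating an instance triggers self.process() in __init__,
    #which runs the ETL and writes the outputs to ./data_tests/
    ExampleDatasetModified()

Within the notebook, the same pipeline runs as soon as the class is instantiated: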
instance = ExampleDatasetModified()
instance
2022-06-17 15:17:57 - LocalDataCollection - INFO - DataCollection Object Created
2022-06-17 15:17:57 - LocalDataCollection - INFO - Registering  Blood_Test.csv [<carrot.io.common.DataBrick object at 0x116322730>]
2022-06-17 15:17:57 - LocalDataCollection - INFO - Registering  Demographics.csv [<carrot.io.common.DataBrick object at 0x111f1ba90>]
2022-06-17 15:17:57 - LocalDataCollection - INFO - Registering  GP_Records.csv [<carrot.io.common.DataBrick object at 0x111f56d60>]
2022-06-17 15:17:57 - LocalDataCollection - INFO - Registering  Hospital_Visit.csv [<carrot.io.common.DataBrick object at 0x111f56c10>]
2022-06-17 15:17:57 - LocalDataCollection - INFO - Registering  Serology.csv [<carrot.io.common.DataBrick object at 0x116458a00>]
2022-06-17 15:17:57 - LocalDataCollection - INFO - Registering  Symptoms.csv [<carrot.io.common.DataBrick object at 0x116458f40>]
2022-06-17 15:17:57 - LocalDataCollection - INFO - Registering  Vaccinations.csv [<carrot.io.common.DataBrick object at 0x1163e86a0>]
2022-06-17 15:17:57 - LocalDataCollection - INFO - Registering  pks.csv [<carrot.io.common.DataBrick object at 0x1163e8b80>]
2022-06-17 15:17:57 - LocalDataCollection - INFO - DataCollection Object Created
2022-06-17 15:17:57 - ExampleDatasetModified - INFO - CommonDataModel (5.3.1) created with co-connect-tools version 0.0.0
2022-06-17 15:17:57 - ExampleDatasetModified - INFO - Running with an DataCollection object
2022-06-17 15:17:57 - ExampleDatasetModified - INFO - Turning on automatic cdm column filling
2022-06-17 15:17:57 - ExampleDatasetModified - INFO - Added observation_0 of type observation
2022-06-17 15:17:57 - ExampleDatasetModified - INFO - Added person_0 of type person
2022-06-17 15:17:57 - ExampleDatasetModified - INFO - Starting processing in order: ['person', 'observation']
2022-06-17 15:17:57 - ExampleDatasetModified - INFO - Number of objects to process for each table...
{
      "observation": 1,
      "person": 1
}
2022-06-17 15:17:57 - ExampleDatasetModified - INFO - for person: found 1 object
2022-06-17 15:17:57 - ExampleDatasetModified - INFO - working on person
2022-06-17 15:17:57 - ExampleDatasetModified - INFO - starting on person_0
2022-06-17 15:17:57 - LocalDataCollection - INFO - Retrieving initial dataframe for 'Demographics.csv' for the first time
2022-06-17 15:17:57 - Person - WARNING - Requiring non-null values in gender_concept_id removed 3 rows, leaving 997 rows.
2022-06-17 15:17:57 - Person - WARNING - Requiring non-null values in birth_datetime removed 1 rows, leaving 996 rows.
2022-06-17 15:17:57 - Person - INFO - Automatically formatting data columns.
2022-06-17 15:17:58 - Person - INFO - created df (0x11654bcd0)[person_0]

could not convert string to float: 'na'

2022-06-17 15:17:58 - ExampleDatasetModified - INFO - finished person_0 (0x11654bcd0) ... 1/1 completed, 996 rows
2022-06-17 15:17:58 - LocalDataCollection - INFO - saving person_ids.0x1164a5d30.2022-06-17T141758 to ./data_tests//person_ids.0x1164a5d30.2022-06-17T141758.tsv
2022-06-17 15:17:58 - LocalDataCollection - INFO - finished save to file
2022-06-17 15:17:58 - ExampleDatasetModified - INFO - saving dataframe (0x1164852e0) to <carrot.io.plugins.local.LocalDataCollection object at 0x1164585e0>
2022-06-17 15:17:58 - LocalDataCollection - INFO - saving person.person_0.0x1164852e0.2022-06-17T141758 to ./data_tests//person.person_0.0x1164852e0.2022-06-17T141758.tsv
2022-06-17 15:17:58 - LocalDataCollection - INFO - finished save to file
2022-06-17 15:17:58 - ExampleDatasetModified - INFO - finalised person on iteration 0 producing 996 rows from 1 tables
2022-06-17 15:17:58 - LocalDataCollection - INFO - Getting next chunk of data
2022-06-17 15:17:58 - LocalDataCollection - INFO - All input files for this object have now been used.
2022-06-17 15:17:58 - LocalDataCollection - INFO - resetting used bricks
2022-06-17 15:17:58 - ExampleDatasetModified - INFO - for observation: found 1 object
2022-06-17 15:17:58 - ExampleDatasetModified - INFO - working on observation
2022-06-17 15:17:58 - ExampleDatasetModified - INFO - starting on observation_0
2022-06-17 15:17:58 - LocalDataCollection - INFO - Retrieving initial dataframe for 'Serology.csv' for the first time
2022-06-17 15:17:58 - Observation - WARNING - Requiring non-null values in observation_datetime removed 2 rows, leaving 413 rows.
2022-06-17 15:17:58 - Observation - INFO - Automatically formatting data columns.
2022-06-17 15:17:58 - Observation - INFO - created df (0x1164a5af0)[observation_0]
2022-06-17 15:17:58 - ExampleDatasetModified - INFO - finished observation_0 (0x1164a5af0) ... 1/1 completed, 413 rows
2022-06-17 15:17:58 - ExampleDatasetModified - ERROR - There are person_ids in this table that are not in the output person table!
2022-06-17 15:17:58 - ExampleDatasetModified - ERROR - Either they are not in the original data, or while creating the person table, 
2022-06-17 15:17:58 - ExampleDatasetModified - ERROR - studies have been removed due to lack of required fields, such as birthdate.
2022-06-17 15:17:58 - ExampleDatasetModified - ERROR - 410/413 were good, 3 studies are removed.
2022-06-17 15:17:58 - ExampleDatasetModified - INFO - saving dataframe (0x11659e9d0) to <carrot.io.plugins.local.LocalDataCollection object at 0x1164585e0>
2022-06-17 15:17:58 - LocalDataCollection - INFO - saving observation.observation_0.0x11659e9d0.2022-06-17T141758 to ./data_tests//observation.observation_0.0x11659e9d0.2022-06-17T141758.tsv
2022-06-17 15:17:58 - LocalDataCollection - INFO - finished save to file
2022-06-17 15:17:58 - ExampleDatasetModified - INFO - finalised observation on iteration 0 producing 410 rows from 1 tables
2022-06-17 15:17:58 - LocalDataCollection - INFO - Getting next chunk of data
2022-06-17 15:17:58 - LocalDataCollection - INFO - All input files for this object have now been used.

<__main__.ExampleDatasetModified at 0x116458b20>
instance.keys()
dict_keys(['person', 'observation'])
instance['observation'].dropna(axis=1)
                person_id  observation_concept_id  observation_date        observation_datetime  observation_source_value  observation_source_concept_id  unit_source_value
observation_id
1                     650                 4288455        2020-10-03  2020-10-03 00:00:00.000000         17.172114692899758                         4288455                g/L
2                     457                 4288455        2020-11-02  2020-11-02 00:00:00.000000         201.93861878809216                         4288455                g/L
3                     983                 4288455        2021-07-26  2021-07-26 00:00:00.000000         11.506250956970998                         4288455                g/L
4                     696                 4288455        2021-10-29  2021-10-29 00:00:00.000000         2.6594057121417487                         4288455                g/L
5                     751                 4288455        2021-09-07  2021-09-07 00:00:00.000000         40.844873593089126                         4288455                g/L
...                   ...                     ...               ...                         ...                        ...                             ...                ...
409                   187                 4288455        2022-11-07  2022-11-07 00:00:00.000000          51.77573831029082                         4288455                g/L
410                   886                 4288455        2022-09-07  2022-09-07 00:00:00.000000          57.11515081936336                         4288455                g/L
411                    50                 4288455        2022-11-07  2022-11-07 00:00:00.000000         15.264660709568151                         4288455                g/L
412                   260                 4288455        2019-11-13  2019-11-13 00:00:00.000000         26.051354325968106                         4288455                g/L
413                   370                 4288455        2020-05-25  2020-05-25 00:00:00.000000          4.266438928364172                         4288455                g/L

410 rows × 7 columns
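
Because the outputs were created with create_csv_store(output_folder="./data_tests/", sep="\t", write_separate=True), each finalised table has also been written to its own timestamped .tsv file, as shown in the log messages above. A minimal sketch for reading those files back with pandas (the exact file names contain the object hash and timestamp, so glob is used rather than hard-coding them):

import glob
import pandas as pd

#load whichever observation table(s) were written during this run
for path in sorted(glob.glob("./data_tests/observation.*.tsv")):
    observation = pd.read_csv(path, sep="\t")
    print(path, observation.shape)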