Skip to content

By Hand

Define a Model

We can use carrot.cdm to create destination tables and fields for our synthetic data model

from carrot.cdm.objects.common import DestinationTable, DestinationField

class Demographics(DestinationTable):
    """Destination table holding one person's ID, Age and Sex."""

    name = 'Demo'

    def __init__(self, name=None, **kwargs):
        # The `name` argument and **kwargs are accepted but not used here:
        # the class-level `name` is what is handed to DestinationTable.
        self.ID = DestinationField(dtype="Text50", required=True)
        self.Age = DestinationField(dtype="Integer", required=False)
        self.Sex = DestinationField(dtype="Text50", required=False)
        super().__init__(self.name, type(self).__name__)

class Symptoms(DestinationTable):
    """Destination table of symptom reports: an ID, a date and six Yes/No style symptom fields."""

    name = 'Symptoms'

    def __init__(self, name=None, **kwargs):
        # The `name` argument and **kwargs are accepted but not used here:
        # the class-level `name` is what is handed to DestinationTable.
        self.ID = DestinationField(dtype="Text50", required=True)
        self.date_occurrence = DestinationField(dtype="Timestamp", required=False)
        # Every symptom is an optional text field; they are created in the
        # same order as they are listed here.
        for symptom in ('Headache', 'Fatigue', 'Dizzy', 'Cough', 'Fever', 'Muscle_Pain'):
            setattr(self, symptom, DestinationField(dtype="Text50", required=False))
        super().__init__(self.name, type(self).__name__)

class GP_Records(DestinationTable):
    """Destination table of GP visits: ID, visit date, a comorbidity and its value."""

    name = 'GP_Records'

    def __init__(self, name=None, **kwargs):
        # The `name` argument and **kwargs are accepted but not used here:
        # the class-level `name` is what is handed to DestinationTable.
        self.ID = DestinationField(dtype="Text50", required=True)
        self.date_of_visit = DestinationField(dtype="Timestamp", required=False)
        self.comorbidity = DestinationField(dtype="Text50", required=False)
        self.comorbidity_value = DestinationField(dtype="Float", required=False)
        super().__init__(self.name, type(self).__name__)

class Hospital_Visit(DestinationTable):
    """Destination table of hospital admissions: ID, admission date and reason."""

    name = 'Hospital_Visit'

    def __init__(self, name=None, **kwargs):
        # The `name` argument and **kwargs are accepted but not used here:
        # the class-level `name` is what is handed to DestinationTable.
        self.ID = DestinationField(dtype="Text50", required=True)
        self.admission_date = DestinationField(dtype="Timestamp", required=False)
        self.reason = DestinationField(dtype="Text50", required=False)
        super().__init__(self.name, type(self).__name__)

class Blood_Test(DestinationTable):
    """Destination table of blood tests: ID, date taken, location and quantity."""

    name = 'Blood_Test'

    def __init__(self, name=None, **kwargs):
        # The `name` argument and **kwargs are accepted but not used here:
        # the class-level `name` is what is handed to DestinationTable.
        self.ID = DestinationField(dtype="Text50", required=True)
        self.date_taken = DestinationField(dtype="Timestamp", required=False)
        self.location = DestinationField(dtype="Text50", required=False)
        self.quantity = DestinationField(dtype="Float", required=False)
        super().__init__(self.name, type(self).__name__)

class Vaccinations(DestinationTable):
    """Destination table of vaccinations: ID, date, vaccine type and stage."""

    name = 'Vaccinations'

    def __init__(self, name=None, **kwargs):
        # The `name` argument and **kwargs are accepted but not used here:
        # the class-level `name` is what is handed to DestinationTable.
        self.ID = DestinationField(dtype="Text50", required=True)
        self.date_of_vaccination = DestinationField(dtype="Timestamp", required=False)
        self.type = DestinationField(dtype="Text50", required=False)
        self.stage = DestinationField(dtype="Integer", required=False)
        super().__init__(self.name, type(self).__name__)


class Serology(DestinationTable):
    """Destination table of serology results: ID, Date (both required) and an IgG value."""

    name = 'Serology'

    def __init__(self, name=None, **kwargs):
        # The `name` argument and **kwargs are accepted but not used here:
        # the class-level `name` is what is handed to DestinationTable.
        self.ID = DestinationField(dtype="Text50", required=True)
        self.Date = DestinationField(dtype="Timestamp", required=True)
        self.IgG = DestinationField(dtype="Float", required=False)
        super().__init__(self.name, type(self).__name__)

Then build a complete model (dataset) based upon these tables, generated for 50k people

import pandas as pd
import numpy as np
import datetime
import time
import io
import carrot
from carrot.cdm import CommonDataModel
from carrot.cdm import define_table

def create_gaus_time_series(mu,sigma,n):
    """
    Draw n dates from a normal (Gaussian) distribution in time.

    Args:
        mu: the centre of the distribution; anything with a .timetuple()
            method (e.g. datetime.datetime), interpreted as local time
        sigma (dict): keyword arguments for datetime.timedelta giving the
            width of the distribution, e.g. {'days':365}
        n (int): the number of dates to draw

    Returns:
        pd.Series of datetime.date objects (local time), of length n
    """
    #work in seconds since the epoch so the normal draw is a plain number
    centre_seconds = time.mktime(mu.timetuple())
    width_seconds = datetime.timedelta(**sigma).total_seconds()
    drawn_seconds = np.random.normal(centre_seconds,width_seconds,n)
    return pd.Series(list(map(datetime.date.fromtimestamp,drawn_seconds)))

class ExampleCovid19DataSet(CommonDataModel):
    """
    Synthetic COVID-19 style dataset.

    Each method decorated with @define_table fills one destination table.
    Inside those methods `self` is used as the table being filled (fields are
    set through self.<field>.series) and the model itself is reached through
    self.cdm, which also gives access to the tables that were built earlier
    (e.g. self.cdm.demo).
    """
    def __init__(self, n=50000,
                 connection_string="postgresql://localhost:5432/ExampleCOVID19DataSet"):
        """
        Initialise the outputs, set up the people index and run the model.

        Args:
            n (int): number of people to generate (default 50k)
            connection_string (str): where the sql store is created;
                any existing content is dropped (drop_existing=True)
        """
        outputs = carrot.tools.create_sql_store(connection_string=connection_string,
                                                drop_existing=True)
        super().__init__(format_level=0,outputs=outputs)

        #create people indexes that we can use in the different tables
        self.people = pd.DataFrame([f'pk{i}' for i in range(1,n+1)],columns=['pks'])

        #set the processing order, e.g we want to build demographics table first
        #so that the values recorded in other tables can be demographically dependent
        self.set_execution_order([
            'Demographics',
            'GP_Records',
            'Vaccinations',
            'Serology',
            'Symptoms',
            'Hospital_Visit',
            'Blood_Test'
        ])
        self.process()

    @define_table(Demographics)
    def demo(self):
        """
        Straightforward demographics, one row per person
        """
        self.ID.series = self.cdm.people['pks']
        #kept on the table so that other tables can look up the number of people
        self.n = len(self.ID.series)
        #ages are drawn from a normal distribution (mean 60, width 20) and truncated to integers
        self.Age.series = pd.Series(np.random.normal(60,20,self.n)).astype(int)
        #a negative age is not meaningful, record it as missing instead
        self.Age.series = self.Age.series.mask(self.Age.series < 0 , None)
        #0.5% of people have no recorded sex
        self.Sex.series = pd.Series(np.random.choice(['Male','Female',None],size=self.n,p=[0.55,0.445,0.005]))


    @define_table(Symptoms)
    def symptoms(self):
        """
        Symptom reports: 5 reports per person in total, spread over 80% of the people
        """
        npeople = self.cdm.demo.n
        nsymptoms = npeople*5

        ID = self.cdm.demo.ID.series

        #pick the 80% of people who report symptoms,
        #then draw the reports from them with replacement
        self.ID.series = ID.sample(int(npeople*0.8))\
            .sample(nsymptoms,replace=True)\
            .sort_values().reset_index(drop=True)

        self.date_occurrence.series = create_gaus_time_series(mu=datetime.datetime(2021,1,1),
                                                              sigma={'days':365},
                                                              n=nsymptoms)

        #0.5% of the reports have a missing date
        self.date_occurrence.series.loc[self.date_occurrence.series.sample(frac=0.005).index] = np.nan

        #probability of answering 'Yes' for each symptom
        syms_probs = {'Headache':0.8,'Fatigue':0.7,'Dizzy':0.4,'Cough':0.7,'Fever':0.2,'Muscle_Pain':0.1}
        for key,p in syms_probs.items():
            series = pd.Series(np.random.choice(['Yes','No'],size=nsymptoms,p=[p,1-p]))
            getattr(self,key).series = series

    @define_table(Serology)
    def serology(self):
        """
        IgG measurements for adults, depending on age, sex and the number of comorbidities
        """

        def calc_IgG(age,sex,nrisks):
            #the scale falls with age and with the number of risks,
            #and is 10% higher for females
            scale = 50*(1 - age/200)*(1.1 if sex=='Female' else 1.0)*(1/nrisks)
            return np.random.exponential(scale=scale)

        #count the (non-null) comorbidities recorded for each person
        df_gp = self.cdm.gp.get_df()
        df_nrisks = df_gp['comorbidity'].groupby(df_gp.index)\
                    .count()
        df_nrisks.name ='nrisks'

        df = self.cdm.demo.get_df().join(df_nrisks).reset_index()
        #people without a GP record get nrisks=1 so that 1/nrisks is defined
        df['nrisks'] = df['nrisks'].fillna(1)

        #30% of the people older than 18 take part
        df = df[df['Age']>18].sample(frac=0.3)

        #draw 1.4 measurements per participant, with replacement,
        #so some people have several measurements and some have none
        df = df.sample(frac=1.4,replace=True).reset_index()

        df['IgG'] = df.apply(lambda x : calc_IgG(x.Age,x.Sex,x.nrisks),axis=1)
        df.sort_values('ID',inplace=True)

        self.IgG.series = df['IgG']
        self.ID.series = df['ID']
        self.Date.series = create_gaus_time_series(mu=datetime.datetime(2021,5,1),
                                                              sigma={'days':365},
                                                              n=len(df))

    @define_table(GP_Records)
    def gp(self):
        """
        GP records: one row per comorbidity found at a person's GP visit
        """

        def calc_comorbidities(age):
            #no comorbidities can be calculated without an age
            if pd.isna(age):
                return []
            #probability of each comorbidity, increasing with age
            #'BMI' has probability 1, so it is always recorded
            comorbidities = {
                'Mental Health':0.3*(1 + age/90) ,
                'Diabetes Type-II':0.15*(1 + age/70) ,
                'Heart Condition':0.1*(1 + age/50) ,
                'High Blood Pressure':0.07*(1 + age/60),
                'BMI': 1
            }
            return [x for x,p in comorbidities.items() if np.random.uniform() < p]

        #90% of people have a GP visit record
        df = self.cdm.demo.get_df().sample(frac=0.9).reset_index()

        df['comorbidity'] = df.apply(lambda x: calc_comorbidities(x.Age),axis=1)
        df['date_of_observation'] = create_gaus_time_series(mu=datetime.datetime(2010,5,1),
                                                              sigma={'days':700},
                                                              n=len(df))

        #one row per comorbidity; an empty list becomes one row with a missing comorbidity
        df = df.explode('comorbidity').set_index('ID').sort_index()

        self.ID.series = df.index.to_series()
        self.comorbidity.series = df['comorbidity']
        #only BMI has a measured value, every other row is recorded with the value 1
        self.comorbidity_value.series = df['comorbidity'].apply(lambda x: np.random.exponential(scale=20)
                                                                if x == 'BMI' else 1)
        self.date_of_visit.series = df['date_of_observation']

    @define_table(Hospital_Visit)
    def hospital(self):
        """
        Hospital visits: 1.2 visits per person in total, drawn from the whole population
        """

        n = len(self.cdm.demo.ID.series)

        #the visits are drawn with replacement from everybody, so some people
        #have multiple visits and some have none (roughly 70% have at least one)
        self.ID.series = self.cdm.demo.ID.series\
                        .sample(int(n*1.2),replace=True)\
                        .sort_values().reset_index(drop=True)

        n = len(self.ID.series)
        self.admission_date.series = create_gaus_time_series(mu=datetime.datetime(2020,5,1),
                                                              sigma={'days':300},
                                                              n=n)

        #probability of each reason for admission (sums to 1)
        reasons = {
            'Kidney Operation':0.1,
            'Appendix Operation':0.1,
            'Heart Attack':0.2,
            'COVID-19':0.15,
            'Pneumonia':0.15,
            'Cancer':0.3
        }

        self.reason.series = pd.Series(np.random.choice(list(reasons.keys()),size=n,p=list(reasons.values())))

    @define_table(Blood_Test)
    def bloods(self):
        """
        Blood tests taken within 5 days of a hospital admission
        """
        #half of the hospital visits have blood taken
        df_hospital = self.cdm.hospital.get_df().sample(frac=0.5).reset_index()
        n = len(df_hospital)

        self.ID.series = df_hospital['ID']
        #draw a separate delay (0-5 days) for every test;
        #a single scalar would shift every row by the same amount
        delay = pd.Series(pd.to_timedelta(np.random.uniform(0,5,size=n),unit='D'),
                          index=df_hospital.index)
        self.date_taken.series = pd.to_datetime(df_hospital['admission_date']) + delay

        self.location.series = pd.Series(np.random.choice(['Right Arm','Left Arm','Small Intestine','Abdominal Wall'],
                                                   size=n,
                                                   p=[0.3,0.3,0.2,0.2]))
        self.quantity.series = pd.Series(np.random.exponential(scale=1.5,size=n))

    @define_table(Vaccinations)
    def first_covid_vaccination(self):
        """
        First vaccination (stage 0); older people are vaccinated earlier
        """

        def calc_date_of_vacc(age):
            #no date can be calculated without an age
            if pd.isna(age):
                return np.nan
            start_date = datetime.datetime(2021,1,1)
            #2 days earlier per year of age, plus up to 50 days of random spread
            tdelta = datetime.timedelta(days=(300-age*2)+np.random.uniform(0,50))

            return start_date + tdelta

        #90% of people have had a vaccination
        df = self.cdm.demo.get_df().sample(frac=0.9).reset_index()

        self.ID.series = df['ID']
        self.date_of_vaccination.series =  df.apply(lambda x : calc_date_of_vacc(x.Age),axis=1)
        n = len(self.ID.series)
        self.type.series = pd.Series(np.random.choice(['Moderna','AstraZenica','Pfizer'],size=n,p=[0.34,0.33,0.33]))
        self.stage.series = pd.Series((0 for _ in range(0,n)))

    @define_table(Vaccinations)
    def second_covid_vaccination(self):
        """
        Second vaccination (stage 1), 50-100 days after the first one
        """

        #80% of people who had 1st had 2nd
        df = self.cdm.first_covid_vaccination.get_df().sample(frac=0.8).reset_index()
        n = len(df)

        self.ID.series = df['ID']
        #draw a separate gap (50-100 days) for every person;
        #a single scalar would shift every row by the same amount
        gap = pd.Series(pd.to_timedelta(50+np.random.uniform(0,50,size=n),unit='D'),
                        index=df.index)
        self.date_of_vaccination.series =  pd.to_datetime(df['date_of_vaccination']) + gap
        self.type.series = pd.Series(np.random.choice(['Moderna','AstraZenica','Pfizer'],size=n,p=[0.34,0.33,0.33]))
        self.stage.series = pd.Series((1 for _ in range(0,n)))

Create a Model

Create and run the model. For a dataset this large and complex, this can take some time

# Creating the model also runs it: __init__ ends by calling self.process()
model = ExampleCovid19DataSet()
model
2022-06-17 15:03:24 - SqlDataCollection - INFO - DataCollection Object Created
2022-06-17 15:03:25 - SqlDataCollection - INFO - Engine(postgresql://localhost:5432/ExampleCOVID19DataSet)
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - CommonDataModel (5.3.1) created with co-connect-tools version 0.0.0
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - Turning on automatic cdm column filling
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - Added bloods of type Blood_Test
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - Added demo of type Demographics
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - Added first_covid_vaccination of type Vaccinations
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - Added gp of type GP_Records
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - Added hospital of type Hospital_Visit
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - Added second_covid_vaccination of type Vaccinations
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - Added serology of type Serology
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - Added symptoms of type Symptoms
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - Starting processing in order: ['Demographics', 'GP_Records', 'Vaccinations', 'Serology', 'Symptoms', 'Hospital_Visit', 'Blood_Test']
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - Number of objects to process for each table...
{
      "Blood_Test": 1,
      "Demographics": 1,
      "Vaccinations": 2,
      "GP_Records": 1,
      "Hospital_Visit": 1,
      "Serology": 1,
      "Symptoms": 1
}
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - for Demographics: found 1 object
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - working on Demographics
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - starting on demo
2022-06-17 15:03:25 - Demographics - INFO - Not formatting data columns
2022-06-17 15:03:25 - Demographics - INFO - created df (0x1076b2a90)[demo]
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - finished demo (0x1076b2a90) ... 1/1 completed, 50000 rows
2022-06-17 15:03:25 - ExampleCovid19DataSet - ERROR - Removed 49652 row(s) due to duplicates found when merging Demographics
2022-06-17 15:03:25 - ExampleCovid19DataSet - WARNING - Example duplicates...
2022-06-17 15:03:25 - ExampleCovid19DataSet - WARNING -        Age     Sex
ID                
pk1   62.0    Male
pk2   56.0  Female
pk3   37.0    Male
pk4   57.0  Female
pk5   55.0    Male
pk6   95.0    Male
pk7   49.0    Male
pk8   76.0    Male
pk9   90.0  Female
pk10  62.0    Male
2022-06-17 15:03:25 - ExampleCovid19DataSet - INFO - saving dataframe (0x1076b22b0) to <carrot.io.plugins.sql.SqlDataCollection object at 0x107742100>
2022-06-17 15:03:25 - SqlDataCollection - INFO - updating Demographics in Engine(postgresql://localhost:5432/ExampleCOVID19DataSet)
2022-06-17 15:03:25 - SqlDataCollection - INFO - finished save to psql
2022-06-17 15:03:26 - ExampleCovid19DataSet - INFO - finalised Demographics on iteration 0 producing 50000 rows from 1 tables
2022-06-17 15:03:26 - ExampleCovid19DataSet - INFO - for GP_Records: found 1 object
2022-06-17 15:03:26 - ExampleCovid19DataSet - INFO - working on GP_Records
2022-06-17 15:03:26 - ExampleCovid19DataSet - INFO - starting on gp
2022-06-17 15:03:31 - GP_Records - INFO - Not formatting data columns
2022-06-17 15:03:31 - GP_Records - INFO - created df (0x10bc8f070)[gp]
2022-06-17 15:03:31 - ExampleCovid19DataSet - INFO - finished gp (0x10bc8f070) ... 1/1 completed, 96181 rows
2022-06-17 15:03:31 - ExampleCovid19DataSet - ERROR - Removed 39336 row(s) due to duplicates found when merging GP_Records
2022-06-17 15:03:31 - ExampleCovid19DataSet - WARNING - Example duplicates...
2022-06-17 15:03:31 - ExampleCovid19DataSet - WARNING -         date_of_visit       comorbidity  comorbidity_value
ID                                                        
pk1        2007-10-21  Diabetes Type-II                1.0
pk10       2009-01-03     Mental Health                1.0
pk100      2010-04-11     Mental Health                1.0
pk100      2010-04-11  Diabetes Type-II                1.0
pk1000     2006-10-08     Mental Health                1.0
pk10000    2012-02-26     Mental Health                1.0
pk10001    2007-07-17   Heart Condition                1.0
pk10002    2012-12-29  Diabetes Type-II                1.0
pk10002    2012-12-29   Heart Condition                1.0
pk10003    2010-04-01     Mental Health                1.0
2022-06-17 15:03:31 - ExampleCovid19DataSet - INFO - saving dataframe (0x10b690b50) to <carrot.io.plugins.sql.SqlDataCollection object at 0x107742100>
2022-06-17 15:03:31 - SqlDataCollection - INFO - updating GP_Records in Engine(postgresql://localhost:5432/ExampleCOVID19DataSet)
2022-06-17 15:03:37 - SqlDataCollection - INFO - finished save to psql
2022-06-17 15:03:37 - ExampleCovid19DataSet - INFO - finalised GP_Records on iteration 0 producing 96181 rows from 1 tables
2022-06-17 15:03:37 - ExampleCovid19DataSet - INFO - for Vaccinations: found 2 objects
2022-06-17 15:03:37 - ExampleCovid19DataSet - INFO - working on Vaccinations
2022-06-17 15:03:37 - ExampleCovid19DataSet - INFO - starting on first_covid_vaccination
2022-06-17 15:03:39 - Vaccinations - INFO - Not formatting data columns
2022-06-17 15:03:39 - Vaccinations - INFO - created df (0x10c0998b0)[first_covid_vaccination]
2022-06-17 15:03:39 - ExampleCovid19DataSet - INFO - finished first_covid_vaccination (0x10c0998b0) ... 1/2 completed, 45000 rows
2022-06-17 15:03:39 - ExampleCovid19DataSet - INFO - starting on second_covid_vaccination
2022-06-17 15:03:39 - Vaccinations - INFO - Not formatting data columns
2022-06-17 15:03:39 - Vaccinations - INFO - created df (0x10bd35070)[second_covid_vaccination]
2022-06-17 15:03:39 - ExampleCovid19DataSet - INFO - finished second_covid_vaccination (0x10bd35070) ... 2/2 completed, 36000 rows
2022-06-17 15:03:39 - ExampleCovid19DataSet - ERROR - Removed 64 row(s) due to duplicates found when merging Vaccinations
2022-06-17 15:03:39 - ExampleCovid19DataSet - WARNING - Example duplicates...
2022-06-17 15:03:39 - ExampleCovid19DataSet - WARNING -                 type  stage
ID                         
pk40950  AstraZenica      0
pk44515  AstraZenica      0
pk33656  AstraZenica      0
pk23624  AstraZenica      0
pk49043      Moderna      0
pk12327  AstraZenica      0
pk45916       Pfizer      0
pk22064       Pfizer      0
pk23701      Moderna      0
pk2449   AstraZenica      0
2022-06-17 15:03:39 - ExampleCovid19DataSet - INFO - saving dataframe (0x1076b29a0) to <carrot.io.plugins.sql.SqlDataCollection object at 0x107742100>

2022-06-17 15:03:39 - SqlDataCollection - INFO - updating Vaccinations in Engine(postgresql://localhost:5432/ExampleCOVID19DataSet)
2022-06-17 15:03:48 - SqlDataCollection - INFO - finished save to psql
2022-06-17 15:03:48 - ExampleCovid19DataSet - INFO - finalised Vaccinations on iteration 0 producing 81000 rows from 2 tables
2022-06-17 15:03:48 - ExampleCovid19DataSet - INFO - for Serology: found 1 object
2022-06-17 15:03:48 - ExampleCovid19DataSet - INFO - working on Serology
2022-06-17 15:03:48 - ExampleCovid19DataSet - INFO - starting on serology
2022-06-17 15:03:49 - Serology - INFO - Not formatting data columns
2022-06-17 15:03:49 - Serology - INFO - created df (0x10bd35be0)[serology]
2022-06-17 15:03:49 - ExampleCovid19DataSet - INFO - finished serology (0x10bd35be0) ... 1/1 completed, 20591 rows
2022-06-17 15:03:49 - ExampleCovid19DataSet - INFO - saving dataframe (0x10b6b6cd0) to <carrot.io.plugins.sql.SqlDataCollection object at 0x107742100>
2022-06-17 15:03:49 - SqlDataCollection - INFO - updating Serology in Engine(postgresql://localhost:5432/ExampleCOVID19DataSet)
2022-06-17 15:03:51 - SqlDataCollection - INFO - finished save to psql
2022-06-17 15:03:51 - ExampleCovid19DataSet - INFO - finalised Serology on iteration 0 producing 20591 rows from 1 tables
2022-06-17 15:03:51 - ExampleCovid19DataSet - INFO - for Symptoms: found 1 object
2022-06-17 15:03:51 - ExampleCovid19DataSet - INFO - working on Symptoms
2022-06-17 15:03:51 - ExampleCovid19DataSet - INFO - starting on symptoms
2022-06-17 15:03:53 - Symptoms - INFO - Not formatting data columns
2022-06-17 15:03:53 - Symptoms - INFO - created df (0x1265ee490)[symptoms]
2022-06-17 15:03:53 - ExampleCovid19DataSet - INFO - finished symptoms (0x1265ee490) ... 1/1 completed, 250000 rows
2022-06-17 15:03:53 - ExampleCovid19DataSet - ERROR - Removed 193327 row(s) due to duplicates found when merging Symptoms
2022-06-17 15:03:53 - ExampleCovid19DataSet - WARNING - Example duplicates...
2022-06-17 15:03:53 - ExampleCovid19DataSet - WARNING -      date_occurrence Headache Fatigue Dizzy Cough Fever Muscle_Pain
ID                                                                 
pk1       2021-01-24      Yes     Yes    No   Yes    No         Yes
pk1       2019-05-30      Yes     Yes    No    No   Yes          No
pk1       2021-05-16      Yes      No    No    No   Yes          No
pk1       2022-06-11      Yes     Yes   Yes   Yes    No          No
pk1       2020-06-18      Yes     Yes   Yes   Yes   Yes         Yes
pk1       2021-02-04      Yes      No    No   Yes    No          No
pk10      2021-11-24      Yes     Yes    No   Yes   Yes          No
pk10      2018-12-15       No     Yes    No   Yes    No          No
pk10      2020-12-28      Yes      No    No   Yes    No          No
pk10      2023-04-08      Yes     Yes   Yes   Yes    No          No
2022-06-17 15:03:53 - ExampleCovid19DataSet - INFO - saving dataframe (0x10bd3c220) to <carrot.io.plugins.sql.SqlDataCollection object at 0x107742100>
2022-06-17 15:03:53 - SqlDataCollection - INFO - updating Symptoms in Engine(postgresql://localhost:5432/ExampleCOVID19DataSet)
2022-06-17 15:04:00 - SqlDataCollection - INFO - finished save to psql
2022-06-17 15:04:00 - ExampleCovid19DataSet - INFO - finalised Symptoms on iteration 0 producing 250000 rows from 1 tables
2022-06-17 15:04:00 - ExampleCovid19DataSet - INFO - for Hospital_Visit: found 1 object
2022-06-17 15:04:00 - ExampleCovid19DataSet - INFO - working on Hospital_Visit
2022-06-17 15:04:00 - ExampleCovid19DataSet - INFO - starting on hospital
2022-06-17 15:04:00 - Hospital_Visit - INFO - Not formatting data columns
2022-06-17 15:04:00 - Hospital_Visit - INFO - created df (0x12660f580)[hospital]
2022-06-17 15:04:00 - ExampleCovid19DataSet - INFO - finished hospital (0x12660f580) ... 1/1 completed, 60000 rows
2022-06-17 15:04:00 - ExampleCovid19DataSet - ERROR - Removed 51290 row(s) due to duplicates found when merging Hospital_Visit
2022-06-17 15:04:00 - ExampleCovid19DataSet - WARNING - Example duplicates...
2022-06-17 15:04:00 - ExampleCovid19DataSet - WARNING -         admission_date              reason
ID                                        
pk1         2020-03-20              Cancer
pk1000      2019-02-14           Pneumonia
pk10001     2019-10-10  Appendix Operation
pk10001     2021-01-13              Cancer
pk10001     2018-11-30            COVID-19
pk10001     2021-01-21            COVID-19
pk10002     2020-05-31              Cancer
pk10004     2020-02-12              Cancer
pk10004     2020-05-27    Kidney Operation
pk10005     2020-11-13           Pneumonia
2022-06-17 15:04:00 - ExampleCovid19DataSet - INFO - saving dataframe (0x10bd3c280) to <carrot.io.plugins.sql.SqlDataCollection object at 0x107742100>
2022-06-17 15:04:00 - SqlDataCollection - INFO - updating Hospital_Visit in Engine(postgresql://localhost:5432/ExampleCOVID19DataSet)
2022-06-17 15:04:01 - SqlDataCollection - INFO - finished save to psql
2022-06-17 15:04:01 - ExampleCovid19DataSet - INFO - finalised Hospital_Visit on iteration 0 producing 60000 rows from 1 tables
2022-06-17 15:04:01 - ExampleCovid19DataSet - INFO - for Blood_Test: found 1 object
2022-06-17 15:04:01 - ExampleCovid19DataSet - INFO - working on Blood_Test
2022-06-17 15:04:01 - ExampleCovid19DataSet - INFO - starting on bloods
2022-06-17 15:04:02 - Blood_Test - INFO - Not formatting data columns
2022-06-17 15:04:02 - Blood_Test - INFO - created df (0x126afcdf0)[bloods]
2022-06-17 15:04:02 - ExampleCovid19DataSet - INFO - finished bloods (0x126afcdf0) ... 1/1 completed, 30000 rows
2022-06-17 15:04:02 - ExampleCovid19DataSet - INFO - saving dataframe (0x10bd42ee0) to <carrot.io.plugins.sql.SqlDataCollection object at 0x107742100>
2022-06-17 15:04:02 - SqlDataCollection - INFO - updating Blood_Test in Engine(postgresql://localhost:5432/ExampleCOVID19DataSet)
2022-06-17 15:04:06 - SqlDataCollection - INFO - finished save to psql
2022-06-17 15:04:06 - ExampleCovid19DataSet - INFO - finalised Blood_Test on iteration 0 producing 30000 rows from 1 tables

<__main__.ExampleCovid19DataSet at 0x10b6280d0>

Viewing the Model

Print to see what output data tables the model contains

model.keys()
dict_keys(['Demographics', 'GP_Records', 'Vaccinations', 'Serology', 'Symptoms', 'Hospital_Visit', 'Blood_Test'])

Retrieve the dataframes from the model

model['Demographics']
Age Sex
ID
pk1 62.0 Male
pk2 56.0 Female
pk3 37.0 Male
pk4 57.0 Female
pk5 55.0 Male
... ... ...
pk45025 124.0 Male
pk47056 29.0 None
pk49115 61.0 None
pk49903 31.0 None
pk49979 139.0 Male

348 rows × 2 columns

model['GP_Records']
date_of_visit comorbidity comorbidity_value
ID
pk1 2007-10-21 Diabetes Type-II 1.000000
pk1 2007-10-21 Heart Condition 1.000000
pk1 2007-10-21 BMI 48.725715
pk10 2009-01-03 Mental Health 1.000000
pk10 2009-01-03 BMI 23.007635
... ... ... ...
pk9994 2009-08-09 BMI 58.675133
pk9995 2012-09-06 BMI 46.737545
pk9996 2010-04-13 BMI 1.468812
pk9998 2013-02-05 BMI 9.818068
pk9999 2011-05-23 BMI 17.832532

56845 rows × 3 columns

model['Vaccinations']
date_of_vaccination type stage
ID
pk32019 2021-07-18 21:19:36.566949 Moderna 0
pk32821 2021-09-07 23:08:09.986958 AstraZenica 0
pk40642 2021-08-17 22:21:06.467002 Pfizer 0
pk30449 2021-08-31 08:30:15.911471 AstraZenica 0
pk34545 2021-06-26 14:03:28.892069 Moderna 0
... ... ... ...
pk31142 2021-07-08 00:53:53.763942 Pfizer 1
pk22055 2021-08-17 01:57:34.407304 AstraZenica 1
pk38363 2021-11-21 07:48:46.199824 AstraZenica 1
pk38067 2021-10-21 18:15:40.375532 Moderna 1
pk39445 2021-11-05 17:19:06.735946 Pfizer 1

80936 rows × 3 columns

model['Serology']
Date IgG
ID
pk10001 2021-03-17 13.701676
pk10001 2021-07-29 1.077413
pk10005 2020-06-06 39.366639
pk10005 2020-03-02 56.358177
pk10009 2021-05-03 55.585361
... ... ...
pk997 2020-12-14 1.720815
pk9978 2020-11-10 52.239568
pk9982 2023-05-12 5.753619
pk9989 2022-12-16 37.616017
pk9989 2022-12-14 7.415778

20591 rows × 2 columns

model['Symptoms']
date_occurrence Headache Fatigue Dizzy Cough Fever Muscle_Pain
ID
pk1 2021-01-24 Yes Yes No Yes No Yes
pk1 2019-05-30 Yes Yes No No Yes No
pk1 2021-05-16 Yes No No No Yes No
pk1 2022-06-11 Yes Yes Yes Yes No No
pk1 2020-06-18 Yes Yes Yes Yes Yes Yes
... ... ... ... ... ... ... ...
pk9992 2021-11-22 No No No No No Yes
pk9992 2019-11-07 Yes Yes No Yes Yes Yes
pk9992 2018-09-01 Yes Yes No No Yes Yes
pk9993 2018-12-02 No No No No Yes No
pk9996 2018-08-11 Yes Yes No Yes No Yes

56673 rows × 7 columns

model['Blood_Test']
date_taken location quantity
ID
pk31048 2019-05-24 13:00:52.371957 Right Arm 0.858088
pk42771 2020-12-04 13:00:52.371957 Right Arm 0.362495
pk9294 2019-08-18 13:00:52.371957 Left Arm 0.674107
pk34653 2020-06-11 13:00:52.371957 Small Intestine 0.994321
pk34474 2020-06-13 13:00:52.371957 Left Arm 0.520345
... ... ... ...
pk2856 2021-03-03 13:00:52.371957 Left Arm 0.358316
pk46915 2020-01-19 13:00:52.371957 Left Arm 0.387136
pk32960 2019-10-12 13:00:52.371957 Left Arm 0.563794
pk30995 2018-05-29 13:00:52.371957 Abdominal Wall 0.665672
pk21561 2019-10-16 13:00:52.371957 Left Arm 2.318754

30000 rows × 3 columns

model['Hospital_Visit']
admission_date reason
ID
pk1 2020-03-20 Cancer
pk1000 2019-02-14 Pneumonia
pk10001 2019-10-10 Appendix Operation
pk10001 2021-01-13 Cancer
pk10001 2018-11-30 COVID-19
... ... ...
pk9843 2018-10-04 COVID-19
pk9904 2018-05-15 Appendix Operation
pk9913 2022-06-14 Pneumonia
pk9915 2021-12-24 COVID-19
pk9926 2017-11-18 Cancer

8710 rows × 2 columns

Perform analysis

First, get the GP records and count the number of comorbidities each patient has, to create risk groups

# GP_Records is indexed by patient ID, so grouping on the index gives one entry per patient
df_gp = model['GP_Records']
# count() ignores missing values, so a patient whose only row has no comorbidity gets 0
df_nrisks = df_gp['comorbidity'].groupby(df_gp.index).count().rename('nrisks')
# number of patients in each risk group
df_nrisks.value_counts().to_frame().sort_index()
nrisks
0 40
1 35767
2 6920
3 1927
4 313
5 33

Build a dataframe for analysis by joining the serology data, the demographics data and the number of risks

# Join on the patient ID index; patients missing from the other tables get NaN
df = model['Serology'].join(model['Demographics']).join(df_nrisks)
# Only the risk count has a sensible default: no GP record means no recorded comorbidities.
# Age and Sex are left missing, because filling them with 0 would put every
# unmatched patient into the 'Age < 50' group of the plots
df['nrisks'] = df['nrisks'].fillna(0)
df
Date IgG Age Sex nrisks
ID
pk10001 2021-03-17 13.701676 0.0 0 2.0
pk10001 2021-07-29 1.077413 0.0 0 2.0
pk10005 2020-06-06 39.366639 0.0 0 3.0
pk10005 2020-03-02 56.358177 0.0 0 3.0
pk10009 2021-05-03 55.585361 0.0 0 1.0
... ... ... ... ... ...
pk997 2020-12-14 1.720815 0.0 0 1.0
pk9978 2020-11-10 52.239568 0.0 0 1.0
pk9982 2023-05-12 5.753619 0.0 0 1.0
pk9989 2022-12-16 37.616017 0.0 0 2.0
pk9989 2022-12-14 7.415778 0.0 0 2.0

20591 rows × 5 columns

Produce some plots to show how the IgG response differs between age and risk groups

import matplotlib.pyplot as plt

# Every histogram uses the same binning and is normalised (density=True),
# so groups of different sizes can be compared by shape
hist_style = dict(bins=10,range=(0,150),density=True,histtype='step',lw=2)

fig,axs = plt.subplots(2)

# Top panel: IgG for everybody and split by age group
ax = axs[0]
df['IgG'].plot.hist(ax=ax,label='all',**hist_style)
df[df['Age']>50]['IgG'].plot.hist(ax=ax,label='Age > 50',**hist_style)
df[df['Age']<50]['IgG'].plot.hist(ax=ax,label='Age < 50',**hist_style)
ax.set_yscale('log')
ax.set_xlabel('IgG measurement')
ax.legend()

# Bottom panel: IgG split by the number of comorbidities (risks)
ax = axs[1]
df[df['nrisks']<1]['IgG'].plot.hist(ax=ax,label='nrisk=0',**hist_style)
df[df['nrisks']==1]['IgG'].plot.hist(ax=ax,label='nrisk=1',**hist_style)
df[df['nrisks']>1]['IgG'].plot.hist(ax=ax,label='nrisk>1',**hist_style)

ax.set_yscale('log')
ax.set_xlabel('IgG measurement')
ax.legend()
plt.show()
No description has been provided for this image