Skip to content

Common

Bases: OrderedDict, Logger

Class for formatting DestinationFields in the CommonDataModel

Inherits from an ordered dictionary, and maps datatypes to lambda functions. The lambda functions encode how to transform and format a pandas series given the datatype.

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
class DataFormatter(collections.OrderedDict,Logger):
    """
    Class for formatting DestinationFields in the CommonDataModel

    Inherits from an ordered dictionary, and maps datatypes to lambda functions.
    The lambda functions encode how to transform and format a pandas series
    given the datatype.
    """

    def check_formatting(self,series,function,nsample=50,tolerance=0.3):
        """
        Apply a formatting function to a sample of a series and check whether
        the data is already in the expected format.

        Args:
            series (pandas.Series) : input data series
            function (built-in function): formatting function to be applied
            nsample (int): number of rows to sample to make checks on (default = 50)
            tolerance (float): minimum fraction of well-formatted sampled rows;
                               at or below this an error is raised (default = 0.3)
        Returns:
           pandas.Series : the original series
        Raises:
           DataStandardError : if the fraction of well-formatted sampled rows
                               is at or below the tolerance
        """
        # get the number of rows of the dataframe
        n = len(series)
        nsample = nsample if n > nsample else n

        #sample the series
        series_slice = series.sample(nsample)
        #format the sample of the series
        series_slice_formatted = function(series_slice)

        #if it's just formatting of a number, just return the series if no error has been raised
        if series_slice_formatted.dtype == 'Float64':
            return series

        #if it's formatting of text i.e. date string
        #and the pre- and post-formatting of the series are equal
        #dont waste time formatting the entire series, just return it as it is
        series_slice_values = series_slice.dropna().astype(str).unique()
        series_slice_formatted_values = series_slice_formatted.dropna().astype(str).replace('', np.nan).dropna().unique()

        if np.array_equal(series_slice_values,series_slice_formatted_values):
            self.logger.debug(f'Sampling {nsample}/{n} values suggests the column '\
                              f'"{series.name}" is already formatted!!')
            return series
        else:
            a=np.array(series_slice.values,dtype=str)
            b=np.array(series_slice_formatted.values,dtype=str)

            are_equal = a==b
            ngood = are_equal.sum()
            fraction_good = round(ngood / nsample,2)

            #escalate to critical (and raise below) when too few rows format cleanly
            logger = self.logger.critical if fraction_good <= tolerance else self.logger.warning

            logger(f'Tested formatting {nsample} rows of {series.name}. The original data is not in the right format.')

            #show a side-by-side view of the rows that did not match
            df_bad = pd.concat([series_slice[~are_equal],series_slice_formatted[~are_equal]],axis=1)
            df_bad.columns = ['original','should be']

            self.logger.warning(f"\n {df_bad}")

            if logger == self.logger.critical:
                logger(f"Fraction of good columns = {fraction_good} ({ngood} / {nsample} ), is below the tolerance threshold={tolerance}")
                raise DataStandardError(f"{series.name} has not been formatted correctly")
            else:
                logger(f"Fraction of good columns ={fraction_good} ({ngood} / {nsample} ), is above the tolerance threshold={tolerance}")
                #bug fix: previously nothing was returned on this path, breaking
                #the documented contract that a series is always handed back
                return series


    def __init__(self,errors='coerce'):
        """
        Build the datatype -> formatter-function map.

        Args:
            errors (str): passed to the pandas converters; 'coerce' turns
                          unparseable values into NaN/NaT (default)
        """
        super().__init__()

        #numeric types use pandas nullable dtypes
        self['Integer'] = lambda x : pd.to_numeric(x,errors=errors).astype('Int64')
        self['Float']   = lambda x : pd.to_numeric(x,errors=errors).astype('Float64')
        #text types are coerced to str and truncated to their maximum length
        self['Text20']  = lambda x : x.fillna('').astype(str).apply(lambda x: x[:20])
        self['Text50']  = lambda x : x.fillna('').astype(str).apply(lambda x: x[:50])
        self['Text60']  = lambda x : x.fillna('').astype(str).apply(lambda x: x[:60])

        self['Timestamp'] = lambda x : pd.to_datetime(x,errors=errors)\
                                        .dt.strftime('%Y-%m-%d %H:%M:%S.%f')
        self['Date'] = lambda x : pd.to_datetime(x,errors=errors).dt.date

check_formatting(series, function, nsample=50, tolerance=0.3)

Apply a formatting function to a subset of a series Args: series (pandas.Series) : input data series function (built-in function): formatting function to be applied nsample (int): number of rows to sample to make checks on (default = 50) Returns: series : modified or original pandas.Series object

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def check_formatting(self,series,function,nsample=50,tolerance=0.3):
    """
    Apply a formatting function to a sample of a series and check whether
    the data is already in the expected format.

    Args:
        series (pandas.Series) : input data series
        function (built-in function): formatting function to be applied
        nsample (int): number of rows to sample to make checks on (default = 50)
        tolerance (float): minimum fraction of well-formatted sampled rows;
                           at or below this an error is raised (default = 0.3)
    Returns:
       pandas.Series : the original series
    Raises:
       DataStandardError : if the fraction of well-formatted sampled rows
                           is at or below the tolerance
    """
    # get the number of rows of the dataframe
    n = len(series)
    nsample = nsample if n > nsample else n

    #sample the series
    series_slice = series.sample(nsample)
    #format the sample of the series
    series_slice_formatted = function(series_slice)

    #if it's just formatting of a number, just return the series if no error has been raised
    if series_slice_formatted.dtype == 'Float64':
        return series

    #if it's formatting of text i.e. date string
    #and the pre- and post-formatting of the series are equal
    #dont waste time formatting the entire series, just return it as it is
    series_slice_values = series_slice.dropna().astype(str).unique()
    series_slice_formatted_values = series_slice_formatted.dropna().astype(str).replace('', np.nan).dropna().unique()

    if np.array_equal(series_slice_values,series_slice_formatted_values):
        self.logger.debug(f'Sampling {nsample}/{n} values suggests the column '\
                          f'"{series.name}" is already formatted!!')
        return series
    else:
        a=np.array(series_slice.values,dtype=str)
        b=np.array(series_slice_formatted.values,dtype=str)

        are_equal = a==b
        ngood = are_equal.sum()
        fraction_good = round(ngood / nsample,2)

        #escalate to critical (and raise below) when too few rows format cleanly
        logger = self.logger.critical if fraction_good <= tolerance else self.logger.warning

        logger(f'Tested formatting {nsample} rows of {series.name}. The original data is not in the right format.')

        #show a side-by-side view of the rows that did not match
        df_bad = pd.concat([series_slice[~are_equal],series_slice_formatted[~are_equal]],axis=1)
        df_bad.columns = ['original','should be']

        self.logger.warning(f"\n {df_bad}")

        if logger == self.logger.critical:
            logger(f"Fraction of good columns = {fraction_good} ({ngood} / {nsample} ), is below the tolerance threshold={tolerance}")
            raise DataStandardError(f"{series.name} has not been formatted correctly")
        else:
            logger(f"Fraction of good columns ={fraction_good} ({ngood} / {nsample} ), is above the tolerance threshold={tolerance}")
            #bug fix: previously nothing was returned on this path, breaking
            #the documented contract that a series is always handed back
            return series

Bases: object

CommonDataModel Table Destination Field.

Object for handling output columns (destination fields) in a Destination Table

Attributes:

Name Type Description
series Series

raw column data in the form of a series

dtype str

data type for how to format the column based on the DataFormatter

required bool

if the column is required or not, i.e. whether the row should be deleted when this value is not filled

pk str

primary key label, indicating if the column is the primary required field

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
class DestinationField(object):
    """
    CommonDataModel Table Destination Field.

    Wraps a single output column (destination field) belonging to a
    Destination Table.

    Attributes:
       series (pandas.Series): raw column data held as a series
       dtype (str): datatype key telling the DataFormatter how to format the column
       required (bool): whether the column must be filled,
                        i.e. whether a row missing this value should be removed
       pk (str): primary key label, marking the column as the primary required field

    """
    def __init__(self, dtype: str, required: bool, pk=False):
        #no data attached yet; a series is assigned to the field later
        self.series = None
        #record how the column should be formatted and constrained
        self.dtype = dtype
        self.pk = pk
        self.required = required

Bases: Logger

Common object that all CDM objects (tables) inherit from.

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
class DestinationTable(Logger):
    """
    Common object that all CDM objects (tables) inherit from.
    """

    @classmethod
    def from_df(cls,df,name=None):
        """
        Alternate constructor: create a table object and register an
        already-built dataframe with it.

        Args:
           df (pandas.DataFrame): dataframe to register
           name (str): optional name for the new table object
        Returns:
           DestinationTable: the newly created object
        """
        obj = cls(name)
        obj.__df = df
        return obj

    def __len__(self):
        #length of the table == number of rows in the registered dataframe
        return len(self.__df)

    def reset(self):
        """
        Drop the registered dataframe and reinitialise the metadata.
        """
        self.clear()
        self._meta.clear()
        self.__init_meta()

    def clear(self):
        """
        Forget the registered dataframe (it can be rebuilt via get_df).
        """
        self.__df = None

    def __init_meta(self):
        #metadata store, e.g. row counts before/after required-field cuts
        self._meta = {'required_fields':{}}

    def __init__(self,name,_type,_version='v5_3_1',format_level=1):
        """
        Initialise the CDM DestinationTable Object class
        Args:
           name (str): name of this table instance
           _type (str): the name of the object being initialised, e.g. "person"
           _version (str): the CDM version, see https://github.com/OHDSI/CommonDataModel/tags
           format_level (int): formatting strictness, converted to a FormatterLevel
        Returns:
           None
        """
        self.name = name
        self._type = _type
        self.__init_meta()

        self.dtypes = DataFormatter()
        self.format_level = FormatterLevel(format_level)
        self.fields = self.get_field_names()

        if len(self.fields) == 0:
            #bug fix: f-prefix was missing, so the table name never appeared in the message
            raise Exception(f"something misconfigured - cannot find any DataTypes for {self.name}")

        #print a check to see what cdm objects have been initialised
        self.logger.debug(self.get_destination_fields())
        self.__df = None

        #get the required fields
        self.required_fields = [
            field
            for field in self.get_field_names()
            if getattr(self,field).required == True
        ]

        self.automatically_fill_missing_columns = True
        self.tools = OperationTools()


    def get_field_names(self):
        """
        From the current object, loop over all member objects and find those that are instances
        of a DestinationField (column)

        Returns:
           list : a list of destination field (column) names

        """
        return [
            item
            for item in self.__dict__.keys()
            if isinstance(getattr(self,item),DestinationField)
        ]

    def get_field_dtypes(self):
        """
        From the current object, loop over all member objects and find those that are instances
        of a DestinationField (column)

        Returns:
           dict : a mapping of destination field (column) name to its dtype string

        """
        return {
            item:getattr(self,item).dtype
            for item in self.__dict__.keys()
            if isinstance(getattr(self,item),DestinationField)
        }

    def get_ordering(self):
        """
        Loops over all associated fields and finds which have been marked as being a primary key.

        Returns:
            list: a string list of the names of primary columns (fields)
        """
        return [
            field
            for field in self.fields
            if getattr(self,field).pk == True
        ]

    def __getitem__(self, key):
        """
        Retrieve a field (column) from the table (dataframe)

        Args:
           key (str) : name of a destination field
        Returns:
           DestinationField : the destination field object
        """
        return getattr(self, key)

    def __setitem__(self, key, obj):
        """
        Register a field object with the table
        """
        return setattr(self, key, obj)

    def set_name(self,name):
        """
        Register/Set the name of the destination table and keep the
        logger name in sync.
        """
        self.name = name
        self.logger.name = self.name

    def define(self,_):
        """
        define function, expected to be overloaded by the user defining the object
        """
        pass

    def get_destination_fields(self):
        """
        Get a list of all the destination fields that have been
        loaded and associated to this cdm object

        Returns:
           list: a list of all the destination fields that have been defined
        """
        return list(self.fields)

    def update(self,that):
        """
        Copy every attribute (except the logger) from another table object
        onto this one.
        """
        #extract all objects from the passed object
        objs = {k:v for k,v in that.__dict__.items() if k!='logger' }
        #add objects to this class
        self.__dict__.update(objs)


    def set_df(self,df):
        """
        Register an externally built dataframe with this table.
        """
        self.__df = df

    def get_df(self,force_rebuild=False,dont_build=False,dropna=False,**kwargs):
        """
        Retrieve a dataframe from the current object

        Args:
           force_rebuild (bool): rebuild the dataframe even if one is cached
           dont_build (bool): return the cached (or an empty) dataframe without building
           dropna (bool): drop all-null columns from the returned dataframe
           **kwargs: forwarded to finalise()
        Returns:
           pandas.Dataframe: extracted dataframe of the cdm object
        """

        if self.__df is not None:
            self.logger.debug(f"df({hex(id(self.__df))}) already exists")

        if dont_build:
            if self.__df is None:
                self.__df = pd.DataFrame(columns = self.fields)
                self.set_df_name()
            return self.__df

        #if the dataframe has already been built.. just return it
        if self.__df is not None and not force_rebuild:
            self.logger.debug('already got a dataframe, so returning the existing one')
            if dropna:
                return self.__df.dropna(axis=1)
            else:
                return self.__df

        self.define(self)

        #get a dict of all series
        #each object is a pandas series
        dfs = {}

        for field in self.fields:
            obj = getattr(self,field)
            series = obj.series
            if series is None:
                #undefined fields are filled with nulls further down
                continue

            #rename the column to be the final destination field name
            series = series.rename(field)
            #register the new series
            dfs[field] = series
            self.logger.debug(f'Adding series to dataframe from field "{field}"')

        #if there's none defined, dont do anything
        if len(dfs) == 0:
            self.logger.warning("no objects defined")
            self.__df = pd.DataFrame(columns = self.fields)
            self.set_df_name()
            return self.__df

        #check the lengths of the dataframes
        lengths = list(set([len(df) for df in dfs.values()]))
        if len(lengths)>1:
            self.logger.error("One or more inputs being mapped to this object has a different number of entries")
            for name,df in dfs.items():
                self.logger.error(f"{name} of length {len(df)}")
            raise BadInputs("Differring number of rows in the inputs")

        #create a dataframe from all the series objects
        df = pd.concat(dfs.values(),axis=1)

        #find which fields in the cdm havent been defined
        missing_fields = set(self.fields) - set(df.columns)

        #set these to a nan/null series
        #(np.nan rather than np.NaN, which was removed in NumPy 2.0)
        for field in missing_fields:
            df[field] = np.nan

        #simply order the columns
        df = df[self.fields]

        df = self.finalise(df,**kwargs)
        df = self.format(df)

        if dropna:
            df = df.dropna(axis=1)

        #register the df
        self.__df = df
        self.set_df_name()

        self.logger.info(f"created df ({hex(id(df))})[{self.get_df_name()}]")
        return self.__df


    def get_df_name(self):
        """
        Name registered on the current dataframe, or None if no dataframe exists.
        """
        if self.__df is not None:
            return self.__df.attrs['name']

    def set_df_name(self):
        """
        Store a sanitised version of the table name on the dataframe's attrs.
        """
        if self.__df is None:
            return
        #replace anything non-alphanumeric so the name is safe for file/table use
        name = re.sub("[^0-9a-zA-Z]+","_",self.name)
        self.__df.attrs['name'] = name

    def format(self,df):
        """
        Format (or just check) every column of the dataframe according to its
        declared dtype and the configured FormatterLevel.

        Args:
            df (pandas.Dataframe): input dataframe
        Returns:
            pandas.Dataframe: the (possibly formatted) dataframe
        """
        if self.format_level is FormatterLevel.OFF:
            self.logger.info('Not formatting data columns')
            return df
        elif self.format_level is FormatterLevel.ON:
            self.logger.info("Automatically formatting data columns.")
        elif self.format_level is FormatterLevel.CHECK:
            self.logger.info("Performing checks on data formatting.")


        for col in self.fields:
            #if is already all na/nan, dont bother trying to format
            is_nan_already = df[col].head(100).isna().all()
            if is_nan_already:
                continue

            obj = getattr(self,col)

            #dont try any formatting for primary keys that need to be integers
            if obj.pk == True or col == 'person_id':
                continue

            dtype = obj.dtype
            formatter_function = self.dtypes[dtype]

            nbefore = len(df[col])
            if nbefore == 0:
                #bug fix: previously referenced the undefined name 'cols' (NameError)
                self.logger.warning(f"trying to format an empty column ({col})")

            #keep a small sample for diagnostics in case the formatting fails
            nsample = 5 if nbefore > 5 else nbefore
            sample = df[col].sample(nsample)

            if self.format_level is FormatterLevel.ON:
                self.logger.debug(f"Formatting {col}")
                try:
                    df[col] = formatter_function(df[col])
                except Exception as e:
                    self.logger.critical(e)
                    if 'source_files' in self._meta:
                        self.logger.error("This is coming from the source file (table & column) ...")
                        self.logger.error(self._meta['source_files'][col])
                    raise(e)

                if col in self.required_fields:
                    df = df[~df[col].isna()]
                    #count the number of rows after
                    nafter = len(df)
                    #setdefault protects against a required field that finalise
                    #did not record (it skips the first column)
                    self._meta['required_fields'].setdefault(col,{})['after_formatting'] = nafter

                    if nafter == 0 :
                        self.logger.error(f"Something wrong with the formatting of the required field {col} using {dtype}")
                        self.logger.info(f"Formatting resulted in all NaN values. Sample of this column before formatting:")
                        self.logger.error(sample)
                        if 'source_files' in self._meta:
                            self.logger.error("This is coming from the source file (table & column) ...")
                            self.logger.error(self._meta['source_files'][col])
                        raise FormattingError(f"When formatting the required column {col}, using the formatter function {dtype}, all produced values are  NaN/null values.")
                    else:
                        #bug fix: rows can only be removed, so removed = before - after
                        #(the original nafter - nbefore was never positive)
                        ndiff = nbefore - nafter
                        if ndiff > 0:
                            self.logger.warning(f"Formatting of values in {col} removed {ndiff} rows, leaving {nafter} rows.")

            elif self.format_level is FormatterLevel.CHECK:
                self.logger.debug(f"Checking formatting of {col} to {dtype}")
                try:
                    _ = self.dtypes.check_formatting(df[col],formatter_function)
                except Exception as e:
                    if 'source_files' in self._meta:
                        self.logger.error("This is coming from the source file (table & column) ...")
                        self.logger.error(self._meta['source_files'][col])
                    raise(e)

        return df

    def finalise(self,df,start_index=1,**kwargs):
        """
        Finalise a dataframe by dropping null/nan rows if a required field is missing.
        also sort the dataframe by the primary key of the table.

        Args:
            df (pandas.Dataframe): input dataframe
            start_index (int): first value used when auto-generating the
                               primary-key column (default = 1)
        Returns:
            pandas.Dataframe: cleaned output dataframe
        """

        #loop over the non-index fields
        for field in df.columns[1:]:
            #if it's not required, skip
            if field not in self.required_fields:
                continue

            #count the number of rows before
            nbefore = len(df)
            #remove rows which do not have this required field filled
            df = df[~df[field].isna()]

            #count the number of rows after
            nafter = len(df)
            #get the number of rows removed
            ndiff = nbefore - nafter
            #if rows have been removed
            if ndiff>0:
                #log a warning message if after requiring non-NaN values has removed all rows
                log = self.logger.warning if nafter > 0 else self.logger.error
                log(f"Requiring non-null values in {field} removed {ndiff} rows, leaving {nafter} rows.")

            #log some metadata
            self._meta['required_fields'][field] = {
                'before':nbefore,
                'after':nafter
            }

        #now index properly: the first column is treated as the primary key
        #and auto-filled with a sequential index if it was left empty
        primary_column = df.columns[0]
        if primary_column != 'person_id':
            if df[primary_column].head(100).isnull().all():
                df[primary_column] = df.reset_index().index + start_index

        return df

__getitem__(key)

Retrieve a field (column) from the table (dataframe)

Parameters:

Name Type Description Default
key str)

name of a destination field

required

Returns: DestinationField : the destination field object

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
249
250
251
252
253
254
255
256
257
258
259
def __getitem__(self, key):
    """
    Look up a destination field (column) on the table by name.

    Args:
       key (str) : name of a destination field
    Returns:
       DestinationField : the destination field object
    """
    field = getattr(self, key)
    return field

__init__(name, _type, _version='v5_3_1', format_level=1)

Initialise the CDM DestinationTable Object class Args: _type (str): the name of the object being initialised, e.g. "person" _version (str): the CDM version, see https://github.com/OHDSI/CommonDataModel/tags Returns: None

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
def __init__(self,name,_type,_version='v5_3_1',format_level=1):
    """
    Initialise the CDM DestinationTable Object class
    Args:
       name (str): name of this table instance
       _type (str): the name of the object being initialised, e.g. "person"
       _version (str): the CDM version, see https://github.com/OHDSI/CommonDataModel/tags
       format_level (int): formatting strictness, converted to a FormatterLevel
    Returns: 
       None
    """
    self.name = name
    self._type = _type
    self.__init_meta()

    #map of dtype name -> formatter function, and how strictly to apply it
    self.dtypes = DataFormatter()
    self.format_level = FormatterLevel(format_level)
    self.fields = self.get_field_names()
    #self.do_formatting = not format_level is None

    if len(self.fields) == 0:
        #NOTE(review): message is missing an f-prefix, so {self.name} is never interpolated
        raise Exception("something misconfigured - cannot find any DataTypes for {self.name}")

    #print a check to see what cdm objects have been initialised
    self.logger.debug(self.get_destination_fields())
    self.__df = None

    #get the required fields
    self.required_fields = [
        field
        for field in self.get_field_names()
        if getattr(self,field).required == True
    ]

    self.automatically_fill_missing_columns = True
    self.tools = OperationTools()

__setitem__(key, obj)

Register a field object with the table

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
261
262
263
264
265
def __setitem__(self, key, obj):
    """
    Register a field object with the table under the given name.
    """
    setattr(self, key, obj)

define(_)

define function, expected to be overloaded by the user defining the object

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
277
278
279
280
281
def define(self,_):
    """
    Hook intended to be overloaded by the user when defining the object;
    the default implementation does nothing.
    """
    return None

finalise(df, start_index=1, **kwargs)

Finalise a dataframe by dropping null/nan rows if a required field is missing. also sort the dataframe by the primary key of the table.

Parameters:

Name Type Description Default
df Dataframe

input dataframe

required

Returns: pandas.Dataframe: cleaned output dataframe

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
def finalise(self,df,start_index=1,**kwargs):
    """
    Finalise a dataframe by dropping null/nan rows if a required field is missing.
    also sort the dataframe by the primary key of the table.

    Args:
        df (pandas.Dataframe): input dataframe
        start_index (int): first value used when auto-generating the primary-key
                           column (default = 1)
    Returns:
        pandas.Dataframe: cleaned output dataframe
    """

    #loop over the non-index fields
    for field in df.columns[1:]:
        #if it's not required, skip
        if field not in self.required_fields:
            continue

        #count the number of rows before
        nbefore = len(df)
        #remove rows which do not have this required field filled
        df = df[~df[field].isna()]

        #count the number of rows after
        nafter = len(df)
        #get the number of rows removed
        ndiff = nbefore - nafter
        #if rows have been removed
        if ndiff>0:
            #log a warning message if after requiring non-NaN values has removed all rows
            log = self.logger.warning if nafter > 0 else self.logger.error
            log(f"Requiring non-null values in {field} removed {ndiff} rows, leaving {nafter} rows.")

        #log some metadata
        self._meta['required_fields'][field] = {
            'before':nbefore,
            'after':nafter
        }

    #now index properly
    #the first column is treated as the primary key; if it was left entirely
    #empty, fill it with a sequential index starting at start_index
    primary_column = df.columns[0]
    if primary_column != 'person_id':
        if df[primary_column].head(100).isnull().all():
            df[primary_column] = df.reset_index().index + start_index

    #return the dataframe sorted by the primary key requested
    #ordering = self.get_ordering()
    #if len(ordering) > 0:
    #    df = df.sort_values(self.get_ordering())
    return df

get_destination_fields()

Get a list of all the destination fields that have been loaded and associated to this cdm object

Returns:

Name Type Description
list

a list of all the destination fields that have been defined

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
283
284
285
286
287
288
289
290
291
292
def get_destination_fields(self):
    """
    List every destination field (column) currently associated with
    this cdm object.

    Returns:
       list: names of all the destination fields that have been defined
    """
    return [field for field in self.fields]

get_df(force_rebuild=False, dont_build=False, dropna=False, **kwargs)

Retrieve a dataframe from the current object

Returns:

Type Description

pandas.Dataframe: extracted dataframe of the cdm object

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
def get_df(self,force_rebuild=False,dont_build=False,dropna=False,**kwargs):
    """
    Retrieve a dataframe from the current object

    Args:
       force_rebuild (bool): rebuild the dataframe even if one is already cached
       dont_build (bool): return the cached (or an empty) dataframe without building
       dropna (bool): drop all-null columns from the returned dataframe
       **kwargs: forwarded to finalise()
    Returns:
       pandas.Dataframe: extracted dataframe of the cdm object
    """

    if not self.__df is None:
        self.logger.debug(f"df({hex(id(self.__df))}) already exists")

    if dont_build:
        if self.__df is None:
            self.__df = pd.DataFrame(columns = self.fields)
            self.set_df_name()
        return self.__df

    #if the dataframe has already been built.. just return it
    if not self.__df is None and not force_rebuild:
        self.logger.debug('already got a dataframe, so returning the existing one')
        if dropna:
            return self.__df.dropna(axis=1)
        else:
            return self.__df

    self.define(self)

    #get a dict of all series
    #each object is a pandas series
    dfs = {}

    for field in self.fields:
        obj = getattr(self,field)
        series = obj.series
        if series is None:
            #fields with no series are filled with nulls further down
            #if required:
            #    self.logger.error(f"{field} is all null/none or has not been set/defined")
            #    raise RequiredFieldIsNone(f"{field} is a required for {self.name}.")
            continue

        #rename the column to be the final destination field name
        series = series.rename(field)
        #register the new series
        dfs[field] = series
        self.logger.debug(f'Adding series to dataframe from field "{field}"')

    #if there's none defined, dont do anything
    if len(dfs) == 0:
        self.logger.warning("no objects defined")
        self.__df = pd.DataFrame(columns = self.fields)
        self.set_df_name()
        return self.__df

    #check the lengths of the dataframes
    lengths = list(set([len(df) for df in dfs.values()]))
    if len(lengths)>1:
        self.logger.error("One or more inputs being mapped to this object has a different number of entries")
        for name,df in dfs.items():
            self.logger.error(f"{name} of length {len(df)}")
        raise BadInputs("Differring number of rows in the inputs")

    #create a dataframe from all the series objects
    df = pd.concat(dfs.values(),axis=1)

    #find which fields in the cdm havent been defined
    missing_fields = set(self.fields) - set(df.columns)

    #self._meta['defined_columns'] = df.columns.tolist()
    #self._meta['undefined_columns'] = list(missing_fields)

    #set these to a nan/null series
    #NOTE(review): np.NaN was removed in NumPy 2.0 — np.nan is the portable spelling
    for field in missing_fields:
        df[field] = np.NaN

    #simply order the columns 
    df = df[self.fields]

    df = self.finalise(df,**kwargs)
    df = self.format(df)

    if dropna:
        df = df.dropna(axis=1)

    #register the df
    self.__df = df
    self.set_df_name()

    self.logger.info(f"created df ({hex(id(df))})[{self.get_df_name()}]")
    return self.__df

get_field_dtypes()

From the current object, loop over all member objects and find those that are instances of a DestinationField (column)

Returns:

Name Type Description
list

a list of destination fields (columns [series])

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
219
220
221
222
223
224
225
226
227
228
229
230
231
232
def get_field_dtypes(self):
    """
    From the current object, loop over all member objects and find those that are instances
    of a DestinationField (column)

    Returns:
       dict : a mapping of destination field (column) name to its dtype string

    """
    return {
        item:getattr(self,item).dtype
        for item in self.__dict__.keys()
        if isinstance(getattr(self,item),DestinationField)
    }

get_field_names()

From the current object, loop over all member objects and find those that are instances of a DestinationField (column)

Returns:

Name Type Description
list

a list of destination fields (columns [series])

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
def get_field_names(self):
    """
    Scan the object's instance attributes and collect the names of
    those that are DestinationField instances (i.e. the table's columns).

    Returns:
       list : names of the destination fields (columns)

    """
    names = []
    for attr in self.__dict__.keys():
        if isinstance(getattr(self, attr), DestinationField):
            names.append(attr)
    return names

get_ordering()

Loops over all associated fields and finds which have been marked as being a primary key.

Returns:

Name Type Description
list

a string list of the names of primary columns (fields)

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
234
235
236
237
238
239
240
241
242
243
244
245
246
247
def get_ordering(self):
    """
    Find which of the associated fields have been marked as primary keys.

    Returns:
        list: a string list of the names of primary columns (fields)
    """
    return [
        name
        for name in self.fields
        if getattr(self, name).pk == True
    ]

set_name(name)

Register/Set the name of the destination table

Source code in docs/CaRROT-CDM/source_code/carrot/cdm/objects/common.py
270
271
272
273
274
275
def set_name(self,name):
    """
    Register/Set the name of the destination table, keeping the
    logger's name in sync with it.
    """
    self.name = name
    self.logger.name = name