File Helpers

Source code in docs/CaRROT-CDM/source_code/carrot/tools/file_helpers.py
def load_csv(_map,chunksize=None,
             dtype=str,nrows=None,
             lower_col_names=False,
             load_path="",
             rules=None,
             sep=',',
             na_values=['']):

    logger = Logger("carrot.tools.load_csv")

    #normalise the input into a {name:path} mapping:
    #lists are keyed by file basename, tuples and strings by themselves
    if isinstance(_map,list):
        _map = {
            os.path.basename(x):x
            for x in _map
        }
    elif isinstance(_map,tuple):
        _map = { x:x for x in _map }
    elif isinstance(_map,str):
        _map = { _map:_map }

    if rules is not None:
        logger.debug("rules .json file supplied")
        if not isinstance(rules,dict):
            rules = load_json(rules)

        inputs_from_cli = list(_map.keys())

        source_map = get_mapped_fields_from_rules(rules)
        inputs_from_json = list(source_map.keys())

        if len(inputs_from_cli) == 0:
            raise MissingInputFiles("You haven't loaded any input files!")

        logger.debug(f"{len(inputs_from_cli)} input files loaded")
        logger.debug(f"{inputs_from_cli}")

        missing_inputs = list(set(inputs_from_json) - set(inputs_from_cli))
        if len(missing_inputs) > 0:
            raise MissingInputFiles(f"Found the following files {missing_inputs} in the json rules that are not in the loaded file list... {inputs_from_cli}")

        #reduce the mapping of inputs, if we don't need them all
        _map = {
            k: {
                'file':v,
                'fields':source_map[k]
            }
            for k,v in _map.items()
            if k in source_map
        }


    #if nrows is requested without a chunksize, read it as a single chunk
    if nrows is not None:
        chunksize = nrows if chunksize is None else chunksize

    retval = io.LocalDataCollection(chunksize=chunksize)

    for key,obj in _map.items():
        fields = None
        if isinstance(obj,str):
            fname = obj
        else:
            fname = obj['file']
            fields = obj['fields']

        df = pd.read_csv(load_path+fname,
                         chunksize=chunksize,
                         nrows=nrows,
                         sep=sep,
                         keep_default_na=False,
                         na_values=na_values,
                         dtype=dtype,
                         usecols=fields)

        df.attrs = {'original_file':load_path+fname}

        #TODO: this should be removed
        #(only possible when a full DataFrame is returned, i.e. when
        # chunksize is None; otherwise read_csv returns an iterator)
        if isinstance(df,pd.DataFrame):
            if lower_col_names:
                df.columns = df.columns.str.lower()

        retval[key] = io.DataBrick(df,name=key)

    return retval
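
A rough usage sketch (the file names and rules path are hypothetical, and the import assumes these helpers are exposed via carrot.tools, as the logger name suggests):

from carrot.tools import load_csv

#load two input files in chunks of 1000 rows, reading only the
#columns that the rules file actually maps
data = load_csv(
    ['data/demographics.csv','data/observations.csv'],
    rules='rules.json',
    chunksize=1000
)

Because list inputs are keyed by basename, the returned collection can be indexed as data['demographics.csv'].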
Source code in docs/CaRROT-CDM/source_code/carrot/tools/file_helpers.py
def load_json(f_in):

    if os.path.exists(f_in):
        #f_in is a path to a file on disk
        with open(f_in) as fp:
            data = json.load(fp)
    else:
        #otherwise, try to parse the string itself as json
        try:
            data = json.loads(f_in)
        except Exception as err:
            raise FileNotFoundError(f"{f_in} not found, and cannot be parsed as json") from err

    return data
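
Note that load_json accepts either a path to a .json file or a raw json string, for example:

rules = load_json("rules.json")    #from a file on disk
rules = load_json('{"cdm": {}}')   #or parsed directly from a string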
Source code in docs/CaRROT-CDM/source_code/carrot/tools/file_helpers.py
def get_file_map_from_dir(_dir,ext='.csv'):
    #if the given path is not a directory, fall back to the
    #data directory shipped alongside the module
    if not os.path.isdir(_dir):
        _dir = os.path.abspath(
            os.path.join(
                os.path.dirname(__file__),'..','data',_dir)
        )

    #map each matching file's basename to its full path
    _map = {}
    for fname in glob.glob(f"{_dir}{os.path.sep}*{ext}"):
        key = os.path.basename(fname)
        _map[key] = fname

    return _map
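
For example, with a hypothetical directory data/ containing two csv files:

get_file_map_from_dir('data')
#--> {'demographics.csv': 'data/demographics.csv',
#     'observations.csv': 'data/observations.csv'}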
Source code in docs/CaRROT-CDM/source_code/carrot/tools/file_helpers.py
def get_mapped_fields_from_rules(rules):
    #extract a tuple of source tables and source fields
    sources = [
        (x['source_table'],x['source_field'])
        for cdm_obj_set in rules['cdm'].values()
        for cdm_obj in cdm_obj_set.values()
        for x in cdm_obj.values()
    ]

    #group the source fields by their source table
    source_map = {}
    for (table,field) in sources:
        if table not in source_map:
            source_map[table] = []
        source_map[table].append(field)

    #remove any duplicate field names
    source_map = {
        k:list(set(v))
        for k,v in source_map.items()
    }

    return source_map
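
The rules layout this expects is cdm --> destination table --> cdm object --> destination field, where each destination field records a source_table and source_field. A minimal, hypothetical example:

rules = {
    'cdm': {
        'person': {
            'person_0': {
                'birth_datetime': {
                    'source_table': 'demographics.csv',
                    'source_field': 'date_of_birth'
                }
            }
        }
    }
}
get_mapped_fields_from_rules(rules)
#--> {'demographics.csv': ['date_of_birth']}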
Source code in docs/CaRROT-CDM/source_code/carrot/tools/file_helpers.py
def get_separator_from_filename(fname):
    #infer the delimiter from the file extension:
    #tab-separated for .tsv, comma-separated otherwise
    _, file_extension = os.path.splitext(fname)
    if file_extension == '.tsv':
        return '\t'
    else:
        return ','
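
For example:

get_separator_from_filename('data.tsv')   #--> '\t'
get_separator_from_filename('data.csv')   #--> ','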
Source code in docs/CaRROT-CDM/source_code/carrot/tools/file_helpers.py
def diff_csv(file1,file2,separator=None,nrows=None):
    logger = Logger("CSV File Diff")

    #if no separator is given, infer one from each file's extension
    if separator is None:
        sep1 = get_separator_from_filename(file1)
        sep2 = get_separator_from_filename(file2)
    else:
        sep1 = separator
        sep2 = separator

    df1 = pd.read_csv(file1,sep=sep1,nrows=nrows)
    df2 = pd.read_csv(file2,sep=sep2,nrows=nrows)

    exact_match = df1.equals(df2)
    if exact_match:
        return

    #concatenate the two frames and drop anything present in both,
    #leaving only the rows that differ between the files
    df = pd.concat([df1,df2]).drop_duplicates(keep=False)

    if len(df) > 0:
        logger.error(" ======== Differing Rows ========== ")
        logger.error(df)
        m = df1.merge(df2, on=df.columns[0], how='outer', suffixes=['', '_'], indicator=True)[['_merge']]
        m = m[~m['_merge'].str.contains('both')]
        file1 = os.path.basename(file1)
        file2 = os.path.basename(file2)

        m['_merge'] = m['_merge'].map({'left_only':file1,'right_only':file2})
        m = m.rename(columns={'_merge':'Only Contained Within'})
        m.index.name = 'Row Number'
        logger.error(m.reset_index().to_dict(orient='records'))
        raise DifferingRows("Something is not right with the rows; changes were detected.")

    elif len(df1.columns) != len(df2.columns):
        raise DifferingColumns('in df1 but not df2',list(set(df1.columns) - set(df2.columns)),'\n',
                               'in df2 but not df1',list(set(df2.columns) - set(df1.columns)))

    else:
        logger.error(" ======= Rows are likely in a different order ====== ")
        for i in range(len(df1)):
            #a row has moved if it does not fully match the row at the
            #same position in the other frame
            if not (df1.iloc[i] == df2.iloc[i]).all():
                logger.error(f"Row {i} is in a different location")
        raise Exception("differences detected")
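
A usage sketch (file names hypothetical): diff_csv returns silently when the files match and raises otherwise, so it sits naturally in a try block:

from carrot.tools import diff_csv

try:
    diff_csv('outputs/person.tsv','expected/person.tsv')
except Exception as err:
    #DifferingRows / DifferingColumns (exact import path not shown
    #in this listing) or a plain Exception for reordered rows
    print(f"files differ: {err}")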