Join base files by year

In this notebook the information contained in the previously cleaned CSV files is reformatted and written to per-year folders. Each row of these CSVs holds one author-affiliation entry of a publication. For each CSV the rows are grouped by year and publication, and one line per publication is written containing the year, publication ID, title, language, journal ID, journal name, the authors and affiliations (each joined by semicolons), and a flag indicating whether any MPG affiliation is contained.
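Each line in the resulting year files thus has the following tab-separated layout (shown schematically, with TAB marking the separator):

year TAB pubId TAB title TAB language TAB journalId TAB journalName TAB has_MPG TAB authors (';'-joined) TAB affiliations (';'-joined)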

Imports

import pandas as pd
import datetime
import os
import re
import random
import numpy as np
from tqdm import tqdm
from functools import partial
from multiprocessing import Pool, cpu_count

Setup paths

The root path where all SciGraph data can be read and written.

dataPath = '/media/arbeit/b88b17b8-b5d7-4c4e-86bc-ff4b67458842/sciGraph/processedData/'
rootPath = dataPath + "extendedArticles/"
yearBasePath = dataPath + 'yeardata/'

The main routine processes a single CSV file, so the work can be parallelized over as many CPUs as are locally available. We therefore need a list of the CSV files.

csvList = [x for x in os.listdir(rootPath) if x.startswith('articles') and x.endswith('.csv')]
csvList[0]
'articles_3201.csv'
df = pd.read_csv(rootPath + csvList[120])
df.head(1)
is_MPG              False
journalId           sg:journal.1017409
journalName         Clinical Pharmacology & Therapeutics
affName             ['University of Alberta Alberta Research Centr...
affId               https://www.grid.ac/institutes/grid.17089.37
alternateName       University of Alberta
autId               sg:person.0764071732.05
familyName          Hartling
givenName           L
datePublishedReg    2011-11-01
pubId               sg:pub.10.1038/clpt.2011.212
inLanguage          ['en']
title               StaR Child Health: Developing Evidence‐Based G...

Define routine

def createLinks(file, pubIDkey='pubId', journalIDKey='journalId', journalNameKey='journalName', debug=False):
    """Split one cleaned CSV into per-year files with one tab-separated line per publication."""
    try:
        filename = str(file)
        df = pd.read_csv(rootPath + file)
        df = df.fillna('')
        # Derive the publication year from the date string, e.g. '2011-11-01' -> '2011'.
        year = df['datePublishedReg'].apply(lambda row: row.split('-')[0])
        df.insert(0, 'year', year)
        for y, g1 in df.groupby('year'):
            if y:
                os.makedirs(yearBasePath + y, exist_ok=True)
                # 'outFile' avoids shadowing the 'file' parameter.
                with open(yearBasePath + y + '/' + filename.split('.')[0] + '_{0}.csv'.format(y), 'w') as outFile:
                    for pub, g2 in g1.groupby(pubIDkey):
                        if debug:
                            print(pub)
                        journalID = g2[journalIDKey].unique()[0]
                        pubName = g2[journalNameKey].unique()[0]
                        lang = g2.inLanguage.unique()[0]
                        title = g2.title.unique()[0]
                        # is_MPG values may be booleans or strings, so check for both.
                        if any(x in g2.is_MPG.value_counts().to_dict().keys() for x in ('True', True)):
                            has_MPG = 'True'
                        else:
                            has_MPG = 'False'
                        res = []
                        resAff = []
                        for _, row in g2.iterrows():
                            # Prefer the author ID, fall back to 'familyName_givenName'.
                            if row['autId']:
                                res.append(row['autId'])
                            elif row['familyName']:
                                res.append(row['familyName'] + '_' + row['givenName'])
                            # Prefer the affiliation ID, fall back to the affiliation name.
                            if row['affId']:
                                resAff.append(row['affId'])
                            elif row['affName']:
                                resAff.append(row['affName'])
                        outFile.write(
                            '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\n'.format(
                                y, 
                                pub, 
                                re.sub('\t', '', title), 
                                lang, 
                                journalID, 
                                pubName, 
                                has_MPG, 
                                re.sub('\t', '', ';'.join(res)),
                                ';'.join(resAff)
                            )
                        )
            else:
                # Rows without a publication date go to the 'noYear' folder. Note that
                # here the fifth column repeats the publication ID in place of the
                # journal ID (visible in the dfMissing output below).
                os.makedirs(yearBasePath + 'noYear', exist_ok=True)
                with open(yearBasePath + 'noYear/' + filename.split('.')[0] + '_{0}.csv'.format(y), 'w') as outFile:
                    for pub, g2 in g1.groupby(pubIDkey):
                        if debug:
                            print(pub)
                        publicationID = g2[pubIDkey].unique()[0]
                        pubName = g2[journalNameKey].unique()[0]
                        lang = g2.inLanguage.unique()[0]
                        title = g2.title.unique()[0]
                        if any(x in g2.is_MPG.value_counts().to_dict().keys() for x in ('True', True)):
                            has_MPG = 'True'
                        else:
                            has_MPG = 'False'
                        res = []
                        resAff = []
                        for _, row in g2.iterrows():
                            if row['autId']:
                                res.append(row['autId'])
                            elif row['familyName']:
                                res.append(row['familyName'] + '_' + row['givenName'])
                            if row['affId']:
                                resAff.append(row['affId'])
                            elif row['affName']:
                                resAff.append(row['affName'])
                        outFile.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\n'.format(y, pub, re.sub('\t', '', title), lang, publicationID, pubName, has_MPG, ';'.join(res), ';'.join(resAff)))
    
    except Exception:
        # 'result' is a module-level list; it is only visible when running in a
        # single process (each Pool worker appends to its own copy).
        result.append(f'Failed for {filename}')
        raise 
    result.append(f'Done for {filename}')
    return

Testing with a single file

result = []
createLinks(csvList[120])
result
['Done for articles_5877.csv']

Run on all files

ncore = cpu_count() - 2
result = []


with Pool(ncore) as p:
    max_ = len(csvList)
    with tqdm(total=max_) as pbar:
        for i, _ in enumerate(p.imap_unordered(createLinks, csvList)):
            pbar.update()
100%|██████████| 10645/10645 [58:00<00:00,  3.06it/s] 
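
Note that the module-level result list is only filled during the single-process test above; each Pool worker appends to its own copy, so the workers' messages are lost here. A minimal sketch of how the statuses could be collected instead, assuming createLinks were changed to return its message rather than appending to result:

statuses = []
with Pool(ncore) as p:
    with tqdm(total=len(csvList)) as pbar:
        for status in p.imap_unordered(createLinks, csvList):
            statuses.append(status)  # e.g. 'Done for articles_3201.csv'
            pbar.update()
# inspect any failures after the pool has finished
failed = [s for s in statuses if s and s.startswith('Failed')]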

Join year files

For each year folder, concatenate the contained files into a single year file in the main yearBasePath folder.

for year in os.listdir(yearBasePath):
    yearPath = yearBasePath + year + '/' 
    if os.path.isdir(yearPath):
        csvList = os.listdir(yearPath)
        with open(yearBasePath + f'{year}.csv','a') as outFile:
            for file in csvList:
                with open(yearPath + file, 'r') as inFile:
                    lines = inFile.readlines()
                    outFile.writelines(lines)
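
Because the combined year files are opened in append mode, re-running this cell duplicates every line. A simple safeguard (a sketch; it assumes the combined files can always be rebuilt from the year folders) is to delete any previously combined files first:

# remove previously combined top-level year CSVs before re-running the join
for f in os.listdir(yearBasePath):
    if f.endswith('.csv') and os.path.isfile(yearBasePath + f):
        os.remove(yearBasePath + f)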

Missing publication year data

For a small number of publications the metadata contains no publication date. These 45 publications could be added to the database by hand (a sketch of such a patch follows the output below).

resDF = []
for file in os.listdir(yearBasePath + 'noYear/'):
    if os.path.isfile(yearBasePath + 'noYear/' + file):
        if file.startswith('articles'):
            resDF.append(pd.read_csv(yearBasePath + 'noYear/' + file, sep='\t',header=None))
dfMissing = pd.concat(resDF)
dfMissing.shape
(45, 9)
dfMissing.head(2)
row 0:
  0 (year)                         NaN
  1 (pubId)                        sg:pub.10.1007/0-387-23852-2_10
  2 (title)                        IT-Supported Modeling, Analysis and Design of ...
  3 (language)                     ['en']
  4 (pubId, in place of journalId) sg:pub.10.1007/0-387-23852-2_10
  5 (journalName)                  NaN
  6 (has_MPG)                      False
  7 (authors)                      sg:person.011160054337.99;sg:person.0110715537...
  8 (affiliations)                 NaN

row 1:
  0 (year)                         NaN
  1 (pubId)                        sg:pub.10.1007/7631_2015_7
  2 (title)                        Uterine Cancer: Pathology
  3 (language)                     ['en']
  4 (pubId, in place of journalId) sg:pub.10.1007/7631_2015_7
  5 (journalName)                  NaN
  6 (has_MPG)                      False
  7 (authors)                      sg:person.01354507572.27;sg:person.01066764345.44
  8 (affiliations)                 ['Department of Pathology, Memorial Sloan Kett...
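
As mentioned above, the missing years could be patched by hand. A sketch of how such a patch might look, assuming a hand-curated mapping from publication ID to year (the mapping below is an empty placeholder, to be filled in manually):

# hand-curated mapping from publication ID to year; to be filled in manually
manualYears = {
    # 'sg:pub.10.1007/...': '2015',
}

for _, row in dfMissing.iterrows():
    y = manualYears.get(row.iloc[1])  # column 1 holds the publication ID
    if y:
        # prepend the year and append the record to the matching combined year file
        fields = [y] + ['' if pd.isna(v) else str(v) for v in row.iloc[1:]]
        with open(yearBasePath + f'{y}.csv', 'a') as outFile:
            outFile.write('\t'.join(fields) + '\n')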