Join base files by year

In this notebook the information contained in the previously cleaned CSV files is reformatted and written to per-year folders. Each row of these CSVs holds one author-affiliation entry of a publication. For each CSV the rows are grouped by year and publication, and one line per publication is written containing the year, publication ID, title, language, journal ID, journal name, the authors and affiliations (each joined by semicolons), and a flag indicating whether any MPG affiliation is contained.
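Each line in the resulting year files thus has the following tab-separated layout (shown schematically, with TAB marking the separator):

year TAB pubId TAB title TAB language TAB journalId TAB journalName TAB has_MPG TAB authors (';'-joined) TAB affiliations (';'-joined)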

Imports

import pandas as pd
import datetime
import os
import re
import random
import numpy as np
from tqdm import tqdm
from functools import partial
from multiprocessing import Pool, cpu_count

Setup paths

The root path where all SciGraph data can be read and written.

dataPath = '/media/arbeit/b88b17b8-b5d7-4c4e-86bc-ff4b67458842/sciGraph/processedData/'
rootPath = dataPath + "extendedArticles/"
yearBasePath = dataPath + 'yeardata/'

The main routine processes a single CSV file, so the work can be parallelized over as many CPUs as are locally available. We therefore need a list of the CSV files.

csvList = [x for x in os.listdir(rootPath) if x.startswith('articles') and x.endswith('.csv')]
csvList[0]
'articles_3201.csv'
df = pd.read_csv(rootPath + csvList[120])
df.head(1)
is_MPG              False
journalId           sg:journal.1017409
journalName         Clinical Pharmacology & Therapeutics
affName             ['University of Alberta Alberta Research Centr...
affId               https://www.grid.ac/institutes/grid.17089.37
alternateName       University of Alberta
autId               sg:person.0764071732.05
familyName          Hartling
givenName           L
datePublishedReg    2011-11-01
pubId               sg:pub.10.1038/clpt.2011.212
inLanguage          ['en']
title               StaR Child Health: Developing Evidence‐Based G...

Define routine

def createLinks(file, pubIDkey='pubId', journalIDKey='journalId', journalNameKey='journalName', debug=False):
    """Split one cleaned CSV into per-year files with one tab-separated line per publication."""
    try:
        filename = str(file)
        df = pd.read_csv(rootPath + file)
        df = df.fillna('')
        # Derive the publication year from the date string, e.g. '2011-11-01' -> '2011'.
        year = df['datePublishedReg'].apply(lambda row: row.split('-')[0])
        df.insert(0, 'year', year)
        for y, g1 in df.groupby('year'):
            if y:
                os.makedirs(yearBasePath + y, exist_ok=True)
                # 'outFile' avoids shadowing the 'file' parameter.
                with open(yearBasePath + y + '/' + filename.split('.')[0] + '_{0}.csv'.format(y), 'w') as outFile:
                    for pub, g2 in g1.groupby(pubIDkey):
                        if debug:
                            print(pub)
                        journalID = g2[journalIDKey].unique()[0]
                        pubName = g2[journalNameKey].unique()[0]
                        lang = g2.inLanguage.unique()[0]
                        title = g2.title.unique()[0]
                        # is_MPG values may be booleans or strings, so check for both.
                        if any(x in g2.is_MPG.value_counts().to_dict().keys() for x in ('True', True)):
                            has_MPG = 'True'
                        else:
                            has_MPG = 'False'
                        res = []
                        resAff = []
                        for _, row in g2.iterrows():
                            # Prefer the author ID, fall back to 'familyName_givenName'.
                            if row['autId']:
                                res.append(row['autId'])
                            elif row['familyName']:
                                res.append(row['familyName'] + '_' + row['givenName'])
                            # Prefer the affiliation ID, fall back to the affiliation name.
                            if row['affId']:
                                resAff.append(row['affId'])
                            elif row['affName']:
                                resAff.append(row['affName'])
                        outFile.write(
                            '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\n'.format(
                                y, 
                                pub, 
                                re.sub('\t', '', title), 
                                lang, 
                                journalID, 
                                pubName, 
                                has_MPG, 
                                re.sub('\t', '', ';'.join(res)),
                                ';'.join(resAff)
                            )
                        )
            else:
                # Rows without a publication date go to the 'noYear' folder. Note that
                # here the fifth column repeats the publication ID in place of the
                # journal ID (visible in the dfMissing output below).
                os.makedirs(yearBasePath + 'noYear', exist_ok=True)
                with open(yearBasePath + 'noYear/' + filename.split('.')[0] + '_{0}.csv'.format(y), 'w') as outFile:
                    for pub, g2 in g1.groupby(pubIDkey):
                        if debug:
                            print(pub)
                        publicationID = g2[pubIDkey].unique()[0]
                        pubName = g2[journalNameKey].unique()[0]
                        lang = g2.inLanguage.unique()[0]
                        title = g2.title.unique()[0]
                        if any(x in g2.is_MPG.value_counts().to_dict().keys() for x in ('True', True)):
                            has_MPG = 'True'
                        else:
                            has_MPG = 'False'
                        res = []
                        resAff = []
                        for _, row in g2.iterrows():
                            if row['autId']:
                                res.append(row['autId'])
                            elif row['familyName']:
                                res.append(row['familyName'] + '_' + row['givenName'])
                            if row['affId']:
                                resAff.append(row['affId'])
                            elif row['affName']:
                                resAff.append(row['affName'])
                        outFile.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\n'.format(y, pub, re.sub('\t', '', title), lang, publicationID, pubName, has_MPG, ';'.join(res), ';'.join(resAff)))
    
    except Exception:
        # 'result' is a module-level list; it is only visible when running in a
        # single process (each Pool worker appends to its own copy).
        result.append(f'Failed for {filename}')
        raise 
    result.append(f'Done for {filename}')
    return

Testing with a single file

result = []
createLinks(csvList[120])
result
['Done for articles_5877.csv']

Run on all files

ncore = cpu_count() - 2
result = []


with Pool(ncore) as p:
    max_ = len(csvList)
    with tqdm(total=max_) as pbar:
        for i, _ in enumerate(p.imap_unordered(createLinks, csvList)):
            pbar.update()
100%|██████████| 10645/10645 [58:00<00:00,  3.06it/s] 
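
Note that the module-level result list is only filled during the single-process test above; each Pool worker appends to its own copy, so the workers' messages are lost here. A minimal sketch of how the statuses could be collected instead, assuming createLinks were changed to return its message rather than appending to result:

statuses = []
with Pool(ncore) as p:
    with tqdm(total=len(csvList)) as pbar:
        for status in p.imap_unordered(createLinks, csvList):
            statuses.append(status)  # e.g. 'Done for articles_3201.csv'
            pbar.update()
# inspect any failures after the pool has finished
failed = [s for s in statuses if s and s.startswith('Failed')]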

Join year files

For each year folder, concatenate the contained files into a single year file in the main yearBasePath folder.

for year in os.listdir(yearBasePath):
    yearPath = yearBasePath + year + '/' 
    if os.path.isdir(yearPath):
        csvList = os.listdir(yearPath)
        with open(yearBasePath + f'{year}.csv','a') as outFile:
            for file in csvList:
                with open(yearPath + file, 'r') as inFile:
                    lines = inFile.readlines()
                    outFile.writelines(lines)
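
Because the combined year files are opened in append mode, re-running this cell duplicates every line. A simple safeguard (a sketch; it assumes the combined files can always be rebuilt from the year folders) is to delete any previously combined files first:

# remove previously combined top-level year CSVs before re-running the join
for f in os.listdir(yearBasePath):
    if f.endswith('.csv') and os.path.isfile(yearBasePath + f):
        os.remove(yearBasePath + f)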

Missing publication year data

For a small number of publications the metadata contains no publication date. These 45 publications could be added to the database by hand (a sketch of such a patch follows the output below).

resDF = []
for file in os.listdir(yearBasePath + 'noYear/'):
    if os.path.isfile(yearBasePath + 'noYear/' + file):
        if file.startswith('articles'):
            resDF.append(pd.read_csv(yearBasePath + 'noYear/' + file, sep='\t',header=None))
dfMissing = pd.concat(resDF)
dfMissing.shape
(45, 9)
dfMissing.head(2)
row 0:
  0 (year)                         NaN
  1 (pubId)                        sg:pub.10.1007/0-387-23852-2_10
  2 (title)                        IT-Supported Modeling, Analysis and Design of ...
  3 (language)                     ['en']
  4 (pubId, in place of journalId) sg:pub.10.1007/0-387-23852-2_10
  5 (journalName)                  NaN
  6 (has_MPG)                      False
  7 (authors)                      sg:person.011160054337.99;sg:person.0110715537...
  8 (affiliations)                 NaN

row 1:
  0 (year)                         NaN
  1 (pubId)                        sg:pub.10.1007/7631_2015_7
  2 (title)                        Uterine Cancer: Pathology
  3 (language)                     ['en']
  4 (pubId, in place of journalId) sg:pub.10.1007/7631_2015_7
  5 (journalName)                  NaN
  6 (has_MPG)                      False
  7 (authors)                      sg:person.01354507572.27;sg:person.01066764345.44
  8 (affiliations)                 ['Department of Pathology, Memorial Sloan Kett...
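
As mentioned above, the missing years could be patched by hand. A sketch of how such a patch might look, assuming a hand-curated mapping from publication ID to year (the mapping below is an empty placeholder, to be filled in manually):

# hand-curated mapping from publication ID to year; to be filled in manually
manualYears = {
    # 'sg:pub.10.1007/...': '2015',
}

for _, row in dfMissing.iterrows():
    y = manualYears.get(row.iloc[1])  # column 1 holds the publication ID
    if y:
        # prepend the year and append the record to the matching combined year file
        fields = [y] + ['' if pd.isna(v) else str(v) for v in row.iloc[1:]]
        with open(yearBasePath + f'{y}.csv', 'a') as outFile:
            outFile.write('\t'.join(fields) + '\n')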