Join base files by year¶
In this notebook the information contained in the previously cleaned CSV files is reformatted and written to year folders. For each CSV the information is grouped per publication and year: publication id, title, language, journal name, authors and affiliations (each joined by semicolons), and whether MPG affiliations are present.
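Each line of the resulting per-year files is a tab-separated record; the field order below mirrors the write statement in the routine defined further down (the names are only descriptive, the files themselves carry no header row):
# Field order of the tab-separated output, one line per publication
# (descriptive names only; the output files have no header row):
outputFields = ['year', 'pubId', 'title', 'inLanguage', 'journalId',
                'journalName', 'has_MPG', 'authors', 'affiliations']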
Imports¶
import pandas as pd
import datetime
import os
import re
import random
import numpy as np
from tqdm import tqdm
from functools import partial
from multiprocessing import Pool, cpu_count
Setup paths¶
The root path where all SciGraph data can be read and written.
dataPath = '/media/arbeit/b88b17b8-b5d7-4c4e-86bc-ff4b67458842/sciGraph/processedData/'
rootPath = dataPath + "extendedArticles/"
yearBasePath = dataPath + 'yeardata/'
The main routine processes a single CSV file at a time, which allows parallel work on as many CPUs as are locally available. We therefore first need a list of the CSV files.
csvList = [x for x in os.listdir(rootPath) if x.startswith('articles') and x.endswith('.csv')]
csvList[0]
'articles_3201.csv'
df = pd.read_csv(rootPath + csvList[120])
df.head(1)
| | is_MPG | journalId | journalName | affName | affId | alternateName | autId | familyName | givenName | datePublishedReg | pubId | inLanguage | title |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | False | sg:journal.1017409 | Clinical Pharmacology & Therapeutics | ['University of Alberta Alberta Research Centr... | https://www.grid.ac/institutes/grid.17089.37 | University of Alberta | sg:person.0764071732.05 | Hartling | L | 2011-11-01 | sg:pub.10.1038/clpt.2011.212 | ['en'] | StaR Child Health: Developing Evidence‐Based G... |
Define routine¶
def createLinks(file, pubIDkey='pubId', journalIDKey='journalId', journalNameKey='journalName', debug=False):
    try:
        filename = str(file)
        df = pd.read_csv(rootPath + file)
        df = df.fillna('')
        # Derive the publication year from the date string (YYYY-MM-DD).
        year = df['datePublishedReg'].apply(lambda row: row.split('-')[0])
        df.insert(0, 'year', year)
        for y, g1 in df.groupby('year'):
            if y:
                os.makedirs(yearBasePath + y, exist_ok=True)
                with open(yearBasePath + y + '/' + filename.split('.')[0] + '_{0}.csv'.format(y), 'w') as outFile:
                    for pub, g2 in g1.groupby(pubIDkey):
                        if debug:
                            print(pub)
                        journalID = g2[journalIDKey].unique()[0]
                        journalName = g2[journalNameKey].unique()[0]
                        lang = g2.inLanguage.unique()[0]
                        title = g2.title.unique()[0]
                        # is_MPG may hold booleans or their string form, depending on the source CSV.
                        if any(x in set(g2.is_MPG) for x in ('True', True)):
                            has_MPG = 'True'
                        else:
                            has_MPG = 'False'
                        res = []     # author ids, with names as fallback
                        resAff = []  # affiliation ids, with names as fallback
                        for i, row in g2.iterrows():
                            if row['autId']:
                                res.append(row['autId'])
                            elif row['familyName']:
                                res.append(row['familyName'] + '_' + row['givenName'])
                            if row['affId']:
                                resAff.append(row['affId'])
                            elif row['affName']:
                                resAff.append(row['affName'])
                        # Strip stray tabs so they cannot break the tab-separated output.
                        outFile.write(
                            '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\n'.format(
                                y,
                                pub,
                                re.sub('\t', '', title),
                                lang,
                                journalID,
                                journalName,
                                has_MPG,
                                re.sub('\t', '', ';'.join(res)),
                                re.sub('\t', '', ';'.join(resAff))
                            )
                        )
            else:
                # Publications without a parseable date are collected in a 'noYear' folder.
                os.makedirs(yearBasePath + 'noYear', exist_ok=True)
                with open(yearBasePath + 'noYear/' + filename.split('.')[0] + '_{0}.csv'.format(y), 'w') as outFile:
                    for pub, g2 in g1.groupby(pubIDkey):
                        if debug:
                            print(pub)
                        journalID = g2[journalIDKey].unique()[0]
                        journalName = g2[journalNameKey].unique()[0]
                        lang = g2.inLanguage.unique()[0]
                        title = g2.title.unique()[0]
                        if any(x in set(g2.is_MPG) for x in ('True', True)):
                            has_MPG = 'True'
                        else:
                            has_MPG = 'False'
                        res = []
                        resAff = []
                        for i, row in g2.iterrows():
                            if row['autId']:
                                res.append(row['autId'])
                            elif row['familyName']:
                                res.append(row['familyName'] + '_' + row['givenName'])
                            if row['affId']:
                                resAff.append(row['affId'])
                            elif row['affName']:
                                resAff.append(row['affName'])
                        outFile.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\n'.format(y, pub, re.sub('\t', '', title), lang, journalID, journalName, has_MPG, re.sub('\t', '', ';'.join(res)), re.sub('\t', '', ';'.join(resAff))))
    except Exception:
        result.append(f'Failed for {filename}')
        raise
    result.append(f'Done for {filename}')
    return
Testing with a single file¶
result = []
createLinks(csvList[120])
result
['Done for articles_5877.csv']
Run on all files¶
ncore = cpu_count() - 2  # leave two cores free for other work
result = []
with Pool(ncore) as p:
    max_ = len(csvList)
    with tqdm(total=max_) as pbar:
        # imap_unordered yields as soon as any worker finishes a file.
        for i, _ in enumerate(p.imap_unordered(createLinks, csvList)):
            pbar.update()
100%|██████████| 10645/10645 [58:00<00:00, 3.06it/s]
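Note that result stays empty in the parent process after the parallel run: each worker appends to its own copy of the module-level list. A minimal sketch of an alternative that reports the per-file status back through the return value instead (the wrapper name is made up):
def createLinksWithStatus(file):
    # Hypothetical wrapper: return the status to the parent process
    # instead of appending to the module-level `result` list.
    try:
        createLinks(file)
        return f'Done for {file}'
    except Exception:
        return f'Failed for {file}'

with Pool(ncore) as p:
    statuses = list(tqdm(p.imap_unordered(createLinksWithStatus, csvList), total=len(csvList)))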
Join year files¶
For each year folder, join the contained files into a single year file in the main yearBasePath folder.
for year in os.listdir(yearBasePath):
    yearPath = yearBasePath + year + '/'
    if os.path.isdir(yearPath):
        partFiles = os.listdir(yearPath)
        # Mode 'a' appends, so re-running this cell duplicates lines.
        with open(yearBasePath + f'{year}.csv', 'a') as outFile:
            for file in partFiles:
                with open(yearPath + file, 'r') as inFile:
                    outFile.writelines(inFile.readlines())
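A joined year file can be read back for a quick check, assuming the column order written by createLinks (the file name 2011.csv is only an example):
colNames = ['year', 'pubId', 'title', 'inLanguage', 'journalId',
            'journalName', 'has_MPG', 'authors', 'affiliations']
dfYear = pd.read_csv(yearBasePath + '2011.csv', sep='\t', header=None, names=colNames)
dfYear.head(2)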
Missing publication year data¶
For a small number of publications the metadata contains no publication date. These 45 publications could be added to the database by hand.
resDF = []
for file in os.listdir(yearBasePath + 'noYear/'):
if os.path.isfile(yearBasePath + 'noYear/' + file):
if file.startswith('articles'):
resDF.append(pd.read_csv(yearBasePath + 'noYear/' + file, sep='\t',header=None))
dfMissing = pd.concat(resDF)
dfMissing.shape
(45, 9)
dfMissing.head(2)
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | NaN | sg:pub.10.1007/0-387-23852-2_10 | IT-Supported Modeling, Analysis and Design of ... | ['en'] | sg:pub.10.1007/0-387-23852-2_10 | NaN | False | sg:person.011160054337.99;sg:person.0110715537... | NaN |
| 0 | NaN | sg:pub.10.1007/7631_2015_7 | Uterine Cancer: Pathology | ['en'] | sg:pub.10.1007/7631_2015_7 | NaN | False | sg:person.01354507572.27;sg:person.01066764345.44 | ['Department of Pathology, Memorial Sloan Kett... |
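A minimal sketch of such a manual fix, assuming the year for one of the records has been looked up by hand (the mapping below is made up for illustration):
# Hypothetical manual fix: assign a looked-up year to a record and
# append it to the matching year file (the year here is an assumption).
manualYears = {'sg:pub.10.1007/7631_2015_7': '2015'}
for pubId, y in manualYears.items():
    row = dfMissing[dfMissing[1] == pubId].copy()
    row[0] = y  # column 0 holds the year
    row.to_csv(yearBasePath + f'{y}.csv', sep='\t', header=False, index=False, mode='a')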