Expand author name and affiliation

This notebook reads the JSONL files downloaded from SciGraph:

SN SciGraph Team (2018): Dataset: Articles. SN SciGraph. Dataset. https://doi.org/10.6084/m9.figshare.7376468

The source archives should be extracted into a folder whose location is given by dataPath below; this path is passed to the class as rootPath.

The class ExpandAuthorMetadata is initialized with this root data path. All paths for writing the various resulting files can be changed in the main class. The defaults are as follows:

  • self.read = 'articles/': The folder containing the JSONL input files.

  • self.write = 'processedData/extendedArticles/': The folder for the reprocessed output files.

  • self.stats = 'processedData/stats/': The folder for writing statistics about each JSONL file, namely (see the sketch after this list):

    • number of publications (N_pub)

    • number of publications with author information (N_pub_with_aut)

    • number of author entries (N_aut)

    • number of author entries with affiliation information (N_aut_with_aff)

    • number of author entries with at least one MPG affiliation (N_aut_with_MPG)
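
As a rough sketch, one statistics record as assembled by getMissingData() and completed in run() looks as follows; the file name and counts are invented for illustration.

stat = {
    'File': 'articles_0001.jsonl',   # hypothetical file name
    'N_pub': 1000,                   # publications in the file
    'N_pub_with_aut': 950,           # publications with author information
    'N_aut': 4200,                   # author entries after exploding the author lists
    'N_aut_with_aff': 3100,          # author entries with affiliation information
    'N_aut_with_MPG': 12             # author entries whose affiliation matches the MPG pattern
}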

The variable self.mpgstring holds the regular expression used to decide whether a publication counts as (at least partly) authored by members of the Max Planck Society (MPG).
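
For illustration, a minimal sketch of how the pattern is applied; the affiliation names below are invented and not taken from the dataset.

import re

mpgstring = 'Max-Planck-Ins|Max Planck Ins|MPI|MPG'
examples = [
    'Max Planck Institute for the History of Science',
    'MPI for Informatics, Saarbruecken',
    'Humboldt University of Berlin',
]
for name in examples:
    print(name, '->', bool(re.findall(mpgstring, name)))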

Other variables store intermediate data or define the data keys that end up as columns in the final CSV version of the output file.
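
Because several levels of the metadata share the generic keys 'id' and 'name', run() renames them per level. A small sketch of the resulting mapping (the dictionary is only illustrative and not used by the class):

renames = {
    'publication':        {'id': 'pubId', 'name': 'title'},
    'author':             {'id': 'autId'},
    'affiliation':        {'id': 'affId', 'name': 'affName'},
    'journal (isPartOf)': {'id': 'journalId', 'name': 'journalName'},
}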

Imports

import pandas as pd
import datetime
import os
import re
import random
import numpy as np
from tqdm import tqdm
from functools import partial
from multiprocessing import Pool, cpu_count

Setup paths

The root path where all SciGraph data can be read and written.

dataPath = '/media/arbeit/b88b17b8-b5d7-4c4e-86bc-ff4b67458842/sciGraph/'

The main class is initialized with each JSONL file to allow parallel work on as many CPUs as are locally available. We therefore need a list of the JSONL files.

articlesPath = dataPath + "articles/"
jsonList = [x for x in os.listdir(articlesPath) if x.startswith('articles') and x.endswith('.jsonl')]
jsonList[0]
'articles_2983.jsonl'
df = pd.read_json(articlesPath + jsonList[123], lines=True)
df.head(2)
id sdPublisher @context sdDatePublished type sdSource sdDataset sdLicense name url ... description genre author isAccessibleForFree inLanguage productId about citation datePublishedReg isFundedItemOf
0 sg:pub.10.1590/s0482-50042013000400011 {'name': 'Springer Nature - SN SciGraph projec... https://springernature.github.io/scigraph/json... 2019-04-10T15:29 ScholarlyArticle s3://com-uberresearch-data-dimensions-target-2... articles https://scigraph.springernature.com/explorer/l... Perfil lipídico em pacientes adultos com artri... http://www.scielo.br/scielo.php?script=sci_art... ... The inflammatory processes in the joints of a ... research_article [{'id': 'sg:person.01351230604.52', 'type': 'P... True [pt] [{'value': ['b596404f563e5d292f5012f0441ecd6af... [{'id': 'http://purl.org/au-research/vocabular... [{'id': 'https://doi.org/10.1093/rheumatology/... 2013-07-01 NaN
1 sg:pub.10.1111/j.1601-5223.1962.tb01812.x {'name': 'Springer Nature - SN SciGraph projec... https://springernature.github.io/scigraph/json... 2019-04-10T15:29 ScholarlyArticle s3://com-uberresearch-data-dimensions-target-2... articles https://scigraph.springernature.com/explorer/l... GENETIC STUDIES ON BLOOD GROUPS, TRANSFERRINS ... http://onlinelibrary.wiley.com/doi/10.1111/j.1... ... NaN research_article [{'type': 'Person', 'givenName': 'M.', 'family... True [en] [{'value': ['b6b66a6e00b3a94b45af93095c64c7b8d... NaN [{'id': 'https://doi.org/10.1080/0001512570944... 1962-08-01 NaN

2 rows × 24 columns

Main class definition

To rerun the statistics creation, set self.createStats = True; to skip files whose output CSV already exists, set self.rerun = False.

class ExpandAuthorMetadata(object):
    
    def __init__(self, filename, rootPath):
        self.base = rootPath
        self.baseOutPath = 'processedData/'
        self.read = 'articles/'
        self.rerun = True
        self.write = self.baseOutPath + 'extendedArticles/'
        self.createStats = True
        self.stats = self.baseOutPath + 'stats/'
        self.statsErros = self.baseOutPath + 'errors/'
        for folder in [self.baseOutPath, self.write, self.stats, self.statsErros]:
            if not os.path.isdir(self.base + folder):
                os.makedirs(self.base + folder)
        self.dataframe = pd.read_json(self.base + self.read + filename, lines=True)
        self.filename = filename
        self.keys = ['givenName','familyName','id','affiliation']
        self.affkeys = ['alternateName','id','name'] 
        self.pubkeys = ['name','id']
        self.mpgstring = 'Max-Planck-Ins|Max Planck Ins|MPI|MPG'
    
    def getMissingData(self):
        """Collect per-file statistics on publications, authors and affiliations."""
        res = {}
        res['File'] = self.filename
        df = self.dataframe
        res['N_pub'] = df.shape[0]
        res['N_aut_with_MPG'] = ''
        try:
            aut = df[~df.author.isna()]
            res['N_pub_with_aut'] = aut.shape[0]
        except:
            res['N_pub_with_aut'] = None
            return res
        autL = [x for y in aut.author.values for x in y]  
        dfAut = pd.DataFrame(autL)
        res['N_aut'] = dfAut.shape[0]
        try:
            dfAff = dfAut[~dfAut.affiliation.isna()]
            res['N_aut_with_aff'] = dfAff.shape[0]
        except:
            res['N_aut_with_aff'] = None
        return res
    
    def _getAutKeys(self, row, key):
        """Return a single key from an author dict, or '' if it is missing."""
        if row:
            try:
                return row[key]
            except:
                return ''
            
    def _getPubKeys(self, row, key):
        """Extract a key from each entry of the isPartOf list; join multiple values with ';'."""
        if row:
            try:
                res = [x[key] for x in row if key in x.keys()]
                if len(res) == 1:
                    return res[0]
                else:
                    return ';'.join(res)
            except:
                return ''
            
    def _getAutAff(self, row, key):
        """Return a key from the author's affiliation entry, or '' if it is missing."""
        if row:
            try:
                aff = row['affiliation']
                return aff[key]
            except:
                return ''
            
    def _findMPG(self, row):
        """Check whether any affiliation name matches the MPG regular expression."""
        if row:
            try:
                if any([re.findall(self.mpgstring, x) for x in row]):
                    return True
                else:
                    return False
            except:
                return 'Error'
        else:
            return 'None'
    
    def run(self):
        """Expand author, affiliation and journal metadata into a flat table and write it to CSV."""
        outfile = self.base + self.write + self.filename.split('.')[0] + '.csv'
        if os.path.isfile(outfile):
            if not self.rerun:
                return 'exists'
        if self.createStats:
            try:
                stat = self.getMissingData()
                pd.DataFrame([stat]).to_csv(self.base + self.stats + self.filename, index=False)
            except:
                with open('{0}'.format(self.base + self.statsErros + self.filename),'w') as file:
                    file.write('Error')
        try:
            dfT0 = self.dataframe[['datePublishedReg','id','author','isPartOf','inLanguage','name']]
            dfT1 = dfT0.rename(columns={'name':'title'})
            dfT2 = dfT1.explode('author').reset_index(drop=True).rename(columns={'id':'pubId'})
            res1 = {}
            for el in self.keys:
                res1[el] = dfT2.author.apply(lambda row: self._getAutKeys(row, el))
            for el in self.keys:
                dfT2.insert(0,el,res1[el])
            dfT3 = dfT2.rename(columns={'id':'autId'})
            res2 = {}
            for el in self.affkeys:
                res2[el] = dfT3.author.apply(lambda row: self._getAutAff(row, el))
            for el in self.affkeys:
                dfT3.insert(0,el,res2[el])
            dfT4 = dfT3.rename(columns={'id':'affId','name':'affName'})
            res3 = {}
            for el in self.pubkeys:
                res3[el] = dfT4.isPartOf.apply(lambda row: self._getPubKeys(row, el))
            for el in self.pubkeys:
                dfT4.insert(0,el,res3[el])
            dfT5 = dfT4.rename(columns={'id':'journalId','name':'journalName'})
        
            dfT6 = dfT5.drop('author',axis=1).drop('affiliation',axis=1).drop('isPartOf', axis=1)
            mpg = dfT6.affName.apply(lambda row: self._findMPG(row))
            if self.createStats:
                stat.update(
                    {'N_aut_with_MPG':(mpg == True).sum()}
                )
                pd.DataFrame([stat]).to_csv(self.base + self.stats + self.filename, index=False)
            dfT6.insert(0,'is_MPG', mpg)
            dfT6.to_csv(outfile, index = False)
            return 'done'
        except:
            return self.filename
    
def makeRun(filename):
    x = ExpandAuthorMetadata(filename, dataPath)
    s = x.run()
    return s

Testing for single file

makeRun(jsonList[3])
'done'
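
As a quick sanity check, the output written by run() for this file can be read back. This is a sketch assuming the default output path and the column names produced by the renames in run().

outName = jsonList[3].split('.')[0] + '.csv'
check = pd.read_csv(dataPath + 'processedData/extendedArticles/' + outName)
check[['is_MPG', 'givenName', 'familyName', 'affName', 'journalName', 'title']].head()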

Run expansion for all files

On a large system, select as many CPUs as are available. On a mid-range workstation with 16 cores, this operation took about 40 minutes.

ncore = cpu_count() - 2  # leave two cores free for the rest of the system

with Pool(ncore) as p:
    max_ = len(jsonList)
    with tqdm(total=max_) as pbar:
        for i, _ in enumerate(p.imap_unordered(makeRun, jsonList)):
            pbar.update()
100%|██████████| 10684/10684 [41:45<00:00,  4.26it/s] 