Expand author name and affiliation

This notebook reads the JSONL files downloaded from SciGraph:

SN SciGraph Team (2018): Dataset: Articles. SN SciGraph. Dataset. https://doi.org/10.6084/m9.figshare.7376468

The source archives should be extracted into a folder whose location is given by dataPath below; this path is passed to the class as rootPath.

The class ExpandAuthorMetadata is initialized with this root data path. All paths for writing the various resulting files can be changed in the main class. The defaults are as follows:

  • self.read = 'articles/': The folder containing the JSONL input files.

  • self.write = 'processedData/extendedArticles/': The folder for the reprocessed output files.

  • self.stats = 'processedData/stats/': The folder for writing statistics about each JSONL file, namely (see the sketch after this list):

    • number of publications (N_pub)

    • number of publications with author information (N_pub_with_aut)

    • number of author entries (N_aut)

    • number of author entries with affiliation information (N_aut_with_aff)

    • number of author entries with at least one MPG affiliation (N_aut_with_MPG)
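
As a rough sketch, one statistics record as assembled by getMissingData() and completed in run() looks as follows; the file name and counts are invented for illustration.

stat = {
    'File': 'articles_0001.jsonl',   # hypothetical file name
    'N_pub': 1000,                   # publications in the file
    'N_pub_with_aut': 950,           # publications with author information
    'N_aut': 4200,                   # author entries after exploding the author lists
    'N_aut_with_aff': 3100,          # author entries with affiliation information
    'N_aut_with_MPG': 12             # author entries whose affiliation matches the MPG pattern
}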

The variable self.mpgstring holds the regular expression used to decide whether a publication counts as (at least partly) authored by members of the Max Planck Society (MPG).
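
For illustration, a minimal sketch of how the pattern is applied; the affiliation names below are invented and not taken from the dataset.

import re

mpgstring = 'Max-Planck-Ins|Max Planck Ins|MPI|MPG'
examples = [
    'Max Planck Institute for the History of Science',
    'MPI for Informatics, Saarbruecken',
    'Humboldt University of Berlin',
]
for name in examples:
    print(name, '->', bool(re.findall(mpgstring, name)))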

Other variables store intermediate data or define the data keys that end up as columns in the final CSV version of the output file.
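
Because several levels of the metadata share the generic keys 'id' and 'name', run() renames them per level. A small sketch of the resulting mapping (the dictionary is only illustrative and not used by the class):

renames = {
    'publication':        {'id': 'pubId', 'name': 'title'},
    'author':             {'id': 'autId'},
    'affiliation':        {'id': 'affId', 'name': 'affName'},
    'journal (isPartOf)': {'id': 'journalId', 'name': 'journalName'},
}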

Imports

import pandas as pd
import datetime
import os
import re
import random
import numpy as np
from tqdm import tqdm
from functools import partial
from multiprocessing import Pool, cpu_count

Setup paths

The root path where all SciGraph data can be read and written.

dataPath = '/media/arbeit/b88b17b8-b5d7-4c4e-86bc-ff4b67458842/sciGraph/'

The main class is initialized with each JSONL file to allow parallel work on as many CPUs as are locally available. We therefore need a list of the JSONL files.

articlesPath = dataPath + "articles/"
jsonList = [x for x in os.listdir(articlesPath) if x.startswith('articles') and x.endswith('.jsonl')]
jsonList[0]
'articles_2983.jsonl'
df = pd.read_json(articlesPath + jsonList[123], lines=True)
df.head(2)
id sdPublisher @context sdDatePublished type sdSource sdDataset sdLicense name url ... description genre author isAccessibleForFree inLanguage productId about citation datePublishedReg isFundedItemOf
0 sg:pub.10.1590/s0482-50042013000400011 {'name': 'Springer Nature - SN SciGraph projec... https://springernature.github.io/scigraph/json... 2019-04-10T15:29 ScholarlyArticle s3://com-uberresearch-data-dimensions-target-2... articles https://scigraph.springernature.com/explorer/l... Perfil lipídico em pacientes adultos com artri... http://www.scielo.br/scielo.php?script=sci_art... ... The inflammatory processes in the joints of a ... research_article [{'id': 'sg:person.01351230604.52', 'type': 'P... True [pt] [{'value': ['b596404f563e5d292f5012f0441ecd6af... [{'id': 'http://purl.org/au-research/vocabular... [{'id': 'https://doi.org/10.1093/rheumatology/... 2013-07-01 NaN
1 sg:pub.10.1111/j.1601-5223.1962.tb01812.x {'name': 'Springer Nature - SN SciGraph projec... https://springernature.github.io/scigraph/json... 2019-04-10T15:29 ScholarlyArticle s3://com-uberresearch-data-dimensions-target-2... articles https://scigraph.springernature.com/explorer/l... GENETIC STUDIES ON BLOOD GROUPS, TRANSFERRINS ... http://onlinelibrary.wiley.com/doi/10.1111/j.1... ... NaN research_article [{'type': 'Person', 'givenName': 'M.', 'family... True [en] [{'value': ['b6b66a6e00b3a94b45af93095c64c7b8d... NaN [{'id': 'https://doi.org/10.1080/0001512570944... 1962-08-01 NaN

2 rows × 24 columns

Main class definition

To rerun the statistics creation, set self.createStats = True; to skip files whose output CSV already exists, set self.rerun = False.

class ExpandAuthorMetadata(object):
    
    def __init__(self, filename, rootPath):
        self.base = rootPath
        self.baseOutPath = 'processedData/'
        self.read = 'articles/'
        self.rerun = True
        self.write = self.baseOutPath + 'extendedArticles/'
        self.createStats = True
        self.stats = self.baseOutPath + 'stats/'
        self.statsErros = self.baseOutPath + 'errors/'
        for folder in [self.baseOutPath, self.write, self.stats, self.statsErros]:
            if not os.path.isdir(self.base + folder):
                os.makedirs(self.base + folder)
        self.dataframe = pd.read_json(self.base + self.read + filename, lines=True)
        self.filename = filename
        self.keys = ['givenName','familyName','id','affiliation']
        self.affkeys = ['alternateName','id','name'] 
        self.pubkeys = ['name','id']
        self.mpgstring = 'Max-Planck-Ins|Max Planck Ins|MPI|MPG'
    
    def getMissingData(self):
        """Collect per-file statistics on publications, authors and affiliations."""
        res = {}
        res['File'] = self.filename
        df = self.dataframe
        res['N_pub'] = df.shape[0]
        res['N_aut_with_MPG'] = ''
        try:
            aut = df[~df.author.isna()]
            res['N_pub_with_aut'] = aut.shape[0]
        except:
            res['N_pub_with_aut'] = None
            return res
        autL = [x for y in aut.author.values for x in y]  
        dfAut = pd.DataFrame(autL)
        res['N_aut'] = dfAut.shape[0]
        try:
            dfAff = dfAut[~dfAut.affiliation.isna()]
            res['N_aut_with_aff'] = dfAff.shape[0]
        except:
            res['N_aut_with_aff'] = None
        return res
    
    def _getAutKeys(self, row, key):
        """Return a single key from an author dict, or '' if it is missing."""
        if row:
            try:
                return row[key]
            except:
                return ''
            
    def _getPubKeys(self, row, key):
        """Extract a key from each entry of the isPartOf list; join multiple values with ';'."""
        if row:
            try:
                res = [x[key] for x in row if key in x.keys()]
                if len(res) == 1:
                    return res[0]
                else:
                    return ';'.join(res)
            except:
                return ''
            
    def _getAutAff(self, row, key):
        """Return a key from the author's affiliation entry, or '' if it is missing."""
        if row:
            try:
                aff = row['affiliation']
                return aff[key]
            except:
                return ''
            
    def _findMPG(self, row):
        """Check whether any affiliation name matches the MPG regular expression."""
        if row:
            try:
                if any([re.findall(self.mpgstring, x) for x in row]):
                    return True
                else:
                    return False
            except:
                return 'Error'
        else:
            return 'None'
    
    def run(self):
        """Expand author, affiliation and journal metadata into a flat table and write it to CSV."""
        outfile = self.base + self.write + self.filename.split('.')[0] + '.csv'
        if os.path.isfile(outfile):
            if not self.rerun:
                return 'exists'
        if self.createStats:
            try:
                stat = self.getMissingData()
                pd.DataFrame([stat]).to_csv(self.base + self.stats + self.filename, index=False)
            except:
                with open('{0}'.format(self.base + self.statsErros + self.filename),'w') as file:
                    file.write('Error')
        try:
            dfT0 = self.dataframe[['datePublishedReg','id','author','isPartOf','inLanguage','name']]
            dfT1 = dfT0.rename(columns={'name':'title'})
            dfT2 = dfT1.explode('author').reset_index(drop=True).rename(columns={'id':'pubId'})
            res1 = {}
            for el in self.keys:
                res1[el] = dfT2.author.apply(lambda row: self._getAutKeys(row, el))
            for el in self.keys:
                dfT2.insert(0,el,res1[el])
            dfT3 = dfT2.rename(columns={'id':'autId'})
            res2 = {}
            for el in self.affkeys:
                res2[el] = dfT3.author.apply(lambda row: self._getAutAff(row, el))
            for el in self.affkeys:
                dfT3.insert(0,el,res2[el])
            dfT4 = dfT3.rename(columns={'id':'affId','name':'affName'})
            res3 = {}
            for el in self.pubkeys:
                res3[el] = dfT4.isPartOf.apply(lambda row: self._getPubKeys(row, el))
            for el in self.pubkeys:
                dfT4.insert(0,el,res3[el])
            dfT5 = dfT4.rename(columns={'id':'journalId','name':'journalName'})
        
            dfT6 = dfT5.drop('author',axis=1).drop('affiliation',axis=1).drop('isPartOf', axis=1)
            mpg = dfT6.affName.apply(lambda row: self._findMPG(row))
            if self.createStats:
                stat.update(
                    {'N_aut_with_MPG':(mpg == True).sum()}
                )
                pd.DataFrame([stat]).to_csv(self.base + self.stats + self.filename, index=False)
            dfT6.insert(0,'is_MPG', mpg)
            dfT6.to_csv(outfile, index = False)
            return 'done'
        except:
            return self.filename
    
def makeRun(filename):
    x = ExpandAuthorMetadata(filename, dataPath)
    s = x.run()
    return s

Testing for single file

makeRun(jsonList[3])
'done'
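
As a quick sanity check, the output written by run() for this file can be read back. This is a sketch assuming the default output path and the column names produced by the renames in run().

outName = jsonList[3].split('.')[0] + '.csv'
check = pd.read_csv(dataPath + 'processedData/extendedArticles/' + outName)
check[['is_MPG', 'givenName', 'familyName', 'affName', 'journalName', 'title']].head()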

Run expansion for all files

On a large system, select as many CPUs as are available. On a mid-range workstation with 16 cores, this operation took about 40 minutes.

ncore = cpu_count() - 2  # leave two cores free for the rest of the system

with Pool(ncore) as p:
    max_ = len(jsonList)
    with tqdm(total=max_) as pbar:
        for i, _ in enumerate(p.imap_unordered(makeRun, jsonList)):
            pbar.update()
100%|██████████| 10684/10684 [41:45<00:00,  4.26it/s] 