Expand author name and affiliation¶
This notebook reads the JSONL files downloaded from SciGraph:
SN SciGraph Team (2018): Articles. SN SciGraph. Dataset. https://doi.org/10.6084/m9.figshare.7376468
The source material should be extracted to a local folder, whose location is set via `dataPath` and passed to the class as `rootPath`.
The class `ExpandAuthorMetadata` is initialized with this root data path. All paths for writing the various resulting files can be changed in the main class. The defaults are as follows:
- `self.read = 'articles/'`: the folder where the JSONL input files are found.
- `self.write = 'processedData/extendedArticles/'`: the folder for the reprocessed output files.
- `self.stats = 'processedData/stats/'`: the folder for statistics about each JSONL file, namely:
  - the number of publications,
  - the number of publications with author information,
  - the number of publications with affiliation information, and
  - the number of publications with at least one MPG affiliation.
The variable `self.mpgstring` holds the regular expression used to decide whether a publication should count as partly authored by MPG members.
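As a quick illustration of the matching (the affiliation string is only an example):

import re

mpgstring = 'Max-Planck-Ins|Max Planck Ins|MPI|MPG'

# Any of the listed alternatives marks an affiliation as MPG-related.
re.findall(mpgstring, 'Max Planck Institute for the History of Science, Berlin')
# -> ['Max Planck Ins']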
Other variables store intermediate data or define the data keys that should be read into the final CSV version of the output file.
Imports¶
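The notebook relies on `os`, `re`, and `pandas`; the parallel run at the end additionally uses `multiprocessing`. A minimal import cell, assuming these are the only dependencies:

import os
import re
from multiprocessing import Pool

import pandas as pd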
Setup paths¶
The root path where all SciGraph data can be read and written.
The main class is initialized with each JSONL file to allow parallel work on as many CPUs as are locally available. We therefore need a list of the JSONL files.
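A sketch of this setup; the concrete value of `dataPath` is machine-specific, and a `.jsonl` file extension is assumed:

# Placeholder path; point this to the folder with the extracted dataset.
dataPath = '/path/to/scigraph/'

# All JSONL source files the expansion will run on.
files = sorted(
    name for name in os.listdir(dataPath + 'articles/')
    if name.endswith('.jsonl')
)

# Preview the structure of one file; this produces the table below.
pd.read_json(dataPath + 'articles/' + files[0], lines=True).head(2)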
| | id | sdPublisher | @context | sdDatePublished | type | sdSource | sdDataset | sdLicense | name | url | ... | description | genre | author | isAccessibleForFree | inLanguage | productId | about | citation | datePublishedReg | isFundedItemOf |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | sg:pub.10.1590/s0482-50042013000400011 | {'name': 'Springer Nature - SN SciGraph projec... | https://springernature.github.io/scigraph/json... | 2019-04-10T15:29 | ScholarlyArticle | s3://com-uberresearch-data-dimensions-target-2... | articles | https://scigraph.springernature.com/explorer/l... | Perfil lipídico em pacientes adultos com artri... | http://www.scielo.br/scielo.php?script=sci_art... | ... | The inflammatory processes in the joints of a ... | research_article | [{'id': 'sg:person.01351230604.52', 'type': 'P... | True | [pt] | [{'value': ['b596404f563e5d292f5012f0441ecd6af... | [{'id': 'http://purl.org/au-research/vocabular... | [{'id': 'https://doi.org/10.1093/rheumatology/... | 2013-07-01 | NaN |
| 1 | sg:pub.10.1111/j.1601-5223.1962.tb01812.x | {'name': 'Springer Nature - SN SciGraph projec... | https://springernature.github.io/scigraph/json... | 2019-04-10T15:29 | ScholarlyArticle | s3://com-uberresearch-data-dimensions-target-2... | articles | https://scigraph.springernature.com/explorer/l... | GENETIC STUDIES ON BLOOD GROUPS, TRANSFERRINS ... | http://onlinelibrary.wiley.com/doi/10.1111/j.1... | ... | NaN | research_article | [{'type': 'Person', 'givenName': 'M.', 'family... | True | [en] | [{'value': ['b6b66a6e00b3a94b45af93095c64c7b8d... | NaN | [{'id': 'https://doi.org/10.1080/0001512570944... | 1962-08-01 | NaN |

2 rows × 24 columns
Main class definition¶
To rerun the statistics creation, set the variable `self.createStats = True`.
class ExpandAuthorMetadata(object):

    def __init__(self, filename, rootPath):
        self.base = rootPath
        self.baseOutPath = 'processedData/'
        # Input and output folders, all relative to the root path.
        self.read = 'articles/'
        self.rerun = True
        self.write = self.baseOutPath + 'extendedArticles/'
        self.createStats = True
        self.stats = self.baseOutPath + 'stats/'
        self.statsErrors = self.baseOutPath + 'errors/'
        for folder in [self.baseOutPath, self.write, self.stats, self.statsErrors]:
            if not os.path.isdir(self.base + folder):
                os.makedirs(self.base + folder)
        # Each instance processes exactly one JSONL file.
        self.dataframe = pd.read_json(self.base + self.read + filename, lines=True)
        self.filename = filename
        # Keys to extract per author, per affiliation, and per journal entry.
        self.keys = ['givenName', 'familyName', 'id', 'affiliation']
        self.affkeys = ['alternateName', 'id', 'name']
        self.pubkeys = ['name', 'id']
        # Regular expression that marks an affiliation as MPG-related.
        self.mpgstring = 'Max-Planck-Ins|Max Planck Ins|MPI|MPG'

    def getMissingData(self):
        # Collect per-file statistics on authors and affiliations.
        res = {}
        res['File'] = self.filename
        df = self.dataframe
        res['N_pub'] = df.shape[0]
        # Filled in later by run(), once the MPG matching has been done.
        res['N_aut_with_MPG'] = ''
        try:
            aut = df[~df.author.isna()]
            res['N_pub_with_aut'] = aut.shape[0]
        except Exception:
            res['N_pub_with_aut'] = None
            return res
        # Flatten the per-publication author lists into one dataframe.
        autL = [x for y in aut.author.values for x in y]
        dfAut = pd.DataFrame(autL)
        res['N_aut'] = dfAut.shape[0]
        try:
            dfAff = dfAut[~dfAut.affiliation.isna()]
            res['N_aut_with_aff'] = dfAff.shape[0]
        except Exception:
            res['N_aut_with_aff'] = None
        return res

    def _getAutKeys(self, row, key):
        # Return one author attribute, or '' if it is missing.
        if row:
            try:
                return row[key]
            except Exception:
                return ''
        return ''

    def _getPubKeys(self, row, key):
        # Return a journal attribute; join multiple values with ';'.
        if row:
            try:
                res = [x[key] for x in row if key in x.keys()]
                if len(res) == 1:
                    return res[0]
                else:
                    return ';'.join(res)
            except Exception:
                return ''
        return ''

    def _getAutAff(self, row, key):
        # Return one attribute of an author's affiliation, or ''.
        if row:
            try:
                aff = row['affiliation']
                return aff[key]
            except Exception:
                return ''
        return ''

    def _findMPG(self, row):
        # Check whether any affiliation name matches the MPG pattern.
        if row:
            try:
                return any(re.findall(self.mpgstring, x) for x in row)
            except Exception:
                return 'Error'
        else:
            return 'None'

    def run(self):
        # Expand one JSONL file into a flat CSV with author metadata.
        outfile = self.base + self.write + self.filename.split('.')[0] + '.csv'
        if os.path.isfile(outfile):
            if not self.rerun:
                return 'exists'
        if self.createStats:
            try:
                stat = self.getMissingData()
                pd.DataFrame([stat]).to_csv(self.base + self.stats + self.filename, index=False)
            except Exception:
                # Record the failing file name for later inspection.
                with open(self.base + self.statsErrors + self.filename, 'w') as file:
                    file.write('Error')
        try:
            dfT0 = self.dataframe[['datePublishedReg', 'id', 'author', 'isPartOf', 'inLanguage', 'name']]
            dfT1 = dfT0.rename(columns={'name': 'title'})
            # One row per author of each publication.
            dfT2 = dfT1.explode('author').reset_index(drop=True).rename(columns={'id': 'pubId'})
            # Expand the author dictionaries into separate columns.
            res1 = {}
            for el in self.keys:
                res1[el] = dfT2.author.apply(lambda row: self._getAutKeys(row, el))
            for el in self.keys:
                dfT2.insert(0, el, res1[el])
            dfT3 = dfT2.rename(columns={'id': 'autId'})
            # Expand the affiliation dictionaries into separate columns.
            res2 = {}
            for el in self.affkeys:
                res2[el] = dfT3.author.apply(lambda row: self._getAutAff(row, el))
            for el in self.affkeys:
                dfT3.insert(0, el, res2[el])
            dfT4 = dfT3.rename(columns={'id': 'affId', 'name': 'affName'})
            # Expand the journal information (isPartOf) into separate columns.
            res3 = {}
            for el in self.pubkeys:
                res3[el] = dfT4.isPartOf.apply(lambda row: self._getPubKeys(row, el))
            for el in self.pubkeys:
                dfT4.insert(0, el, res3[el])
            dfT5 = dfT4.rename(columns={'id': 'journalId', 'name': 'journalName'})
            dfT6 = dfT5.drop('author', axis=1).drop('affiliation', axis=1).drop('isPartOf', axis=1)
            # Flag rows whose affiliation name matches the MPG pattern.
            mpg = dfT6.affName.apply(lambda row: self._findMPG(row))
            if self.createStats:
                stat.update({'N_aut_with_MPG': (mpg == True).sum()})
                pd.DataFrame([stat]).to_csv(self.base + self.stats + self.filename, index=False)
            dfT6.insert(0, 'is_MPG', mpg)
            dfT6.to_csv(outfile, index=False)
            return 'done'
        except Exception:
            # Return the file name so that failing files can be collected.
            return self.filename

def makeRun(filename):
    # Wrapper for parallel processing; dataPath is the global set above.
    x = ExpandAuthorMetadata(filename, dataPath)
    s = x.run()
    return s
Testing for single file¶
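A minimal check on one file, assuming the `files` list from the setup above; `run()` returns 'done', 'exists', or the file name on failure:

makeRun(files[0])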
Run expansion for all files¶
For a large system, select as many CPUs as possible. On a mid-range workstation with 16 cores, this operation took 40 minutes.
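A sketch of the parallel run, assuming the `files` list and the imports from above:

# Use every locally available core. Files that fail return their
# name instead of 'done', so they can be collected and rerun.
with Pool(processes=os.cpu_count()) as pool:
    results = pool.map(makeRun, files)

failed = [r for r in results if r not in ('done', 'exists')]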