Focus on MPG publications¶
Imports¶
import pandas as pd
import os
import regex
import re
import string
import difflib
import numpy as np
from tqdm import tqdm
from functools import partial
from multiprocessing import Pool, cpu_count
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
Setup paths¶
gitDataPath = '../data/processedData/'
dataPath = '/media/arbeit/b88b17b8-b5d7-4c4e-86bc-ff4b67458842/sciGraph/'
General stopword routine to re-use at every step¶
rePunct = "\(|\)|\.|\d+|,|-"
rePunct
'\\(|\\)|\\.|\\d+|,|-'
stopwords = [
'Forschungsstelle',
'Förderung',
'förderung',
'Wissenschaften',
'Kaiser-Wilhelm-Institut',
'Max-Planck-gesellschaft',
'Max-Planck-Institut',
'maxplanckgesellschaft',
'maxplanckinstitut',
'maxplanckinstituts',
'maxplanckinstitute',
'maxplanckinsitut',
'maxplanckinst',
'maxplanckinstitutes',
'maxplanckinstituten',
'maxplanckinsituts',
'maxplanckinstitutt',
'maxplanckinstiut',
'maxplanckinsütut',
'planckinstitut',
'insitute',
'insitut',
'maxplanckarbeitsgruppe',
'maxplanckforschungsstelle',
'Max Planck Institute',
'Max-Planck-Institute',
'Max-Planck-Institutes',
'Max-Planck-Instituts',
'Max',
'Planck',
'MPI',
'MPG',
'Instituts',
'Institutes',
'Institute',
'Institut',
'Planck-Institut',
'zur',
'ab',
'die',
'und',
'der',
'of',
'für',
'fuer',
'fur',
'f¨r',
'fr',
'für',
'for',
'\n',
'des',
'the',
'GmbH',
'in',
'von'
]
lowerStopwords = [x.lower() for x in stopwords]
def cleanStopWords(instring):
ret = []
braket = [x.split(')(') for x in instring.split(' ')]
for x in [x for y in braket for x in y]:
x = re.sub(rePunct,'',x)
x = x.strip('\n').lower()
if x not in lowerStopwords:
if x.strip() not in ret:
ret.append(x.strip())
return ' '.join([x for x in ret if x])
Loading alternative spelling of MPIs¶
Based on OCR error files by Dirk Wintergrün.
with open(gitDataPath + 'all_labels.tsv','r', encoding='utf8') as file:
lines = [x.split('\t') for x in file.readlines()]
labels = {}
for l0 in lines[1:]:
if not 'KWI' in l0[0]:
for l1 in l0[1:]:
l1 = cleanStopWords(l1)
if l1:
labels[l1] = l0[0]
Read data from files¶
The corpus is created by appyling a regex routine on the corpus file for every year. See previous Notebook 5.
dfLoad = pd.read_csv(gitDataPath + 'mpgpubs.csv', sep='\t')
dfLoad.head(2)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|
0 | 1946 | sg:pub.10.1007/bf00643799 | Kondensationsvorgänge aus überhitztem Arsenikd... | ['de'] | sg:journal.1018189 | The Science of Nature | False | sg:person.013544521043.99;Korb_A. | https://www.grid.ac/institutes/grid.418028.7;h... |
1 | 1946 | sg:pub.10.1007/bf00624523 | Uber die Phosphorylase der Leukocyten | ['en'] | sg:journal.1018189 | The Science of Nature | False | Rohdewald_Margarete | https://www.grid.ac/institutes/grid.418441.c |
dfMPGall = dfLoad.drop('6',axis=1).rename(columns={'0':'year','1':'pubID','2':'title','3':'lang','4':'journalID','5':'journalName','7':'authors','8':'affName'})
dfMPGall.head(2)
year | pubID | title | lang | journalID | journalName | authors | affName | |
---|---|---|---|---|---|---|---|---|
0 | 1946 | sg:pub.10.1007/bf00643799 | Kondensationsvorgänge aus überhitztem Arsenikd... | ['de'] | sg:journal.1018189 | The Science of Nature | sg:person.013544521043.99;Korb_A. | https://www.grid.ac/institutes/grid.418028.7;h... |
1 | 1946 | sg:pub.10.1007/bf00624523 | Uber die Phosphorylase der Leukocyten | ['en'] | sg:journal.1018189 | The Science of Nature | Rohdewald_Margarete | https://www.grid.ac/institutes/grid.418441.c |
Resources for Sektion mapping¶
Person DB¶
One version was manually cleaned to delete double mentions, years, brackets and so on in instituts names. This data was then used to create a dictionary of mappings.
dfInstCleaned = pd.read_csv(gitDataPath + 'mpg_institute_cleanded.csv')
preprocessed = dfInstCleaned['institut_name']
The main information about the institutes of the MPG is derived from the research database of the research program “History of the Max Planck Society”, for more details see the website.
dfInstDirty = pd.read_csv(gitDataPath + 'Institute.csv')
dfInstDirtMPG = dfInstDirty[dfInstDirty.is_mpg == True].reset_index(drop=True)
dfInstDirtMPG.insert(0,'preprocessed', preprocessed)
cleanstopDirt = dfInstDirtMPG.preprocessed.apply(lambda row: cleanStopWords(row))
dfInstDirtMPG.insert(0,'cleaned',cleanstopDirt)
Two translation dictionaries : from the cleaned words to the database institut name, and the same to the sektion. At least one MPI ( Empirical Aestetics ) has no Sektion.
input2DBName = dfInstDirtMPG[['cleaned','institut_name','date_founded']].sort_values('date_founded').set_index('cleaned')['institut_name'].to_dict()
input2Sektion = dfInstDirtMPG[['cleaned','sektion','date_founded']].sort_values('date_founded').replace('','MPG').fillna('None').set_index('cleaned')['sektion'].to_dict()
Additionaly, create dictionary to translate from database name to sektion.
dbname2Sektion = dfInstDirtMPG[['institut_name','sektion','date_founded']].sort_values('date_founded').replace('','MPG').fillna('None').set_index('institut_name')['sektion'].to_dict()
Manual translations of OCR errors¶
This list is created manually, by comparing likely hits of instituts names.
dfManualTransl = pd.read_csv(gitDataPath + 'translations_inst_names.csv')
dfManualTransl.insert(0,'cleaned',dfManualTransl['0'].apply(lambda row: cleanStopWords(row)))
ocrInput2DBName = dfManualTransl[['cleaned','Unnamed: 0']].drop_duplicates().set_index('cleaned')['Unnamed: 0'].to_dict()
def findKey(inputString,debug=False,debugStr=''):
ret = ''
l1 = cleanStopWords(inputString)
if l1 in input2DBName.keys():
ret = input2DBName[l1]
elif l1 in labels.keys():
l2 = cleanStopWords(labels[l1])
if l2 in input2DBName.keys():
ret = input2DBName[l2]
else:
ret=inputString
if debug:
return f'{debugStr}: Input: {inputString} Output: {ret}'
else:
return ret
Grid data¶
Source : Global Research Identifier Database
This data is used to map between identifiers of the form grid.45667.2 to instituts name. All MPG instituts are related to the MPG (GridID grid.4372.2). This allows also to create lists of german to english translations of names.
dfGrid = pd.read_csv(dataPath + 'grid/grid-2020-06-29/grid.csv')
dfGridRelations = pd.read_csv(dataPath + 'grid/grid-2020-06-29/full_tables/relationships.csv')
dfGRIDLabels = pd.read_csv(dataPath + 'grid/grid-2020-06-29/full_tables/labels.csv')
mpgGridIDs = dfGridRelations[dfGridRelations.grid_id == 'grid.4372.2'].related_grid_id.values
name2GRID = dfGRIDLabels[dfGRIDLabels.grid_id.isin(mpgGridIDs)][['grid_id','label']].set_index('label')['grid_id'].to_dict()
grid2name = {y:x for x, y in name2GRID.items()}
If there are no german labels, use the main data.
for grid_i in mpgGridIDs:
if grid_i not in grid2name.keys():
grid2name.update({grid_i:dfGrid[dfGrid.ID == grid_i].Name.values[0]})
English translations to DB Names¶
dfGridEngl = dfGrid[dfGrid.ID.isin(mpgGridIDs)][['ID','Name']]
germ = dfGridEngl.ID.apply(lambda row: grid2name[row])
dfGridEngl.insert(2,'gerName',germ)
cleanEng = dfGridEngl.Name.apply(lambda row: cleanStopWords(row))
dfGridEngl.insert(2,'cleanEng',cleanEng)
transl = dfGridEngl.gerName.apply(lambda row: findKey(row,debug=False))
dfGridEngl.insert(3,'foundMPIs',transl)
enginput2DBName = dfGridEngl[['cleanEng','foundMPIs']].set_index('cleanEng')['foundMPIs'].to_dict()
Finding the right MPIs¶
This routine uses regular expression to find the affiliated MPI. In the SciGraph data are three main cases of affiliation data.
If the affiliation can be disambiguated, the GRID ID is used. If all affiliations can be disambiguated, you find expressions
https://grid.ac/institutes/grid.419696.5
joined by semikolon. For each grid id, we check if it is part of the MPG list.For older publications, often no disambiguation was possible, such that the affiliation string has the form
"'affil 1','affil 2',' affil 3'"
. Here, regular expressions are used to find names of MPIs. Since the exact form of the string varies a lot, we have a long list of possible spellings and also OCR errors.For mixed cases of disambiguated and raw affiliation data, we check for each element, if grid or not and then apply the relevant method.
In each case, the found MPI is converted to terms with the cleaning routine above. Then various dictionaries are aplied to find the correct database institut. Results are joined by a semikolon.
Known problems:
For some MPIs the affiliation could be captured by
([^'\[])+Max\sPlanck\sInstitute,([^;,]+)+
, i.e. the name of the institute comes before mentioning that its a MPI. These cases lead to missing all other cases, such that they have to be captures manually after the main routine.Some special collaborations, like the
MPG-CAS
partner instituts are not captured.Some miss detection exists, like
MPG Ranch
.
findMPI = """
Max-Delbrück-Laboratorium\sin\sder\sMPG|
Klinische\sArbeitsgruppe\sder\sMPG|
MPG\sArbeitsgruppe\s([^;,]+)+| #
MPG-AG\s([^;,]+)+| #
MPG-CAS\sPICB|#
CAS-MPG\sPartner\sInstitute|#
Fritz-Haber\sInstitut\sder\sMPG| #
Fritz-Haber-Institut\sof\sthe\sMPG| #
Fritz-Haber-Institute\sder\sMPG|#
Fritz-Haber-Institut\s{1,2}der\sMPG|#
Deutschen\sForschungs-anstalt\sfür\sPsychiatrie| #
Deutschen\sForschungsanstalt\sfür\sPsychiatrie| #
Deutsche\sForschungsanstalt\sfür\sPsychiatrie| #
KWI\sfür([^;,]+)+| #
MPI\sfür([^;,]+)+| #
MPI\sfor([^;,]+)+| #
MPI\szur([^;,]+)+|#
MPI\sof([^;,]+)+|#
MPI\s([^;,]+)+|#
Max-Planck-Institut\sf\.([^;,]+)+|
Max\sPlanck\sInstitut\sf\.([^;,]+)+|
Max-Planck-Institute\sf\.([^;,]+)+|
Max-Planck-Inst\sf\.([^;,]+)+|
Max\sPlanck\sInstitute\sf\.([^;,]+)+|
Max-Planck-Insütut\sfür([^;,]+)+|
Max\sPlanck\sInsütut\sfür([^;,]+)+|
Max-Planck-Institut\sfür([^;,]+)+|
Max-Planck-Instituts\sfür([^;,]+)+|
Max-Planck-Institute\sfür([^;,]+)+|
Max\sPlanck\sInstituts\sfür([^;,]+)+|
Max-Planck-Institutes\sfür([^;,]+)+|
Max\sPlanck\sInstitutes\sfür([^;,]+)+|
Max\sPlanck\sInstitut\sfür([^;,]+)+|#
Max\sPlanck\sInstitute\sfür([^;,]+)+|#
Max-Planck-Instituten\sfür([^;,]+)+|#
Max\sPlanck\sInstituten\sfür([^;,]+)+|#
Max\sPlanck\sInstitute\sfuer([^;,]+)+|#
Max\sPlanck\sInstitut\sfuer([^;,]+)+|#
Max-Planck-Institut\sfuer([^;,]+)+|#
Max-Planck-Institute\sfuer([^;,]+)+|#
Max-Planck-Institut\sf\?r([^;,]+)+|#
Max\sPlanck\sInstitut\sf\?r([^;,]+)+|#
Max-Planck-Institut\szur([^;,]+)+|#
Max\sPlanck\sInstitut\szur([^;,]+)+|#
Max\sPlanck\sInstitute\sfor([^;,]+)+| #
Max-Planck-Institute\sfor([^;,]+)+| #
Max-Planck-Institut\sfor([^;,]+)+| #
Max\sPlanck\sInstitute\sof([^;,]+)+| #
Max-Planck-Institute\sof([^;,]+)+| #
Max-Planck-Institut\sof([^;,]+)+| #
Max-Planck-Institute\s([^;,]+)+| #
Max\sPlanck\sInstitute\s([^;,]+)+| #
Max-Planck-Insitut\sfür([^;,]+)+| #
Max\sPlanck\sInsitut\sfür([^;,]+)+| #
Max-Planck-Institut\sfur([^;,]+)+| #
Max\sPlanck\sInstitut\sfur([^;,]+)+| #
Max-Planck-Inst.\sfür([^;,]+)+| #
Max\sPlanck\sInst\.\sfür([^;,]+)+| #
Max-Planck-Inst\.\sf\.([^;,]+)+| #
Max\sPlanck\sInst\.\sf\.([^;,]+)+| #
Max-Planck-Instiut\sfür([^;,]+)+| #
Max\sPlanck\sInstiut\sfür([^;,]+)+| #
Max-Planck-Insituts\sfür([^;,]+)+| #
Max\sPlanck\sInsituts\sfür([^;,]+)+| #
Max-Planck-Institut\s([^;,]+)+| #
Max\sPlanck\sInstitut\s([^;,]+)+| #
Max\sPlanck\sInsitute\sfor\s([^;,]+)+| #
Max-Planck-Institutt\sfür\s([^;,]+)+#
"""
find = re.compile(findMPI,re.X|re.MULTILINE)
def findKeyString(inputString,debug=False,debugStr=''):
ret = ''
l1 = cleanStopWords(inputString)
if l1 in input2DBName.keys():
ret = input2DBName[l1]
elif l1 in enginput2DBName.keys():
ret = enginput2DBName[l1]
elif l1 in ocrInput2DBName.keys():
ret = ocrInput2DBName[l1]
elif l1 in labels.keys():
l2 = cleanStopWords(labels[l1])
if l2 in input2DBName.keys():
ret = input2DBName[l2]
elif l2 in enginput2DBName.keys():
ret = enginput2DBName[l2]
elif l2 in ocrInput2DBName.keys():
ret = ocrInput2DBName[l2]
else:
ret=inputString
if debug:
return f'{debugStr}: Input: {inputString} Output: {ret}'
else:
return ret
def cleanInput(text, debug = False):
ret = []
parts = text.split(';')
if len(parts) == 1:
splitted = regex.sub("'","#", text.strip(']|[')).split('#, #')
for part in splitted:
exp1 = regex.sub('#|\*|\(|\)|\[|\]|\+|\{|\}|\n', '', part)
res = re.search(find, exp1)
if res:
try:
ret.append(findKeyString(res.group(0), debug=debug, debugStr='regexNoGrid'))
except:
raise
else:
for part in parts:
if part.startswith('http'):
try:
gridid = part.split("https://www.grid.ac/institutes/")[1].strip('\n')
ret.append(findKeyString(grid2name[gridid], debug=debug, debugStr='grid'))
except KeyError:
pass
except IndexError:
pass
except:
raise
else:
exp1 = regex.sub('#|\*|\(|\)|\[|\]|\+|\{|\}|\n', '', part)
res = re.search(find, exp1)
if res:
try:
ret.append(findKeyString(res.group(0), debug=debug, debugStr='regexGrid'))
except:
raise
return ';'.join(ret)
out = dfMPGall.affName.apply(lambda row: cleanInput(row))
for col in ['foundMPIs','sektion']:
if col in dfMPGall.columns:
dfMPGall = dfMPGall.drop(col,axis=1)
dfMPGall.insert(0,'foundMPIs',out)
Find corresponding Sektion¶
Using the found database institute names, we can now also find the Sektion.
error = []
possibleVals = []
def applyDict(row,dbname=dbname2Sektion,di=input2Sektion, debug=False):
res = ['None' for i,j in enumerate(row.split(';'))]
for idx, part in enumerate(row.split(';')):
if part in dbname.keys():
if debug:
res[idx] = f'dbname2sek: {dbname[part]}'
else:
res[idx] = dbname[part]
else:
cl = cleanStopWords(part)
if cl in di.keys():
if debug:
res[idx] = f'input2sek_lev1: {di[cl]}'
else:
res[idx] = di[cl]
elif cl in labels.keys():
try:
l2 = cleanStopWords(labels[cl])
if debug:
res[idx] = f'input2sek_lev2:cl:{cl} l2:{l2} res:{di[l2]}'
else:
res[idx] = di[l2]
except:
raise
else:
if debug:
resMPIs = []
for pcl in cl.split():
for x in input2DBName.keys():
if pcl in x.split():
resMPIs.append(input2DBName[x])
for x in enginput2DBName.keys():
if pcl in x.split():
resMPIs.append(enginput2DBName[x])
for x in ocrInput2DBName.keys():
if pcl in x.split():
resMPIs.append(ocrInput2DBName[x])
possibleVals.append((part,cl,resMPIs))
error.append((cl))
try:
return ';'.join(res)
except:
print(res)
sektion = dfMPGall.foundMPIs.apply(lambda row: applyDict(row, debug=False))
dfMPGall.insert(1,'sektion',sektion)
How many author contributions can be found for each Sektion.
sektion.apply(lambda row: row.split(';')).explode().value_counts()
CPTS 71771
BMS 51065
None 8292
GSHS 5715
MPG 10
Name: foundMPIs, dtype: int64
dfMPGall.shape
(53446, 10)
Distribution of author numbers in the Sektionen¶
authBMS = dfMPGall[dfMPGall.sektion.str.contains('BMS')].authors.fillna('').apply(lambda row: len(row.split(';'))).to_frame().reset_index(drop=True).value_counts().to_frame().rename(columns={0:'BMS'})
authCPTS = dfMPGall[dfMPGall.sektion.str.contains('CPTS')].authors.fillna('').apply(lambda row: len(row.split(';'))).to_frame().reset_index(drop=True).value_counts().to_frame().rename(columns={0:'CPTS'})
authGSHS = dfMPGall[dfMPGall.sektion.str.contains('GSHS')].authors.fillna('').apply(lambda row: len(row.split(';'))).to_frame().reset_index(drop=True).value_counts().to_frame().rename(columns={0:'GSHS'})
dfCoauthorenSektionen = authBMS.merge(authCPTS,left_index=True, right_index=True,how='outer').merge(authGSHS,left_index=True, right_index=True,how='outer')
dfCoauthorenSektionen
BMS | CPTS | GSHS | |
---|---|---|---|
authors | |||
1 | 1895.0 | 523.0 | 86.0 |
2 | 4730.0 | 4660.0 | 697.0 |
3 | 3777.0 | 4101.0 | 558.0 |
4 | 2685.0 | 2906.0 | 378.0 |
5 | 1947.0 | 1874.0 | 265.0 |
... | ... | ... | ... |
3173 | NaN | 1.0 | NaN |
3180 | NaN | 1.0 | NaN |
3195 | NaN | 1.0 | NaN |
5100 | NaN | 2.0 | NaN |
5114 | NaN | 2.0 | NaN |
538 rows × 3 columns
bmsNorm = dfCoauthorenSektionen['BMS']/dfCoauthorenSektionen.BMS.sum()
cptsNorm = dfCoauthorenSektionen['CPTS']/dfCoauthorenSektionen.CPTS.sum()
gshsNorm = dfCoauthorenSektionen['GSHS']/dfCoauthorenSektionen.GSHS.sum()
pd.DataFrame([dfCoauthorenSektionen['CPTS'],dfCoauthorenSektionen['BMS'],dfCoauthorenSektionen['GSHS']]).transpose()[:20].plot.area(stacked=False)
/home/arbeit/Dokumente/gwdgGitlab/GMPG/gmpg-notebooks/env/lib/python3.8/site-packages/pandas/plotting/_matplotlib/core.py:1235: UserWarning: FixedFormatter should only be used together with FixedLocator
ax.set_xticklabels(xticklabels)
<AxesSubplot:xlabel='authors'>
![../_images/6_Focus_on_MPG_publications_74_2.png](../_images/6_Focus_on_MPG_publications_74_2.png)
pd.DataFrame([cptsNorm,bmsNorm,gshsNorm]).transpose()[:20].plot.area(stacked=False)
/home/arbeit/Dokumente/gwdgGitlab/GMPG/gmpg-notebooks/env/lib/python3.8/site-packages/pandas/plotting/_matplotlib/core.py:1235: UserWarning: FixedFormatter should only be used together with FixedLocator
ax.set_xticklabels(xticklabels)
<AxesSubplot:xlabel='authors'>
![../_images/6_Focus_on_MPG_publications_75_2.png](../_images/6_Focus_on_MPG_publications_75_2.png)
Publications without Sektion data¶
For some publications, the contribution of an MPI is clear, but finding the right institute requieres expert knowledge. This work can be done in the cleaning app, developed by SHKs.
dfNotFound = dfMPGall[dfMPGall.sektion == 'None']
for val in dfNotFound.foundMPIs.unique():
print(val)
Max-Planck-Institut München
Max-Planck-Institut in München
Max Planck Institut in Berlin
MPI Garching
Max-Planck-Institut HML
Max Planck Institut Göttingen
Deutsches Klimarechenzentrum
Forschungszentrum caesar
MPI für Verhaltensbiologie
MPI Bonn
MPI Mines Ltd.
Max Planck Institute Jena
Gesellschaft für wissenschaftliche Datenverarbeitung mbH Göttingen
MPI Research Inc.
MPI - CNRS UMR5146 - Centre SMS
Max Planck Digital Library
Max-Planck-Institut für Herz- und Lungenforschung
Center for Free-Electron Laser Science
MPI for Bioinformatics
MPI Inc.
Beilstein-Institut zur Förderung der Chemischen Wissenschaften
CAS-MPG Partner Institute
MPI Academy
Max Planck Graduate Center
Max Planck Florida Institute for Neuroscience
Ernst Strüngmann Institut
Max-Planck-Institut für Immunobiologie und Epigenetik
Max Planck Innovation
MPI für empirische Ästhetik
Max Planck Institute Luxembourg
MPI Leipzig
MPI Saarbrücken
Max Planck Institute Luxemburg for International, European and Regulatory Procedural Law
MPI ANR-11-LABX-0007-01
MPI Tübingen
Export to file¶
To create a new exported version, uncomment the following line, and change the file extension to the current date.
dfMPGall.head(2)
foundMPIs | sektion | year | pubID | title | lang | journalID | journalName | authors | affName | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Fritz-Haber-Institut der MPG;Fritz-Haber-Insti... | CPTS;CPTS | 1946 | sg:pub.10.1007/bf00643799 | Kondensationsvorgänge aus überhitztem Arsenikd... | ['de'] | sg:journal.1018189 | The Science of Nature | sg:person.013544521043.99;Korb_A. | https://www.grid.ac/institutes/grid.418028.7;h... |
1 | None | 1946 | sg:pub.10.1007/bf00624523 | Uber die Phosphorylase der Leukocyten | ['en'] | sg:journal.1018189 | The Science of Nature | Rohdewald_Margarete | https://www.grid.ac/institutes/grid.418441.c |
#dfMPGall.to_csv(gitDataPath + 'MPGPubSektionen_11112020.tsv',sep='\t',index=False)
Some first interpretations¶
Intra-Sektional collaborations¶
Publications with collaborations of authors from more then once Sektion are relatively rare until the early 2000thst.
dfMPGall[dfMPGall.sektion.apply(lambda row: len(set(row.split(';')))>1)].groupby('year').size().plot()
<AxesSubplot:xlabel='year'>
![../_images/6_Focus_on_MPG_publications_84_1.png](../_images/6_Focus_on_MPG_publications_84_1.png)
intraCollab = dfMPGall[dfMPGall.sektion.apply(lambda row: len(set([x for x in row.split(';') if x != 'None']))>1)]
intraCollab.foundMPIs.apply(lambda row: row.split(';')).explode().value_counts().to_frame()
foundMPIs | |
---|---|
MPI für biophysikalische Chemie (Karl-Friedrich-Bonhoeffer-Institut) (seit 1971) | 259 |
MPI für Psychiatrie (bis 1966 Deutsche Forschungsanstalt für Psychiatrie (MPI)) | 195 |
MPI für experimentelle Medizin (1948-1965 Medizinische Forschungsanstalt der MPG) | 149 |
MPI für medizinische Forschung | 123 |
Center for Free-Electron Laser Science | 122 |
... | ... |
MPI für Struktur und Dynamik der Materie | 1 |
MPI für Menschheitsgeschichte (ab 2014) | 1 |
MPI für die Physik des Lichts | 1 |
MPI für bioanorganische Chemie (2003-2012) | 1 |
MPI für Verhaltensphysiologie (1954-2003) | 1 |
63 rows × 1 columns
There is only one publication from all three Sektionen, published 2018.
dfMPGall[dfMPGall.sektion.apply(lambda row: len(set([x for x in row.split(';') if x != 'None']))>2)]
foundMPIs | sektion | year | pubID | title | lang | journalID | journalName | authors | affName | |
---|---|---|---|---|---|---|---|---|---|---|
49894 | MPI für biologische Kybernetik;MPI für biologi... | BMS;BMS;CPTS;BMS;GSHS;GSHS;BMS;BMS | 2018 | sg:pub.10.1038/s41467-018-06304-z | LISA improves statistical analysis for fMRI | ['en'] | sg:journal.1043282 | Nature Communications | sg:person.01110373331.54;sg:person.01323631320... | https://www.grid.ac/institutes/grid.419501.8;h... |
Output per Sektion¶
dfMPGall.columns
Index(['foundMPIs', 'sektion', 'year', 'pubID', 'title', 'lang', 'journalID',
'journalName', 'authors', 'affName'],
dtype='object')
sekSp = dfMPGall.sektion.apply(lambda row: row.split(',')[0].split(';'))
dfMPGall.insert(0,'sekSplit', sekSp)
dfMPGall.sekSplit.iloc[3]
['CPTS', 'CPTS']
result = []
for year, g0 in dfMPGall.groupby('year'):
res = {'year':year}
for sektion, g1 in g0.explode('sekSplit').groupby('sekSplit'):
res[sektion] = g1.shape[0]
result.append(res)
#result
dfSektionOutput = pd.DataFrame(result)[:-15]
dfSektionOutput = dfSektionOutput.set_index('year')
dfSektionOutput = dfSektionOutput.rename(columns={'None':'Keine Sektion'})
dfSektionOutput.columns = sorted(dfSektionOutput.columns)
dfSektionOutput.index.rename('Jahr',inplace=True)
colorSektions = ['r','g','b','k']
with PdfPages('../results/Fig8_MPG_Sektionen.pdf') as pdffigure:
fig, ax1 = plt.subplots(figsize=(10,6))
color = 'k'
ax1.set_xlabel('Jahr',fontsize = 14)
ax1.set_ylabel('Publikationen pro Sektion im SciGraph Corpus', color=color,fontsize = 14)
plt1 = dfSektionOutput[['BMS','CPTS','GSHS','Keine Sektion']].plot.area(stacked=False, ax=ax1, color=colorSektions)
ax1.tick_params(axis='y', labelcolor=color, labelsize = 14)
ax1.tick_params(axis='x', labelcolor=color, labelsize = 14)
ax1.legend(fontsize = 14,loc=2)
ax1.annotate(
"Quelle: SciGraph Dataset Articles, Stand: 02/2019", (
1.02,0.7),(0, 0), xycoords = "axes fraction", textcoords = "offset points", va = "top", style = "italic", fontsize = 10, rotation = 90)
fig.tight_layout()
# Save as PDF
#pdffigure.savefig(bbox_inches="tight", dpi=400)
# Save as PNG
#plt.savefig('../../../results/SciGraph/SciGraph_MPG_Publikationen_Sektionen.png')
plt.show()
plt.close()
![../_images/6_Focus_on_MPG_publications_102_0.png](../_images/6_Focus_on_MPG_publications_102_0.png)