Focus on MPG publications¶

Imports¶

import pandas as pd
import os
import regex
import re
import string
import difflib

import numpy as np
from tqdm import tqdm
from functools import partial
from multiprocessing import  Pool, cpu_count

from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt

Setup paths¶

gitDataPath = '../data/processedData/'

dataPath = '/media/arbeit/b88b17b8-b5d7-4c4e-86bc-ff4b67458842/sciGraph/'

General stopword routine to re-use at every step¶

rePunct = "\(|\)|\.|\d+|,|-"
rePunct

'\\(|\\)|\\.|\\d+|,|-'

stopwords = [
    'Forschungsstelle',
    'Förderung',
    'förderung',
    'Wissenschaften',
    'Kaiser-Wilhelm-Institut',
    'Max-Planck-gesellschaft',
    'Max-Planck-Institut',
    'maxplanckgesellschaft',
    'maxplanckinstitut',
    'maxplanckinstituts',
    'maxplanckinstitute',
    'maxplanckinsitut',
    'maxplanckinst',
    'maxplanckinstitutes',
    'maxplanckinstituten',
    'maxplanckinsituts',
    'maxplanckinstitutt',
    'maxplanckinstiut',
    'maxplanckinsütut',
    'planckinstitut',
    'insitute',
    'insitut',
    'maxplanckarbeitsgruppe',
    'maxplanckforschungsstelle',
    'Max Planck Institute',
    'Max-Planck-Institute',
    'Max-Planck-Institutes',
    'Max-Planck-Instituts',
    'Max',
    'Planck',
    'MPI',
    'MPG',
    'Instituts',
    'Institutes',    
    'Institute',
    'Institut',
    'Planck-Institut',
    'zur',
    'ab',
    'die',
    'und',
    'der',
    'of',
    'für',
    'fuer',
    'fur',
    'f¨r',
    'fr',
    'für',
    'for',
    '\n',
    'des',
    'the',
    'GmbH',
    'in',
    'von'
]

lowerStopwords = [x.lower() for x in stopwords]

def cleanStopWords(instring):
    ret = []
    braket = [x.split(')(') for x in instring.split(' ')]
    for x in [x for y in braket for x in y]:
        x = re.sub(rePunct,'',x)
        x = x.strip('\n').lower()
        if x not in lowerStopwords:
            if x.strip() not in ret:
                ret.append(x.strip())
    return ' '.join([x for x in ret if x])

Loading alternative spelling of MPIs¶

Based on OCR error files by Dirk Wintergrün.

with open(gitDataPath + 'all_labels.tsv','r', encoding='utf8') as file:
    lines = [x.split('\t') for x in file.readlines()]

labels = {}
for l0 in lines[1:]:
    if not 'KWI' in l0[0]:
        for l1 in l0[1:]:
            l1 = cleanStopWords(l1)
            if l1:
                labels[l1] = l0[0]

Read data from files¶

The corpus is created by appyling a regex routine on the corpus file for every year. See previous Notebook 5.

dfLoad = pd.read_csv(gitDataPath + 'mpgpubs.csv', sep='\t')

dfLoad.head(2)

	0	1	2	3	4	5	6	7	8
0	1946	sg:pub.10.1007/bf00643799	Kondensationsvorgänge aus überhitztem Arsenikd...	['de']	sg:journal.1018189	The Science of Nature	False	sg:person.013544521043.99;Korb_A.	https://www.grid.ac/institutes/grid.418028.7;h...
1	1946	sg:pub.10.1007/bf00624523	Uber die Phosphorylase der Leukocyten	['en']	sg:journal.1018189	The Science of Nature	False	Rohdewald_Margarete	https://www.grid.ac/institutes/grid.418441.c

dfMPGall = dfLoad.drop('6',axis=1).rename(columns={'0':'year','1':'pubID','2':'title','3':'lang','4':'journalID','5':'journalName','7':'authors','8':'affName'})

dfMPGall.head(2)

	year	pubID	title	lang	journalID	journalName	authors	affName
0	1946	sg:pub.10.1007/bf00643799	Kondensationsvorgänge aus überhitztem Arsenikd...	['de']	sg:journal.1018189	The Science of Nature	sg:person.013544521043.99;Korb_A.	https://www.grid.ac/institutes/grid.418028.7;h...
1	1946	sg:pub.10.1007/bf00624523	Uber die Phosphorylase der Leukocyten	['en']	sg:journal.1018189	The Science of Nature	Rohdewald_Margarete	https://www.grid.ac/institutes/grid.418441.c

Resources for Sektion mapping¶

Person DB¶

One version was manually cleaned to delete double mentions, years, brackets and so on in instituts names. This data was then used to create a dictionary of mappings.

dfInstCleaned = pd.read_csv(gitDataPath + 'mpg_institute_cleanded.csv')

preprocessed = dfInstCleaned['institut_name']

The main information about the institutes of the MPG is derived from the research database of the research program “History of the Max Planck Society”, for more details see the website.

dfInstDirty = pd.read_csv(gitDataPath + 'Institute.csv')

dfInstDirtMPG = dfInstDirty[dfInstDirty.is_mpg == True].reset_index(drop=True)

dfInstDirtMPG.insert(0,'preprocessed', preprocessed)

cleanstopDirt = dfInstDirtMPG.preprocessed.apply(lambda row: cleanStopWords(row))

dfInstDirtMPG.insert(0,'cleaned',cleanstopDirt)

Two translation dictionaries : from the cleaned words to the database institut name, and the same to the sektion. At least one MPI ( Empirical Aestetics ) has no Sektion.

input2DBName = dfInstDirtMPG[['cleaned','institut_name','date_founded']].sort_values('date_founded').set_index('cleaned')['institut_name'].to_dict()

input2Sektion = dfInstDirtMPG[['cleaned','sektion','date_founded']].sort_values('date_founded').replace('','MPG').fillna('None').set_index('cleaned')['sektion'].to_dict()

Additionaly, create dictionary to translate from database name to sektion.

dbname2Sektion = dfInstDirtMPG[['institut_name','sektion','date_founded']].sort_values('date_founded').replace('','MPG').fillna('None').set_index('institut_name')['sektion'].to_dict()

Manual translations of OCR errors¶

This list is created manually, by comparing likely hits of instituts names.

dfManualTransl = pd.read_csv(gitDataPath + 'translations_inst_names.csv')

dfManualTransl.insert(0,'cleaned',dfManualTransl['0'].apply(lambda row: cleanStopWords(row)))

ocrInput2DBName = dfManualTransl[['cleaned','Unnamed: 0']].drop_duplicates().set_index('cleaned')['Unnamed: 0'].to_dict()

def findKey(inputString,debug=False,debugStr=''):
    ret = ''
    l1 = cleanStopWords(inputString)
    
    if l1 in input2DBName.keys():
        ret = input2DBName[l1]
    elif l1 in labels.keys():
        l2 = cleanStopWords(labels[l1])
        if l2 in input2DBName.keys():
            ret = input2DBName[l2]
    else:
        ret=inputString
    if debug:
        return f'{debugStr}: Input: {inputString} Output: {ret}' 
    else:
        return ret

Grid data¶

Source : Global Research Identifier Database

This data is used to map between identifiers of the form grid.45667.2 to instituts name. All MPG instituts are related to the MPG (GridID grid.4372.2). This allows also to create lists of german to english translations of names.

dfGrid = pd.read_csv(dataPath + 'grid/grid-2020-06-29/grid.csv')

dfGridRelations = pd.read_csv(dataPath + 'grid/grid-2020-06-29/full_tables/relationships.csv')

dfGRIDLabels = pd.read_csv(dataPath + 'grid/grid-2020-06-29/full_tables/labels.csv')

mpgGridIDs = dfGridRelations[dfGridRelations.grid_id == 'grid.4372.2'].related_grid_id.values

name2GRID = dfGRIDLabels[dfGRIDLabels.grid_id.isin(mpgGridIDs)][['grid_id','label']].set_index('label')['grid_id'].to_dict()

grid2name = {y:x for x, y in name2GRID.items()}

If there are no german labels, use the main data.

for grid_i in mpgGridIDs:
    if grid_i not in grid2name.keys():
        grid2name.update({grid_i:dfGrid[dfGrid.ID == grid_i].Name.values[0]})

English translations to DB Names¶

dfGridEngl = dfGrid[dfGrid.ID.isin(mpgGridIDs)][['ID','Name']]

germ = dfGridEngl.ID.apply(lambda row: grid2name[row])

dfGridEngl.insert(2,'gerName',germ)

cleanEng = dfGridEngl.Name.apply(lambda row: cleanStopWords(row))

dfGridEngl.insert(2,'cleanEng',cleanEng)

transl = dfGridEngl.gerName.apply(lambda row: findKey(row,debug=False))

dfGridEngl.insert(3,'foundMPIs',transl)

enginput2DBName =  dfGridEngl[['cleanEng','foundMPIs']].set_index('cleanEng')['foundMPIs'].to_dict()

Finding the right MPIs¶

This routine uses regular expression to find the affiliated MPI. In the SciGraph data are three main cases of affiliation data.

If the affiliation can be disambiguated, the GRID ID is used. If all affiliations can be disambiguated, you find expressions https://grid.ac/institutes/grid.419696.5 joined by semikolon. For each grid id, we check if it is part of the MPG list.
For older publications, often no disambiguation was possible, such that the affiliation string has the form "'affil 1','affil 2',' affil 3'". Here, regular expressions are used to find names of MPIs. Since the exact form of the string varies a lot, we have a long list of possible spellings and also OCR errors.
For mixed cases of disambiguated and raw affiliation data, we check for each element, if grid or not and then apply the relevant method.

In each case, the found MPI is converted to terms with the cleaning routine above. Then various dictionaries are aplied to find the correct database institut. Results are joined by a semikolon.

Known problems:

For some MPIs the affiliation could be captured by ([^'\[])+Max\sPlanck\sInstitute,([^;,]+)+, i.e. the name of the institute comes before mentioning that its a MPI. These cases lead to missing all other cases, such that they have to be captures manually after the main routine.
Some special collaborations, like the MPG-CAS partner instituts are not captured.
Some miss detection exists, like MPG Ranch.

findMPI = """
    Max-Delbrück-Laboratorium\sin\sder\sMPG|
    Klinische\sArbeitsgruppe\sder\sMPG|
    MPG\sArbeitsgruppe\s([^;,]+)+| #
    MPG-AG\s([^;,]+)+| # 
    MPG-CAS\sPICB|#
    CAS-MPG\sPartner\sInstitute|#
    Fritz-Haber\sInstitut\sder\sMPG| #
    Fritz-Haber-Institut\sof\sthe\sMPG| #
    Fritz-Haber-Institute\sder\sMPG|#
    Fritz-Haber-Institut\s{1,2}der\sMPG|#
    Deutschen\sForschungs-anstalt\sfür\sPsychiatrie| #
    Deutschen\sForschungsanstalt\sfür\sPsychiatrie| #
    Deutsche\sForschungsanstalt\sfür\sPsychiatrie| #
    KWI\sfür([^;,]+)+| #
    MPI\sfür([^;,]+)+| #
    MPI\sfor([^;,]+)+| #
    MPI\szur([^;,]+)+|#
    MPI\sof([^;,]+)+|#
    MPI\s([^;,]+)+|#
    Max-Planck-Institut\sf\.([^;,]+)+| 
    Max\sPlanck\sInstitut\sf\.([^;,]+)+| 
    Max-Planck-Institute\sf\.([^;,]+)+|
    Max-Planck-Inst\sf\.([^;,]+)+| 
    Max\sPlanck\sInstitute\sf\.([^;,]+)+|
    Max-Planck-Insütut\sfür([^;,]+)+|
    Max\sPlanck\sInsütut\sfür([^;,]+)+|
    Max-Planck-Institut\sfür([^;,]+)+|
    Max-Planck-Instituts\sfür([^;,]+)+|
    Max-Planck-Institute\sfür([^;,]+)+|
    Max\sPlanck\sInstituts\sfür([^;,]+)+|
    Max-Planck-Institutes\sfür([^;,]+)+|
    Max\sPlanck\sInstitutes\sfür([^;,]+)+|
    Max\sPlanck\sInstitut\sfür([^;,]+)+|#
    Max\sPlanck\sInstitute\sfür([^;,]+)+|#
    Max-Planck-Instituten\sfür([^;,]+)+|#
    Max\sPlanck\sInstituten\sfür([^;,]+)+|#
    Max\sPlanck\sInstitute\sfuer([^;,]+)+|#
    Max\sPlanck\sInstitut\sfuer([^;,]+)+|#
    Max-Planck-Institut\sfuer([^;,]+)+|#
    Max-Planck-Institute\sfuer([^;,]+)+|#
    Max-Planck-Institut\sf\?r([^;,]+)+|#
    Max\sPlanck\sInstitut\sf\?r([^;,]+)+|#
    Max-Planck-Institut\szur([^;,]+)+|#
    Max\sPlanck\sInstitut\szur([^;,]+)+|#
    Max\sPlanck\sInstitute\sfor([^;,]+)+| #
    Max-Planck-Institute\sfor([^;,]+)+| #
    Max-Planck-Institut\sfor([^;,]+)+| #
    Max\sPlanck\sInstitute\sof([^;,]+)+| #
    Max-Planck-Institute\sof([^;,]+)+| #
    Max-Planck-Institut\sof([^;,]+)+| #
    Max-Planck-Institute\s([^;,]+)+| #
    Max\sPlanck\sInstitute\s([^;,]+)+| #
    Max-Planck-Insitut\sfür([^;,]+)+| #
    Max\sPlanck\sInsitut\sfür([^;,]+)+| #
    Max-Planck-Institut\sfur([^;,]+)+| #
    Max\sPlanck\sInstitut\sfur([^;,]+)+| #
    Max-Planck-Inst.\sfür([^;,]+)+| # 
    Max\sPlanck\sInst\.\sfür([^;,]+)+| # 
    Max-Planck-Inst\.\sf\.([^;,]+)+| # 
    Max\sPlanck\sInst\.\sf\.([^;,]+)+| # 
    Max-Planck-Instiut\sfür([^;,]+)+| #  
    Max\sPlanck\sInstiut\sfür([^;,]+)+| #
    Max-Planck-Insituts\sfür([^;,]+)+| #
    Max\sPlanck\sInsituts\sfür([^;,]+)+| #
    Max-Planck-Institut\s([^;,]+)+| #
    Max\sPlanck\sInstitut\s([^;,]+)+| #
    Max\sPlanck\sInsitute\sfor\s([^;,]+)+| #
    Max-Planck-Institutt\sfür\s([^;,]+)+#
"""

    
find = re.compile(findMPI,re.X|re.MULTILINE)

def findKeyString(inputString,debug=False,debugStr=''):
    ret = ''
    l1 = cleanStopWords(inputString)
    
    if l1 in input2DBName.keys():
        ret = input2DBName[l1]
    elif l1 in enginput2DBName.keys():
        ret = enginput2DBName[l1]
    elif l1 in ocrInput2DBName.keys():
        ret = ocrInput2DBName[l1]
    elif l1 in labels.keys():
        l2 = cleanStopWords(labels[l1])
        if l2 in input2DBName.keys():
            ret = input2DBName[l2]
        elif l2 in enginput2DBName.keys():
            ret = enginput2DBName[l2]
        elif l2 in ocrInput2DBName.keys():
            ret = ocrInput2DBName[l2]
    else:
        ret=inputString
    if debug:
        return f'{debugStr}: Input: {inputString} Output: {ret}' 
    else:
        return ret

def cleanInput(text, debug = False):
    ret = []
    parts = text.split(';')
    if len(parts) == 1:
        splitted = regex.sub("'","#", text.strip(']|[')).split('#, #')
        for part in splitted:
            exp1 = regex.sub('#|\*|\(|\)|\[|\]|\+|\{|\}|\n', '', part)
            res = re.search(find, exp1)
            if res:
                try:
                    ret.append(findKeyString(res.group(0), debug=debug, debugStr='regexNoGrid'))
                except:
                    raise
    else:
        for part in parts:
            if part.startswith('http'):
                try:
                    gridid = part.split("https://www.grid.ac/institutes/")[1].strip('\n')
                    ret.append(findKeyString(grid2name[gridid], debug=debug, debugStr='grid'))
                except KeyError:
                    pass
                except IndexError:
                    pass
                except:
                    raise
            else:
                exp1 = regex.sub('#|\*|\(|\)|\[|\]|\+|\{|\}|\n', '', part)
                res = re.search(find, exp1)
                if res:
                    try:
                        ret.append(findKeyString(res.group(0), debug=debug, debugStr='regexGrid'))
                    except:
                        raise
    return ';'.join(ret)

out = dfMPGall.affName.apply(lambda row: cleanInput(row))

for col in ['foundMPIs','sektion']:
    if col in dfMPGall.columns:
        dfMPGall = dfMPGall.drop(col,axis=1)

dfMPGall.insert(0,'foundMPIs',out)

Find corresponding Sektion¶

Using the found database institute names, we can now also find the Sektion.

error = []
possibleVals = []
def applyDict(row,dbname=dbname2Sektion,di=input2Sektion, debug=False):
    res = ['None' for i,j in enumerate(row.split(';'))]
    for idx, part in enumerate(row.split(';')):
        if part in dbname.keys():
            if debug:
                res[idx] = f'dbname2sek: {dbname[part]}'
            else:
                res[idx] = dbname[part]
        else:
            cl = cleanStopWords(part)
            if cl in di.keys():
                if debug:
                    res[idx] = f'input2sek_lev1: {di[cl]}'
                else:
                    res[idx] = di[cl]
            elif cl in labels.keys():
                try:
                    l2 = cleanStopWords(labels[cl])
                    if debug:
                        res[idx] = f'input2sek_lev2:cl:{cl} l2:{l2} res:{di[l2]}'
                    else:
                        res[idx] = di[l2]
                except:
                    raise
            else:
                if debug:
                    resMPIs = []
                    for pcl in cl.split():
                        for x in input2DBName.keys():
                            if pcl in x.split():
                                resMPIs.append(input2DBName[x])
                        for x in enginput2DBName.keys():
                            if pcl in x.split():
                                resMPIs.append(enginput2DBName[x])
                        for x in ocrInput2DBName.keys():
                            if pcl in x.split():
                                resMPIs.append(ocrInput2DBName[x])
                    possibleVals.append((part,cl,resMPIs))
                error.append((cl))
    try:
        return ';'.join(res)
    except:
        print(res)

sektion = dfMPGall.foundMPIs.apply(lambda row: applyDict(row, debug=False))

dfMPGall.insert(1,'sektion',sektion)

How many author contributions can be found for each Sektion.

sektion.apply(lambda row: row.split(';')).explode().value_counts()

CPTS    71771
BMS     51065
None     8292
GSHS     5715
MPG        10
Name: foundMPIs, dtype: int64

dfMPGall.shape

(53446, 10)

Distribution of author numbers in the Sektionen¶

authBMS = dfMPGall[dfMPGall.sektion.str.contains('BMS')].authors.fillna('').apply(lambda row: len(row.split(';'))).to_frame().reset_index(drop=True).value_counts().to_frame().rename(columns={0:'BMS'})
authCPTS = dfMPGall[dfMPGall.sektion.str.contains('CPTS')].authors.fillna('').apply(lambda row: len(row.split(';'))).to_frame().reset_index(drop=True).value_counts().to_frame().rename(columns={0:'CPTS'})
authGSHS = dfMPGall[dfMPGall.sektion.str.contains('GSHS')].authors.fillna('').apply(lambda row: len(row.split(';'))).to_frame().reset_index(drop=True).value_counts().to_frame().rename(columns={0:'GSHS'})

dfCoauthorenSektionen = authBMS.merge(authCPTS,left_index=True, right_index=True,how='outer').merge(authGSHS,left_index=True, right_index=True,how='outer')

dfCoauthorenSektionen

	BMS	CPTS	GSHS
authors
1	1895.0	523.0	86.0
2	4730.0	4660.0	697.0
3	3777.0	4101.0	558.0
4	2685.0	2906.0	378.0
5	1947.0	1874.0	265.0
...	...	...	...
3173	NaN	1.0	NaN
3180	NaN	1.0	NaN
3195	NaN	1.0	NaN
5100	NaN	2.0	NaN
5114	NaN	2.0	NaN

538 rows × 3 columns

bmsNorm = dfCoauthorenSektionen['BMS']/dfCoauthorenSektionen.BMS.sum()
cptsNorm = dfCoauthorenSektionen['CPTS']/dfCoauthorenSektionen.CPTS.sum()
gshsNorm = dfCoauthorenSektionen['GSHS']/dfCoauthorenSektionen.GSHS.sum()

pd.DataFrame([dfCoauthorenSektionen['CPTS'],dfCoauthorenSektionen['BMS'],dfCoauthorenSektionen['GSHS']]).transpose()[:20].plot.area(stacked=False)

/home/arbeit/Dokumente/gwdgGitlab/GMPG/gmpg-notebooks/env/lib/python3.8/site-packages/pandas/plotting/_matplotlib/core.py:1235: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(xticklabels)

<AxesSubplot:xlabel='authors'>

../_images/6_Focus_on_MPG_publications_74_2.png

pd.DataFrame([cptsNorm,bmsNorm,gshsNorm]).transpose()[:20].plot.area(stacked=False)

/home/arbeit/Dokumente/gwdgGitlab/GMPG/gmpg-notebooks/env/lib/python3.8/site-packages/pandas/plotting/_matplotlib/core.py:1235: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(xticklabels)

<AxesSubplot:xlabel='authors'>

../_images/6_Focus_on_MPG_publications_75_2.png

Publications without Sektion data¶

For some publications, the contribution of an MPI is clear, but finding the right institute requieres expert knowledge. This work can be done in the cleaning app, developed by SHKs.

dfNotFound = dfMPGall[dfMPGall.sektion == 'None']

for val in dfNotFound.foundMPIs.unique():
    print(val)

Max-Planck-Institut München
Max-Planck-Institut in München
Max Planck Institut in Berlin
MPI Garching
Max-Planck-Institut HML
Max Planck Institut Göttingen
Deutsches Klimarechenzentrum
Forschungszentrum caesar
MPI für Verhaltensbiologie
MPI Bonn
MPI Mines Ltd.
Max Planck Institute Jena
Gesellschaft für wissenschaftliche Datenverarbeitung mbH Göttingen
MPI Research Inc.
MPI - CNRS UMR5146 - Centre SMS
Max Planck Digital Library
Max-Planck-Institut für Herz- und Lungenforschung
Center for Free-Electron Laser Science
MPI for Bioinformatics
MPI Inc.
Beilstein-Institut zur Förderung der Chemischen Wissenschaften
CAS-MPG Partner Institute
MPI Academy
Max Planck Graduate Center
Max Planck Florida Institute for Neuroscience
Ernst Strüngmann Institut
Max-Planck-Institut für Immunobiologie und Epigenetik
Max Planck Innovation
MPI für empirische Ästhetik
Max Planck Institute Luxembourg
MPI Leipzig
MPI Saarbrücken
Max Planck Institute Luxemburg for International, European and Regulatory Procedural Law
MPI ANR-11-LABX-0007-01
MPI Tübingen

Export to file¶

To create a new exported version, uncomment the following line, and change the file extension to the current date.

dfMPGall.head(2)

	foundMPIs	sektion	year	pubID	title	lang	journalID	journalName	authors	affName
0	Fritz-Haber-Institut der MPG;Fritz-Haber-Insti...	CPTS;CPTS	1946	sg:pub.10.1007/bf00643799	Kondensationsvorgänge aus überhitztem Arsenikd...	['de']	sg:journal.1018189	The Science of Nature	sg:person.013544521043.99;Korb_A.	https://www.grid.ac/institutes/grid.418028.7;h...
1		None	1946	sg:pub.10.1007/bf00624523	Uber die Phosphorylase der Leukocyten	['en']	sg:journal.1018189	The Science of Nature	Rohdewald_Margarete	https://www.grid.ac/institutes/grid.418441.c

#dfMPGall.to_csv(gitDataPath + 'MPGPubSektionen_11112020.tsv',sep='\t',index=False)

Some first interpretations¶

Intra-Sektional collaborations¶

Publications with collaborations of authors from more then once Sektion are relatively rare until the early 2000thst.

dfMPGall[dfMPGall.sektion.apply(lambda row: len(set(row.split(';')))>1)].groupby('year').size().plot()

<AxesSubplot:xlabel='year'>

../_images/6_Focus_on_MPG_publications_84_1.png

intraCollab = dfMPGall[dfMPGall.sektion.apply(lambda row: len(set([x for x in row.split(';') if x != 'None']))>1)]

intraCollab.foundMPIs.apply(lambda row: row.split(';')).explode().value_counts().to_frame()

	foundMPIs
MPI für biophysikalische Chemie (Karl-Friedrich-Bonhoeffer-Institut) (seit 1971)	259
MPI für Psychiatrie (bis 1966 Deutsche Forschungsanstalt für Psychiatrie (MPI))	195
MPI für experimentelle Medizin (1948-1965 Medizinische Forschungsanstalt der MPG)	149
MPI für medizinische Forschung	123
Center for Free-Electron Laser Science	122
...	...
MPI für Struktur und Dynamik der Materie	1
MPI für Menschheitsgeschichte (ab 2014)	1
MPI für die Physik des Lichts	1
MPI für bioanorganische Chemie (2003-2012)	1
MPI für Verhaltensphysiologie (1954-2003)	1

63 rows × 1 columns

There is only one publication from all three Sektionen, published 2018.

dfMPGall[dfMPGall.sektion.apply(lambda row: len(set([x for x in row.split(';') if x != 'None']))>2)]

	foundMPIs	sektion	year	pubID	title	lang	journalID	journalName	authors	affName
49894	MPI für biologische Kybernetik;MPI für biologi...	BMS;BMS;CPTS;BMS;GSHS;GSHS;BMS;BMS	2018	sg:pub.10.1038/s41467-018-06304-z	LISA improves statistical analysis for fMRI	['en']	sg:journal.1043282	Nature Communications	sg:person.01110373331.54;sg:person.01323631320...	https://www.grid.ac/institutes/grid.419501.8;h...

Output per Sektion¶

dfMPGall.columns

Index(['foundMPIs', 'sektion', 'year', 'pubID', 'title', 'lang', 'journalID',
       'journalName', 'authors', 'affName'],
      dtype='object')

sekSp = dfMPGall.sektion.apply(lambda row: row.split(',')[0].split(';'))

dfMPGall.insert(0,'sekSplit', sekSp)

dfMPGall.sekSplit.iloc[3]

['CPTS', 'CPTS']

result = []
for year, g0 in dfMPGall.groupby('year'):
    res = {'year':year}
    for sektion, g1 in g0.explode('sekSplit').groupby('sekSplit'):
        res[sektion] = g1.shape[0]
    result.append(res)

#result

dfSektionOutput = pd.DataFrame(result)[:-15]
dfSektionOutput = dfSektionOutput.set_index('year')

dfSektionOutput = dfSektionOutput.rename(columns={'None':'Keine Sektion'})

dfSektionOutput.columns = sorted(dfSektionOutput.columns)

dfSektionOutput.index.rename('Jahr',inplace=True)

colorSektions = ['r','g','b','k']

with PdfPages('../results/Fig8_MPG_Sektionen.pdf') as pdffigure:
    fig, ax1 = plt.subplots(figsize=(10,6))
    color = 'k'
    ax1.set_xlabel('Jahr',fontsize = 14)
    ax1.set_ylabel('Publikationen pro Sektion im SciGraph Corpus', color=color,fontsize = 14)
    plt1 = dfSektionOutput[['BMS','CPTS','GSHS','Keine Sektion']].plot.area(stacked=False, ax=ax1, color=colorSektions)
    ax1.tick_params(axis='y', labelcolor=color, labelsize = 14)
    ax1.tick_params(axis='x', labelcolor=color, labelsize = 14)
    ax1.legend(fontsize = 14,loc=2)
    ax1.annotate(
        "Quelle: SciGraph Dataset Articles, Stand: 02/2019", (
            1.02,0.7),(0, 0), xycoords = "axes fraction", textcoords = "offset points", va = "top", style = "italic", fontsize = 10, rotation = 90)
    
    fig.tight_layout() 
    # Save as PDF
    #pdffigure.savefig(bbox_inches="tight", dpi=400)
    # Save as PNG
    #plt.savefig('../../../results/SciGraph/SciGraph_MPG_Publikationen_Sektionen.png')
    
    plt.show()
    plt.close()

../_images/6_Focus_on_MPG_publications_102_0.png

The Max Planck Society and its scientific context