Distribution of research institutions¶

To compare the scientific output with other research institutions, we follow two strategies. First, since many of the more recent publications carry affiliations based on the GRID initiative's data, we collect the GRID IDs of the main institutions and of the institutions related to them. In the case of the Max Planck Society this ID is grid.4372.2; using _.related_grid_id we find the IDs of all registered Max Planck Institutes. Second, for affiliation entries without a GRID link, we match the institution's name with a regular expression.¶

For the following institutions, related data can be extracted:

University of California system
Harvard University
University of Oxford
Massachusetts Institute of Technology

Imports¶

import pandas as pd
import numpy as np
import os

import regex
from tqdm import tqdm

from functools import partial
from multiprocessing import  Pool, cpu_count

from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt

Helper routines¶

# Leave two cores free, but never drop below one worker: cpu_count() - 2 is
# zero or negative on machines with two cores or fewer.
n_cores = max(1, cpu_count() - 2)

def parallelize(df, func, n_cores=n_cores):
    """Apply ``func`` to row-wise chunks of ``df`` in a process pool.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to split into contiguous row chunks.
    func : callable
        Called once per chunk through ``Pool.map``; it must return an object
        that ``pd.concat`` accepts.
    n_cores : int
        Upper bound for the number of chunks and worker processes. Values
        below one are treated as one.

    Returns
    -------
    The per-chunk results concatenated with ``pd.concat`` in chunk order.
    If ``df`` has no rows, ``func(df)`` is returned without starting a pool.
    """
    n_cores = max(1, n_cores)
    # Split the row positions rather than the frame itself: np.array_split on
    # a DataFrame goes through the deprecated DataFrame.swapaxes.
    positions = np.array_split(np.arange(len(df)), n_cores)
    # Drop empty chunks (fewer rows than workers) so workers never see an
    # empty frame.
    chunks = [df.iloc[pos[0]:pos[-1] + 1] for pos in positions if len(pos)]
    if not chunks:
        return func(df)
    # The context manager releases the pool even when a worker raises.
    with Pool(min(n_cores, len(chunks))) as pool:
        parts = pool.map(func, chunks)
    return pd.concat(parts)

def run_on_subset(func, data_subset):
    """Evaluate ``func`` once for every row of ``data_subset``.

    The rows are passed to ``func`` one at a time by ``DataFrame.apply``;
    whatever ``apply`` builds from the per-row results is returned.
    """
    per_row = data_subset.apply(func, axis='columns')
    return per_row

def parallelize_on_rows(data, func, num_of_processes=n_cores):
    """Run the row-wise function ``func`` over ``data`` in parallel.

    ``func`` is bound into ``run_on_subset`` so that each chunk handed out by
    ``parallelize`` is processed row by row; ``num_of_processes`` is forwarded
    as the chunk/worker count.
    """
    row_worker = partial(run_on_subset, func)
    return parallelize(data, row_worker, n_cores=num_of_processes)

Paths setup¶

# Target directory for exported tables (the MPG sub-corpus is written here).
gitDataPath = '../data/processedData/'

# Machine-specific location of the SciGraph data; adjust before re-running.
dataPath = '/media/arbeit/b88b17b8-b5d7-4c4e-86bc-ff4b67458842/sciGraph/'
gridPath = dataPath + 'grid/'
yearPath = dataPath + 'processedData/yeardata/'

# Sorted per-year CSV files without 'noYear.csv'. The first 113 entries of
# the sorted listing are skipped; per the recorded output the remaining list
# starts at '1945.csv'.
# NOTE(review): the offset 113 depends on the directory contents - confirm
# before running against a different data dump.
years = sorted(
    name for name in os.listdir(yearPath)
    if name != 'noYear.csv' and name.endswith('.csv')
)[113:]

years[:3]

['1945.csv', '1946.csv', '1947.csv']

Grid data¶

# The three tables below are read from the 'grid-2020-06-29' directory.
dfGridName = pd.read_csv(filepath_or_buffer=gridPath + 'grid-2020-06-29/grid.csv')

# Relationship table; its columns grid_id and related_grid_id are used below.
dfGridRelations = pd.read_csv(filepath_or_buffer=gridPath + 'grid-2020-06-29/full_tables/relationships.csv')

dfGRIDLabels = pd.read_csv(filepath_or_buffer=gridPath + 'grid-2020-06-29/full_tables/labels.csv')

Institutional setup¶

Load GRID IDs for the institutions to compare.

MPG¶

# All related_grid_id values of relationship rows whose grid_id is grid.4372.2.
# NOTE(review): grid.4372.2 itself is not added, unlike the Harvard and
# Oxford lists below - confirm that this is intended.
mpgGridIDs = dfGridRelations.loc[dfGridRelations['grid_id'] == 'grid.4372.2', 'related_grid_id'].values

University of California System¶

The GRID ID for University of California system is grid.30389.31.

# All related_grid_id values of relationship rows whose grid_id is grid.30389.31.
ucGridIDs = dfGridRelations.loc[dfGridRelations['grid_id'] == 'grid.30389.31', 'related_grid_id'].values

Harvard University¶

# Related IDs of grid.38142.3c as a plain list, followed by the ID itself.
harvardGridIDs = dfGridRelations.loc[dfGridRelations['grid_id'] == 'grid.38142.3c', 'related_grid_id'].tolist()

harvardGridIDs.append('grid.38142.3c')

University of Oxford¶

# Related IDs of grid.4991.5 as a plain list, followed by the ID itself.
oxfordGridIDs = dfGridRelations.loc[dfGridRelations['grid_id'] == 'grid.4991.5', 'related_grid_id'].tolist()

oxfordGridIDs.append('grid.4991.5')

Get data¶

def isPartOfInstitution(row, instGridIDs, instString):
    """Tell whether a publication row is affiliated with an institution.

    Column 8 of ``row`` is read as a ';'-separated list of affiliation
    entries. An entry containing a ``https://www.grid.ac/institutes/`` link
    is judged only by the text that follows the link prefix; an entry without
    such a link is searched with the regular expression ``instString``.

    Parameters
    ----------
    row : sequence or pandas.Series
        Publication record; only position/label 8 is read.
    instGridIDs : container of str
        GRID IDs that count as belonging to the institution.
    instString : str
        Regular expression applied to entries that carry no GRID link.

    Returns
    -------
    str
        'True' for the first matching entry, 'False' if no entry matches,
        and 'None' if column 8 does not hold a string (e.g. NaN or None).
    """
    colN = 8
    gridPrefix = 'https://www.grid.ac/institutes/'
    affiliations = row[colN]
    # Anything that is not text (float NaN, numpy NaN, None, ...) cannot be
    # split; report it as missing instead of raising AttributeError.
    if not isinstance(affiliations, str):
        return 'None'
    for elem in affiliations.split(';'):
        parts = elem.split(gridPrefix)
        if len(parts) > 1:
            if parts[1] in instGridIDs:
                return 'True'
        # Only the existence of a match matters, so search() is enough.
        elif regex.search(instString, elem):
            return 'True'
    return 'False'

# One (GRID IDs, pattern) pair per institution. The pattern is the regular
# expression used for affiliation entries without a GRID link and also
# serves as the column label of the resulting counts.
# The MPG pattern is a raw string: '\s' in a normal literal is an invalid
# escape sequence and triggers a warning on current Python versions.
instGroups = [
    (ucGridIDs, 'University of California'),
    (harvardGridIDs, 'Harvard University'),
    (mpgGridIDs, r'Max-Planck-I|Max Planck I|MPG\s|MPI\s'),
    (['grid.116068.8'], 'Massachusetts Institute of Technology'),
    (oxfordGridIDs , 'University of Oxford')
]

# Count, per institution group and year file, the rows flagged 'True'.
# ``res`` holds one list per entry of ``instGroups`` (same order); each list
# gets one dict per year file with the keys 'year', 'Npub' and the group's
# pattern string.
res = [[] for _ in instGroups]
for file in tqdm(years):
    # Read every year file once and reuse it for all institution groups.
    try:
        df = pd.read_csv(yearPath + file, sep='\t', header=None).fillna('')
    except Exception:
        # Fallback when pandas cannot read the file: split the raw lines on
        # tabs. The line break is removed so that the last field matches
        # what read_csv would deliver. KeyboardInterrupt is not swallowed.
        with open(yearPath + file, 'r') as infile:
            data = [line.rstrip('\n').split('\t') for line in infile]
        df = pd.DataFrame(data).fillna('')
    year = file.split('.')[0].split('_')[0]
    for resGroup, (gridIDs, label) in zip(res, instGroups):
        # partial of a module-level function stays picklable for the pool.
        isPartofWhat = partial(isPartOfInstitution, instGridIDs=gridIDs, instString=label)
        resPart = parallelize_on_rows(df, isPartofWhat)
        # A year without any hit has no 'True' key in the counts.
        val = resPart.value_counts().to_dict().get('True', 0)
        resGroup.append({'year': year, 'Npub': df.shape[0], label: val})

# One frame per institution group, indexed by year.
dfList = [pd.DataFrame(x).set_index('year') for x in res]

dfInsts = pd.concat(dfList,axis=1)

dfInsts.head(2)

# Every group frame brings its own 'Npub' column; keep the first occurrence.
dfInsts = dfInsts.loc[:,~dfInsts.columns.duplicated()]

# Raw string: same key as before, without the invalid '\s' escape sequence.
dfInsts = dfInsts.rename(columns={r'Max-Planck-I|Max Planck I|MPG\s|MPI\s':'Max Planck Society'})

Calculate percentage of total publications¶

cols = [
    'University of California',
    'Harvard University',
    'Max Planck Society',
    'Massachusetts Institute of Technology',
    'University of Oxford',
]

# Per institution: count * 100 / Npub, i.e. the share in percent of all
# publications of that year.
seriesList = {name: (dfInsts[name]*100)/dfInsts['Npub'] for name in cols}

dfNormed = pd.DataFrame(seriesList)

dfNormed.head(2)

Plotting¶

To save the figure as a PDF uncomment the corresponding line.

Plotting is limited to the research time range of 1945 - 2005.

with PdfPages('../results/Fig7_ResearchInstitutions.pdf') as pdffigure:
    color = 'k'
    fig, ax1 = plt.subplots(figsize=(10, 6))

    ax1.set_xlabel('Jahr', fontsize=14)
    ax1.set_ylabel(
        '% Publikationen pro Institution im SciGraph Corpus',
        color=color,
        fontsize=14,
    )

    # Only the first 61 rows of the normalised table are drawn.
    plt1 = dfNormed[:61].plot(ax=ax1)

    for axisName in ('y', 'x'):
        ax1.tick_params(axis=axisName, labelcolor=color, labelsize=14)
    ax1.legend(fontsize=14, loc=2)

    # Source note, rotated by 90 degrees and placed just right of the axes.
    ax1.annotate(
        "Quelle: SciGraph Dataset Articles, Stand: 02/2019",
        xy=(1.02, 0.7),
        xytext=(0, 0),
        xycoords="axes fraction",
        textcoords="offset points",
        va="top",
        style="italic",
        fontsize=10,
        rotation=90,
    )

    fig.tight_layout()
    # Save as PNG graphic
    plt.savefig('../../../results/SciGraph/SciGraph_Other_Institutions.png')
    # Save as PDF
    #pdffigure.savefig(bbox_inches="tight", dpi=400)
    plt.show()
    plt.close()

Export MPG publications¶

Using the GRID IDs and the already used regular expression, we can create a sub-corpus containing only the publications of the MPG.

def getMPGpubs(row):
    """Tell whether a publication row is affiliated with the MPG.

    Column 8 of ``row`` is read as a ';'-separated list of affiliation
    entries. An entry containing a ``https://www.grid.ac/institutes/`` link
    counts if the text after the link prefix is in ``mpgGridIDs``; an entry
    without such a link counts if the Max Planck name pattern matches.

    Returns
    -------
    str
        'True' for the first matching entry, 'False' if no entry matches,
        and 'None' if column 8 does not hold a string (e.g. NaN or None).
    """
    colN = 8
    gridPrefix = 'https://www.grid.ac/institutes/'
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    instString = r'Max-Planck-I|Max Planck I|MPG\s|MPI\s'
    affiliations = row[colN]
    # Anything that is not text (float NaN, numpy NaN, None, ...) cannot be
    # split; report it as missing instead of raising AttributeError.
    if not isinstance(affiliations, str):
        return 'None'
    for elem in affiliations.split(';'):
        parts = elem.split(gridPrefix)
        if len(parts) > 1:
            if parts[1] in mpgGridIDs:
                return 'True'
        # Only the existence of a match matters, so search() is enough.
        elif regex.search(instString, elem):
            return 'True'
    return 'False'

# Collect, per year file, the rows that getMPGpubs flags with 'True'.
retDFList = []
for file in tqdm(years):
    try:
        df = pd.read_csv(yearPath + file, sep='\t', header=None, low_memory=False).fillna('')
    except Exception:
        # Fallback when pandas cannot read the file: split the raw lines on
        # tabs. The line break is removed so it does not end up inside the
        # last field of the exported table. KeyboardInterrupt is not swallowed.
        with open(yearPath + file, 'r') as infile:
            data = [line.rstrip('\n').split('\t') for line in infile]
        df = pd.DataFrame(data).fillna('')
    retVal = parallelize_on_rows(df, getMPGpubs)
    # Select the flagged rows directly; no helper column has to be inserted
    # into df and dropped again.
    dfout = df[retVal == 'True']
    retDFList.append(dfout)

100%|██████████| 75/75 [05:06<00:00,  4.09s/it]

# Stack the per-year selections row-wise into one table.
dfMPG = pd.concat(retDFList, axis=0)

# Tab-separated export without the index column.
dfMPG.to_csv(path_or_buf=gitDataPath + 'mpgpubs.csv', sep='\t', index=False)

The Max Planck Society and its scientific context