Distribution of research institutions

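To compare the scientific output with that of other research institutions, we follow two strategies. Since many of the more recent publications carry affiliations based on data from the GRID initiative, we first collect the GRID IDs of the main institutions and of the institutions related to them. In the case of the Max Planck Society, this ID is grid.4372.2. Using the related_grid_id column of the GRID relationships table, we find the IDs of all registered Max Planck Institutes. Second, for affiliations that carry no GRID ID, we fall back to matching the institution name with a regular expression.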

For the following institutions, related data can be extracted:

  • University of California system

  • Harvard University

  • University of Oxford

  • Massachusetts Institute of Technology

Imports

import pandas as pd
import numpy as np
import os

import regex
from tqdm import tqdm

from functools import partial
from multiprocessing import  Pool, cpu_count

from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt

Helper routines

# Number of worker processes: leave two cores free for the rest of the system,
# but never go below one, because Pool rejects a process count smaller than 1
# (which the plain subtraction yields on machines with one or two cores).
n_cores = max(1, cpu_count() - 2)

def parallelize(df, func, n_cores=n_cores):
    """Apply ``func`` to consecutive row chunks of ``df`` in worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; it is cut into consecutive blocks of rows.
    func : callable
        Called once per chunk with a DataFrame and expected to return a
        pandas object. It is sent to the workers, so it must be picklable.
    n_cores : int
        Upper bound for the number of worker processes (and chunks).

    Returns
    -------
    The per-chunk results concatenated with ``pd.concat`` in chunk order.
    """
    # Never use more chunks than rows (no worker gets an empty frame) and
    # never fewer than one (Pool needs at least one process).
    n_chunks = max(1, min(n_cores, len(df)))
    # Split the row positions instead of the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
    positions = np.array_split(np.arange(len(df)), n_chunks)
    chunks = [df.iloc[idx] for idx in positions]
    # The context manager shuts the pool down even when func raises.
    with Pool(n_chunks) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results)

def run_on_subset(func, data_subset):
    """Call ``func`` once per row of ``data_subset`` and return the results.

    ``func`` receives each row as a Series; the return value is whatever
    ``DataFrame.apply`` builds from the per-row results.
    """
    row_results = data_subset.apply(func, axis="columns")
    return row_results

def parallelize_on_rows(data, func, num_of_processes=n_cores):
    """Evaluate the row-wise function ``func`` on ``data`` in parallel.

    ``func`` is bound into ``run_on_subset`` so that every worker applies it
    to the rows of its own chunk; ``parallelize`` distributes the chunks over
    ``num_of_processes`` processes and joins the results.
    """
    apply_to_chunk = partial(run_on_subset, func)
    return parallelize(data, apply_to_chunk, num_of_processes)

Paths setup

# Target directory (inside the repository) for exported results
gitDataPath = '../data/processedData/'
# Root of the local SciGraph dump and the sub-directories derived from it
dataPath = '/media/arbeit/b88b17b8-b5d7-4c4e-86bc-ff4b67458842/sciGraph/'
gridPath = dataPath + 'grid/'
yearPath = dataPath + 'processedData/yeardata/'

# All CSV files in yearPath except 'noYear.csv', sorted by name.  The first
# 113 entries are skipped; on the original data the list then starts at
# '1945.csv' (see the cell output below).
years = sorted(
    fname
    for fname in os.listdir(yearPath)
    if fname != 'noYear.csv' and fname.endswith('.csv')
)[113:]
years[:3]
['1945.csv', '1946.csv', '1947.csv']

Grid data

# Tables of the GRID release dated 2020-06-29:
# the main table, the relationship table and the label table.
dfGridName = pd.read_csv(
    filepath_or_buffer=gridPath + 'grid-2020-06-29/grid.csv'
)
dfGridRelations = pd.read_csv(
    filepath_or_buffer=gridPath + 'grid-2020-06-29/full_tables/relationships.csv'
)
dfGRIDLabels = pd.read_csv(
    filepath_or_buffer=gridPath + 'grid-2020-06-29/full_tables/labels.csv'
)

Institutional setup

Load GRID IDs for the institutions to compare.

MPG

# IDs listed as related to the Max Planck Society entry (grid.4372.2);
# the society's own ID is not part of the result.
mpgGridIDs = dfGridRelations.loc[
    dfGridRelations['grid_id'] == 'grid.4372.2', 'related_grid_id'
].values

University of California System

The GRID ID for University of California system is grid.30389.31.

# IDs listed as related to the University of California system entry
# (grid.30389.31); the system's own ID is not part of the result.
ucGridIDs = dfGridRelations.loc[
    dfGridRelations['grid_id'] == 'grid.30389.31', 'related_grid_id'
].values

Harvard University

# IDs related to grid.38142.3c, followed by grid.38142.3c itself
harvardGridIDs = [
    *dfGridRelations.loc[
        dfGridRelations['grid_id'] == 'grid.38142.3c', 'related_grid_id'
    ].values,
    'grid.38142.3c',
]

University of Oxford

# IDs related to grid.4991.5, followed by grid.4991.5 itself
oxfordGridIDs = [
    *dfGridRelations.loc[
        dfGridRelations['grid_id'] == 'grid.4991.5', 'related_grid_id'
    ].values,
    'grid.4991.5',
]

Get data

def isPartOfInstitution(row, instGridIDs, instString, colN=8):
    """Classify one publication row as belonging to an institution or not.

    The field ``row[colN]`` is split on ';'. An entry that contains a
    'https://www.grid.ac/institutes/' URL is matched by its GRID ID against
    ``instGridIDs``; an entry without such a URL is searched with the
    regular expression ``instString`` instead.

    Parameters
    ----------
    row : indexable
        Publication record; only ``row[colN]`` is read.
    instGridIDs : container of str
        GRID IDs that count as part of the institution.
    instString : str
        Regular expression applied to entries without a GRID URL.
    colN : int, optional
        Position of the field to inspect (default 8).

    Returns
    -------
    str
        'True' on the first matching entry, 'False' if no entry matches and
        'None' if the field is a float (i.e. a missing value).
    """
    affiliations = row[colN]
    # isinstance also recognises numpy.float64 NaN values, which an exact
    # type comparison would let through to the string handling below.
    if isinstance(affiliations, float):
        return 'None'
    for elem in affiliations.split(';'):
        parts = elem.split('https://www.grid.ac/institutes/')
        if len(parts) > 1:
            # Entries with a GRID URL are decided by the ID alone
            if parts[1] in instGridIDs:
                return 'True'
        elif regex.search(instString, elem):
            return 'True'
    return 'False'
# Each entry pairs the GRID IDs of an institution with the regular expression
# used for affiliations that carry no GRID URL.  The pattern string is also
# the column name under which the institution's counts are stored.
instGroups = [
    (ucGridIDs, 'University of California'),
    (harvardGridIDs, 'Harvard University'),
    # Raw string: '\s' is an invalid escape sequence in a normal literal
    (mpgGridIDs, r'Max-Planck-I|Max Planck I|MPG\s|MPI\s'),
    (['grid.116068.8'], 'Massachusetts Institute of Technology'),
    (oxfordGridIDs, 'University of Oxford'),
]


def _readYearFile(path):
    """Load one tab-separated year file, with missing values replaced by ''."""
    try:
        return pd.read_csv(path, sep='\t', header=None).fillna('')
    except pd.errors.ParserError:
        # Files the pandas parser rejects are split on tabs by hand.  The
        # line break is removed first so it does not end up in the last field.
        with open(path, 'r') as infile:
            data = [line.rstrip('\n').split('\t') for line in infile]
        return pd.DataFrame(data).fillna('')


# res holds one list per institution (same order as instGroups); each list
# gets one record per year with the total and the matching publication count.
res = [[] for _ in instGroups]
for file in tqdm(years):
    year = file.split('.')[0].split('_')[0]
    # Every year file is read once and then evaluated for all institutions
    df = _readYearFile(yearPath + file)
    for resGroup, (gridIDs, pattern) in zip(res, instGroups):
        # A partial of the module-level function can be sent to the workers
        # without relying on a global that changes from group to group.
        matcher = partial(isPartOfInstitution, instGridIDs=gridIDs, instString=pattern)
        labels = parallelize_on_rows(df, matcher)
        nMatches = int((labels == 'True').sum())
        resGroup.append({'year': year, 'Npub': df.shape[0], pattern: nMatches})

dfList = [pd.DataFrame(x).set_index('year') for x in res]
dfInsts = pd.concat(dfList, axis=1)
dfInsts.head(2)
# 'Npub' appears once per institution; keep only its first occurrence
dfInsts = dfInsts.loc[:, ~dfInsts.columns.duplicated()]
dfInsts = dfInsts.rename(columns={r'Max-Planck-I|Max Planck I|MPG\s|MPI\s': 'Max Planck Society'})

Calculate percentage of total publications

# Institution columns of dfInsts to express as a percentage of 'Npub'
cols = [
    'University of California',
    'Harvard University',
    'Max Planck Society',
    'Massachusetts Institute of Technology',
    'University of Oxford',
]
# Per-year share in percent: institution count * 100 / total publications
seriesList = {el: dfInsts[el] * 100 / dfInsts['Npub'] for el in cols}
dfNormed = pd.DataFrame(seriesList)
dfNormed.head(2)

Plotting

To save the figure as a PDF, uncomment the corresponding line.

Plotting is limited to the research time range of 1945 - 2005.

# Plot the yearly percentage of publications per institution and save it.
# The PdfPages handle is only used by the commented-out pdffigure.savefig
# call below; as written, no page is added to the PDF.
with PdfPages('../results/Fig7_ResearchInstitutions.pdf') as pdffigure:
    fig, ax1 = plt.subplots(figsize=(10,6))
    color = 'k'
    # Axis labels are in German: "Jahr" = year,
    # "% Publikationen pro Institution im SciGraph Corpus" =
    # "% publications per institution in the SciGraph corpus"
    ax1.set_xlabel('Jahr',fontsize = 14)
    ax1.set_ylabel('% Publikationen pro Institution im SciGraph Corpus', color=color,fontsize = 14)
    # Only the first 61 rows are drawn; with an index starting at 1945 this
    # corresponds to the range 1945 - 2005 stated in the text above.
    plt1 = dfNormed[:61].plot(ax=ax1)
    ax1.tick_params(axis='y', labelcolor=color, labelsize = 14)
    ax1.tick_params(axis='x', labelcolor=color, labelsize = 14)
    # loc=2 is matplotlib's location code for 'upper left'
    ax1.legend(fontsize = 14,loc=2)
    # Source note ("Source: SciGraph Dataset Articles, as of 02/2019"),
    # rotated by 90 degrees and anchored at axes-fraction position
    # (1.02, 0.7), i.e. just outside the right edge of the axes.
    ax1.annotate(
        "Quelle: SciGraph Dataset Articles, Stand: 02/2019", (
            1.02,0.7),(0, 0), xycoords = "axes fraction", textcoords = "offset points", va = "top", style = "italic", fontsize = 10, rotation = 90)

    fig.tight_layout() 
    # Save as PNG graphic
    # NOTE(review): this path ('../../../results/SciGraph/') differs from the
    # '../results/' directory used for the PDF - confirm both are intended.
    plt.savefig('../../../results/SciGraph/SciGraph_Other_Institutions.png')
    # Save as PDF
    #pdffigure.savefig(bbox_inches="tight", dpi=400)
    plt.show()
    plt.close()

Export MPG publications

Using the GRID IDs and the already used regular expression, we can create a sub-corpus containing only the publications of the MPG.

def getMPGpubs(row):
    """Tell whether a publication row belongs to the Max Planck Society.

    Delegates to ``isPartOfInstitution`` with the MPG GRID IDs
    (``mpgGridIDs``) and the MPG name pattern, so both code paths share one
    implementation of the matching rules.

    Returns
    -------
    str
        'True', 'False' or 'None', exactly as ``isPartOfInstitution`` does.
    """
    return isPartOfInstitution(
        row,
        instGridIDs=mpgGridIDs,
        # Raw string: '\s' is an invalid escape sequence in a normal literal
        instString=r'Max-Planck-I|Max Planck I|MPG\s|MPI\s',
    )
# Collect, year by year, the rows that getMPGpubs labels as 'True'
retDFList = []
for file in tqdm(years):
    try:
        df = pd.read_csv(yearPath + file, sep='\t', header=None, low_memory=False).fillna('')
    except pd.errors.ParserError:
        # Files the pandas parser rejects are split on tabs by hand.  The
        # line break is removed first so it does not end up in the last field.
        with open(yearPath + file, 'r') as infile:
            data = [line.rstrip('\n').split('\t') for line in infile]
        df = pd.DataFrame(data).fillna('')
    retVal = parallelize_on_rows(df, getMPGpubs)
    # retVal shares the index of df, so it can be used directly as a row mask
    dfout = df[retVal == 'True']
    retDFList.append(dfout)
100%|██████████| 75/75 [05:06<00:00,  4.09s/it]
# Stack the yearly MPG subsets and write them as one tab-separated file
# (without the index column) into the repository data directory.
dfMPG = pd.concat(objs=retDFList)
dfMPG.to_csv(
    gitDataPath + 'mpgpubs.csv',
    sep='\t',
    index=False,
)