Distribution of research institutions¶
To compare the scientific output with that of other research institutions, we follow two strategies. Since many of the more recent publications carry affiliations based on data from the GRID initiative, we collect the GRID IDs of the main institutions and their relations. Affiliations that carry no GRID ID are instead matched against the institution name with a regular expression. In the case of the Max Planck Society the GRID ID is grid.4372.2. Using _.related_grid_id
we find the IDs of all registered Max Planck Institutes.¶
For the following institutions, related data can be extracted
University of California system
Harvard University
University of Oxford
Massachusetts Institute of Technology
Imports¶
import pandas as pd
import numpy as np
import os
import regex
from tqdm import tqdm
from functools import partial
from multiprocessing import Pool, cpu_count
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
Helper routines¶
# Leave two cores free for the rest of the system, but never go below one
# worker: cpu_count() - 2 is <= 0 on machines with one or two cores.
n_cores = max(1, cpu_count() - 2)


def parallelize(df, func, n_cores=n_cores):
    """Apply ``func`` to row-chunks of ``df`` in worker processes.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to process; it is cut into ``n_cores`` contiguous row chunks.
    func : callable
        Receives one chunk and returns a pandas object. It must be picklable
        whenever more than one process is used.
    n_cores : int
        Number of worker processes. Values <= 1 run ``func`` in the current
        process without creating a pool.

    Returns
    -------
    pandas object
        The per-chunk results concatenated in chunk order.
    """
    if n_cores <= 1:
        return pd.concat([func(df)])

    # Slice with iloc instead of np.array_split(df, ...): the latter goes
    # through DataFrame.swapaxes, which pandas has deprecated. Chunk sizes
    # follow the array_split rule (the first `extra` chunks get one more row).
    base, extra = divmod(len(df), n_cores)
    chunks = []
    start = 0
    for i in range(n_cores):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    pool = Pool(n_cores)
    try:
        results = pool.map(func, chunks)
    finally:
        # Shut the workers down even when func raised in one of them.
        pool.close()
        pool.join()
    return pd.concat(results)
def run_on_subset(func, data_subset):
    """Call ``func`` once per row of ``data_subset`` and return the results.

    ``func`` receives each row (``axis=1``) and the per-row return values are
    collected by ``DataFrame.apply``.
    """
    row_results = data_subset.apply(func, axis=1)
    return row_results
def parallelize_on_rows(data, func, num_of_processes=n_cores):
    """Run the row-wise function ``func`` over ``data`` in parallel.

    ``func`` is bound into ``run_on_subset`` so that every worker applies it
    to each row of its chunk; the chunking and the collection of results are
    delegated to ``parallelize``.
    """
    row_worker = partial(run_on_subset, func)
    return parallelize(data, row_worker, num_of_processes)
Paths setup¶
# Output folder for processed data, relative to this notebook.
gitDataPath = '../data/processedData/'

# Root of the local SciGraph data and the sub-folders derived from it.
dataPath = '/media/arbeit/b88b17b8-b5d7-4c4e-86bc-ff4b67458842/sciGraph/'
gridPath = dataPath + 'grid/'
yearPath = dataPath + 'processedData/yeardata/'

# Per-year CSV files in sorted order, without the 'noYear.csv' bucket.
# The first 113 files are skipped; the recorded output shows that the list
# then starts with 1945.csv.
years = sorted(
    fname
    for fname in os.listdir(yearPath)
    if fname.endswith('.csv') and fname != 'noYear.csv'
)[113:]
years[:3]
['1945.csv', '1946.csv', '1947.csv']
Grid data¶
# Load the three tables of the GRID release 2020-06-29 that are used below:
# institution names, institution relationships and labels.
dfGridName, dfGridRelations, dfGRIDLabels = (
    pd.read_csv(gridPath + relPath)
    for relPath in (
        'grid-2020-06-29/grid.csv',
        'grid-2020-06-29/full_tables/relationships.csv',
        'grid-2020-06-29/full_tables/labels.csv',
    )
)
Institutional setup¶
Load GRID IDs for the institutions to compare.
MPG¶
# GRID IDs of all institutions related to the Max Planck Society (grid.4372.2).
mpgGridIDs = dfGridRelations.loc[
    dfGridRelations['grid_id'] == 'grid.4372.2', 'related_grid_id'
].values
University of California System¶
The GRID ID for University of California system is grid.30389.31.
# GRID IDs of all institutions related to the University of California system.
ucGridIDs = dfGridRelations.loc[
    dfGridRelations['grid_id'] == 'grid.30389.31', 'related_grid_id'
].values
Harvard University¶
# Related institutions of Harvard University plus Harvard's own GRID ID.
harvardRelated = dfGridRelations.loc[
    dfGridRelations['grid_id'] == 'grid.38142.3c', 'related_grid_id'
].values
harvardGridIDs = list(harvardRelated) + ['grid.38142.3c']
University of Oxford¶
# Related institutions of the University of Oxford plus Oxford's own GRID ID.
oxfordRelated = dfGridRelations.loc[
    dfGridRelations['grid_id'] == 'grid.4991.5', 'related_grid_id'
].values
oxfordGridIDs = list(oxfordRelated) + ['grid.4991.5']
Get data¶
def isPartOfInstitution(row, instGridIDs, instString):
    """Classify whether a publication row is affiliated with an institution.

    The affiliation field (column 8 of ``row``) is split on ';'. An entry
    that contains a GRID URL is matched by its GRID ID against
    ``instGridIDs``; an entry without a GRID URL is matched against the
    regular expression ``instString``.

    Parameters
    ----------
    row : mapping or pandas.Series
        One publication record; only ``row[8]`` is read.
    instGridIDs : container of str
        GRID IDs that count as belonging to the institution.
    instString : str
        Regular expression tried on entries without a GRID URL.

    Returns
    -------
    str
        ``'True'`` on the first matching entry, ``'False'`` if no entry
        matches, ``'None'`` if the affiliation field is a float (pandas
        represents missing values as NaN floats).

    NOTE(review): the branch structure was reconstructed from a notebook
    export without indentation; the regex fallback is assumed to apply only
    to entries without a GRID URL -- confirm against the original notebook.
    """
    colN = 8
    gridPrefix = 'https://www.grid.ac/institutes/'

    affiliations = row[colN]
    # isinstance also covers numpy floats (np.float64 subclasses float);
    # an exact type comparison would let them through to .split() below.
    if isinstance(affiliations, float):
        return 'None'

    for elem in affiliations.split(';'):
        parts = elem.split(gridPrefix)
        if len(parts) > 1:
            if parts[1] in instGridIDs:
                return 'True'
        # search stops at the first hit; only the truth value is needed.
        elif regex.search(instString, elem):
            return 'True'
    return 'False'
# One entry per institution: (GRID IDs to match, regex for affiliation
# strings). The regex string also serves as the result column name further
# down. The MPG pattern is a raw string so that `\s` reaches the regex engine
# without being an invalid string escape; its value is unchanged.
instGroups = [
    (ucGridIDs, 'University of California'),
    (harvardGridIDs, 'Harvard University'),
    (mpgGridIDs, r'Max-Planck-I|Max Planck I|MPG\s|MPI\s'),
    (['grid.116068.8'], 'Massachusetts Institute of Technology'),
    (oxfordGridIDs, 'University of Oxford'),
]
# For every institution group, count per year how many publications are
# attributed to it. `res` holds one list per group; each list holds one dict
# per year file with the keys 'year', 'Npub' and the group's regex string.
res = []
for group in instGroups:
    resGroup = []

    def isPartofWhat(row):
        """Row classifier bound to the institution group currently processed."""
        return isPartOfInstitution(row, instGridIDs=group[0], instString=group[1])

    for file in tqdm(years):
        resTemp = {'year': file.split('.')[0].split('_')[0]}
        try:
            df = pd.read_csv(yearPath + file, sep='\t', header=None).fillna('')
        except Exception:
            # Best-effort fallback for files pandas cannot parse: split the
            # lines manually. `except Exception` keeps Ctrl-C working, and the
            # line terminator is stripped so it does not end up in the last
            # field (read_csv does not keep it either).
            with open(yearPath + file, 'r') as infile:
                data = [line.rstrip('\n').split('\t') for line in infile]
            df = pd.DataFrame(data).fillna('')
        resTemp['Npub'] = df.shape[0]
        resPart = parallelize_on_rows(df, isPartofWhat)
        # Years without any match have no 'True' key in the counts.
        resTemp[group[1]] = resPart.value_counts().to_dict().get('True', 0)
        resGroup.append(resTemp)
    res.append(resGroup)
# One frame per institution group, indexed by year, placed side by side.
dfList = [pd.DataFrame(x).set_index('year') for x in res]
dfInsts = pd.concat(dfList, axis=1)
dfInsts.head(2)
# Every group frame carries its own 'Npub' column; keep the first occurrence.
dfInsts = dfInsts.loc[:, ~dfInsts.columns.duplicated()]
# The MPG column is named after its regex; give it a readable label. The key
# is a raw string (same value as before) to avoid the invalid `\s` escape.
dfInsts = dfInsts.rename(
    columns={r'Max-Planck-I|Max Planck I|MPG\s|MPI\s': 'Max Planck Society'}
)
Calculate percentage of total publications¶
# Institution columns to normalise by the yearly publication count.
cols = [
    'University of California',
    'Harvard University',
    'Max Planck Society',
    'Massachusetts Institute of Technology',
    'University of Oxford',
]
# Share of each institution in percent of all publications of the year.
seriesList = {name: (dfInsts[name] * 100) / dfInsts['Npub'] for name in cols}
dfNormed = pd.DataFrame(seriesList)
dfNormed.head(2)
Plotting¶
To save the figure as a PDF uncomment the corresponding line.
Plotting is limited to the research time range of 1945–2005.
with PdfPages('../results/Fig7_ResearchInstitutions.pdf') as pdffigure:
    color = 'k'
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Axis labels (German, as used in the published figure).
    ax1.set_xlabel('Jahr', fontsize=14)
    ax1.set_ylabel(
        '% Publikationen pro Institution im SciGraph Corpus',
        color=color,
        fontsize=14,
    )

    # Only the first 61 rows are drawn (1945-2005 when the data start in 1945).
    plt1 = dfNormed[:61].plot(ax=ax1)

    for axisName in ('y', 'x'):
        ax1.tick_params(axis=axisName, labelcolor=color, labelsize=14)
    ax1.legend(fontsize=14, loc=2)

    # Source note, rotated and placed just outside the right edge of the axes.
    ax1.annotate(
        "Quelle: SciGraph Dataset Articles, Stand: 02/2019",
        (1.02, 0.7),
        (0, 0),
        xycoords="axes fraction",
        textcoords="offset points",
        va="top",
        style="italic",
        fontsize=10,
        rotation=90,
    )
    fig.tight_layout()

    # Save as PNG graphic
    plt.savefig('../../../results/SciGraph/SciGraph_Other_Institutions.png')
    # Save as PDF
    #pdffigure.savefig(bbox_inches="tight", dpi=400)
    plt.show()
    plt.close()
Export MPG publications¶
Using the GRID IDs and the already used regular expression, we can create a sub-corpus containing only the publications of the MPG.
def getMPGpubs(row):
    """Classify whether a publication row belongs to the Max Planck Society.

    Delegates to ``isPartOfInstitution`` with the MPG GRID IDs and the MPG
    name pattern instead of repeating its body, so both stay in sync.

    Returns the string ``'True'``, ``'False'`` or ``'None'`` exactly as
    ``isPartOfInstitution`` does.
    """
    return isPartOfInstitution(
        row,
        instGridIDs=mpgGridIDs,
        # Raw string: same pattern as before, without the invalid `\s` escape.
        instString=r'Max-Planck-I|Max Planck I|MPG\s|MPI\s',
    )
# Collect, year by year, the rows that getMPGpubs classifies as MPG.
retDFList = []
for file in tqdm(years):
    try:
        df = pd.read_csv(yearPath + file, sep='\t', header=None, low_memory=False).fillna('')
    except Exception:
        # Best-effort fallback for files pandas cannot parse: split the lines
        # manually. `except Exception` keeps Ctrl-C working, and the line
        # terminator is stripped so it does not end up in the last field.
        with open(yearPath + file, 'r') as infile:
            data = [line.rstrip('\n').split('\t') for line in infile]
        df = pd.DataFrame(data).fillna('')
    retVal = parallelize_on_rows(df, getMPGpubs)
    # Attach the flag temporarily, keep the MPG rows, then drop the flag again.
    df.insert(0, 'isMPG', retVal)
    dfout = df[df.isMPG == 'True'].drop('isMPG', axis=1)
    retDFList.append(dfout)
100%|██████████| 75/75 [05:06<00:00, 4.09s/it]
# Stack the per-year MPG subsets and write them out as one tab-separated file.
dfMPG = pd.concat(retDFList)
dfMPG.to_csv(
    gitDataPath + 'mpgpubs.csv',
    sep='\t',
    index=False,
)