Title keywords for MPG publications¶
This notebook calculates scores for the title words of publications in each Sektion. Using the temporal information, the aim is to trace how word usage evolves over time.
English and German stopwords are excluded from the titles. Only words tagged as nouns or adjectives by NLTK are considered as keyword candidates.
Imports¶
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
from collections import Counter
from functools import reduce
from string import punctuation
import nltk
# Run once to download the required NLTK resources:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords

stop_words = stopwords.words('german')
stop_words.extend(stopwords.words('english'))
def cleanText(inputString):
    """Tokenize a title, drop stopwords and punctuation, and lowercase the rest."""
    res = []
    for token in nltk.word_tokenize(inputString):
        if token.lower() not in stop_words and token not in punctuation and token.isalnum():
            res.append(token.lower())
    return ' '.join(res)
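For example, applied to one of the 1946 titles shown in the data preview below, the cleaner strips the German stopwords and lowercases the remaining tokens:

cleanText('Die Gewinnung radioaktiver Indikatoren aus der Uranspaltung')
# -> 'gewinnung radioaktiver indikatoren uranspaltung'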
Read data from files¶
The corpus is created by applying a regex routine to the corpus file for every year; see Notebook 5.
dataPath = '../data/processedData/'
outPath = dataPath + 'mpgWordScores/'
os.makedirs(outPath, exist_ok=True)  # make sure the output folder exists
dfMPGall = pd.read_csv(dataPath + 'MPGPubSektionen_11112020.tsv', sep='\t')
cleanedText = dfMPGall.title.apply(cleanText)
dfMPGall.insert(0, 'cleanText', cleanedText)
dfMPGall.head(3)
| | cleanText | foundMPIs | sektion | year | pubID | title | lang | journalID | journalName | authors | affName |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | kondensationsvorgänge überhitztem arsenikdampf | Fritz-Haber-Institut der MPG;Fritz-Haber-Insti... | CPTS;CPTS | 1946 | sg:pub.10.1007/bf00643799 | Kondensationsvorgänge aus überhitztem Arsenikd... | ['de'] | sg:journal.1018189 | The Science of Nature | sg:person.013544521043.99;Korb_A. | https://www.grid.ac/institutes/grid.418028.7;h... |
| 1 | uber phosphorylase leukocyten | NaN | None | 1946 | sg:pub.10.1007/bf00624523 | Uber die Phosphorylase der Leukocyten | ['en'] | sg:journal.1018189 | The Science of Nature | Rohdewald_Margarete | https://www.grid.ac/institutes/grid.418441.c |
| 2 | gewinnung radioaktiver indikatoren uranspaltung | NaN | None | 1946 | sg:pub.10.1007/bf00590077 | Die Gewinnung radioaktiver Indikatoren aus der... | ['de'] | sg:journal.1018189 | The Science of Nature | Seelmann-Eggebert_W. | https://www.grid.ac/institutes/grid.419509.0 |
Scoring pipeline¶
The scoring routine follows the algorithm detailed in
Abe H., Tsumoto S. (2011) Evaluating a Temporal Pattern Detection Method for Finding Research Keys in Bibliographical Data. In: Peters J.F. et al. (eds) Transactions on Rough Sets XIV. Lecture Notes in Computer Science, vol 6600. Springer, Berlin, Heidelberg. https://doi.org/10.1007/978-3-642-21563-6_1
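In outline (our reading of the method as implemented in the class below): an n-gram W = w_1 … w_L receives the score f(W) · (∏_i (|L_i|+1)(|R_i|+1))^(1/(2L)), where f(W) is the corpus frequency of W and L_i, R_i are the sets of distinct words appearing directly left and right of w_i in the corpus bigrams. A minimal toy sketch (invented data, not part of the pipeline):

bigrams = [('cell', 'membrane'), ('outer', 'membrane'),
           ('membrane', 'protein'), ('membrane', 'channel')]
target = ('membrane',)  # a 1-gram
freq = 4                # toy corpus frequency of the target
left = {a for a, b in bigrams if b == target[0]}    # {'cell', 'outer'}
right = {b for a, b in bigrams if a == target[0]}   # {'protein', 'channel'}
score = freq * ((len(left) + 1) * (len(right) + 1)) ** (1 / (2 * len(target)))
print(score)  # 4 * (3 * 3) ** 0.5 = 12.0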
class getScores(object):

    def __init__(self, sourceDataframe):
        self.baseDF = sourceDataframe
        self.textCol = 'cleanText'
        self.pubIDCol = 'pubID'
        self.outputDict = {}
        self.useSektionen = True
        if self.useSektionen:
            self.sektionen = ['BMS', 'CPTS', 'GSHS', 'MPG']
            self.resultCorpus = {x: [] for x in self.sektionen}
            self.allNGrams = {x: [] for x in self.sektionen}
            self.allgramslist = {x: [] for x in self.sektionen}
            self.counts = {x: [] for x in self.sektionen}
            self.uniqueNGrams = {x: [] for x in self.sektionen}
            self.sektionDataframes = {x: '' for x in self.sektionen}
            for sek in self.sektionen:
                self.sektionDataframes[sek] = self.baseDF[self.baseDF.sektion.str.contains(sek).fillna(False)]

    def getTermPatterns(self, sektion=False, ngramEnd=5):
        """Collect all 1- to ngramEnd-grams built from nouns and adjectives."""
        allNGrams = {x: [] for x in range(1, ngramEnd + 1)}
        pos_tag = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS']
        if self.useSektionen:
            for idx, row in tqdm(self.sektionDataframes[sektion].iterrows()):
                tokens = nltk.word_tokenize(row[self.textCol])
                pos = nltk.pos_tag(tokens)
                nnJJtokens = [x[0].lower() for x in pos if x[1] in pos_tag]
                self.outputDict[row[self.pubIDCol]] = nnJJtokens
                self.resultCorpus[sektion].append(nnJJtokens)
                for i in range(1, ngramEnd + 1):
                    allNGrams[i].extend(nltk.ngrams(nnJJtokens, i))
            self.allNGrams[sektion] = allNGrams
            allgrams = [gram for grams in allNGrams.values() for gram in grams]
            self.allgramslist[sektion] = allgrams
            self.counts[sektion] = Counter(allgrams)
            self.uniqueNGrams[sektion] = set(allgrams)
        else:
            for idx, row in tqdm(self.baseDF.iterrows()):
                tokens = nltk.word_tokenize(row[self.textCol])
                pos = nltk.pos_tag(tokens)
                nnJJtokens = [x[0].lower() for x in pos if x[1] in pos_tag]
                tempNGram = []
                for i in range(1, ngramEnd + 1):
                    newngrams = list(nltk.ngrams(nnJJtokens, i))
                    allNGrams[i].extend(newngrams)
                    tempNGram.extend(newngrams)
                self.outputDict[row[self.pubIDCol]] = tempNGram
            self.allNGrams = allNGrams
            allgrams = [gram for grams in allNGrams.values() for gram in grams]
            self.allgramslist = allgrams
            self.counts = Counter(allgrams)
            self.uniqueNGrams = set(allgrams)
        return

    def getScore(self, target, sektion=False):
        """Score an n-gram by its frequency and the variety of its left and right neighbours."""
        if self.useSektionen:
            meta = {'target': target, 'counts': self.counts[sektion][target],
                    'corpusL': len(self.allgramslist[sektion]), 'maxL': len(target)}
            tupList = self.allNGrams[sektion][2]
        else:
            meta = {'target': target, 'counts': self.counts[target],
                    'corpusL': len(self.allgramslist), 'maxL': len(target)}
            tupList = self.allNGrams[2]
        res = {f'l_{x}': [] for x in range(1, meta['maxL'] + 1)}
        res.update({f'r_{x}': [] for x in range(1, meta['maxL'] + 1)})
        # For the i-th word of the target, collect the words occurring directly
        # to its left and right anywhere in the corpus bigrams.
        for key, subgram in enumerate(target, start=1):
            for tup in tupList:
                if tup[1] == subgram:
                    res[f'l_{key}'].append(tup[0])
                elif tup[0] == subgram:
                    res[f'r_{key}'].append(tup[1])
        valueList = []
        for L in range(1, meta['maxL'] + 1):
            valueList.append((len(set(res[f'l_{L}'])) + 1) * (len(set(res[f'r_{L}'])) + 1))
        return {target: meta['counts'] * (np.prod(valueList)) ** (1 / (2. * meta['maxL']))}

    def run(self, task='scores'):
        if task == 'scores':
            scores = {}
            for sek in self.sektionen:
                print(f'Calculating score for {sek}.')
                tmpScores = {}
                self.getTermPatterns(sektion=sek)
                for target in tqdm(self.uniqueNGrams[sek]):
                    tmpScores.update(self.getScore(target, sektion=sek))
                scores[sek] = tmpScores
            return scores
        elif task == 'scoresNet':
            scores = {}
            self.useSektionen = False
            print('Calculating score net.')
            self.getTermPatterns()
            for target in tqdm(self.uniqueNGrams):
                scores.update(self.getScore(target))
            # Replace each title's n-gram list by (n-gram, score) pairs.
            for key, val in self.outputDict.items():
                tmpList = []
                for elem in val:
                    tmpList.append([elem, scores[elem]])
                self.outputDict.update({key: tmpList})
            return scores, self.outputDict
Calculate score network¶
For each year, calculate the unique n-gram distribution, assign a score to each n-gram, and create the list of n-grams per publication title. The score of an n-gram determines the weight of the edge between the n-gram and the publication. The results are saved as TSV files, to be read later when creating multilayer networks.
yearKeyPubMap = {}
for year, df in dfMPGall.groupby('year'):
    print(f'Processing data for {year}')
    sekScores = getScores(sourceDataframe=df)
    yearSc, yearKeyPub = sekScores.run(task='scoresNet')
    yearKeyPubMap[year] = yearKeyPub
Processing data for 1946
Calculating score net.
...
Processing data for 2019
Calculating score net.

(Per-year tqdm progress output abridged; the number of unique n-grams per year grows from 78 in 1946 to 33707 in 2018.)
for year in yearKeyPubMap.keys():
    res = []
    for key, val in yearKeyPubMap[year].items():
        for elem in val:
            res.append([key, elem[0], elem[1]])
    dfTemp = pd.DataFrame(res, columns=['pubID', 'ngram', 'score'])
    dfTemp.to_csv(outPath + str(year) + '.tsv', sep='\t', index=False)
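These files can then be loaded as weighted bipartite edge lists between publications and n-grams. A minimal sketch, assuming networkx is available and using the column names written in the cell above (1950.tsv is just one of the years processed):

import networkx as nx

dfY = pd.read_csv(outPath + '1950.tsv', sep='\t')
B = nx.Graph()
for _, r in dfY.iterrows():
    B.add_node(r['pubID'], bipartite=0)   # publication layer
    B.add_node(r['ngram'], bipartite=1)   # keyword layer
    B.add_edge(r['pubID'], r['ngram'], weight=r['score'])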
Calculate scores¶
Alternatively, calculate scores per Sektion to later compare the temporal evolution of word usage across Sektionen. The scores are computed separately for each Sektion and saved as CSV files spanning all years.
yearScores = {}
for year, df in dfMPGall.groupby('year'):
    print(f'Processing data for {year}')
    sekScores = getScores(sourceDataframe=df)
    yearSc = sekScores.run(task='scores')
    yearScores[year] = yearSc
allDF = {x: [] for x in ['BMS', 'CPTS', 'GSHS', 'MPG']}
for year in yearScores.keys():
    for sek in ['BMS', 'CPTS', 'GSHS', 'MPG']:
        df = pd.DataFrame([yearScores[year][sek]]).transpose().rename(columns={0: year})
        allDF[sek].append(df)

dfAllYearList = {}
for sek in ['BMS', 'CPTS', 'GSHS', 'MPG']:
    # Outer-join the per-year score columns on the n-gram index.
    dfAllYear = reduce(lambda df1, df2: pd.merge(df1, df2, left_index=True, right_index=True, how='outer'), allDF[sek])
    dfAllYearList[sek] = dfAllYear
dfAllYearList['BMS'].to_csv(dataPath + 'mpgWordScores/allWordsYears_BMS.csv')
dfAllYearList['CPTS'].to_csv(dataPath + 'mpgWordScores/allWordsYears_CPTS.csv')
dfAllYearList['GSHS'].to_csv(dataPath + 'mpgWordScores/allWordsYears_GSHS.csv')
dfAllYearList['MPG'].to_csv(dataPath + 'mpgWordScores/allWordsYears_MPG.csv')
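To trace how a term's score evolves, a per-Sektion file can be read back and filtered. A minimal sketch ('laser' is a hypothetical term of interest; the row index holds the n-gram keys, whose exact serialisation depends on the pandas version):

dfCPTS = pd.read_csv(dataPath + 'mpgWordScores/allWordsYears_CPTS.csv', index_col=0)
# Rows are n-grams, columns are years; pick rows mentioning the term.
laserRows = dfCPTS[dfCPTS.index.astype(str).str.contains('laser')]
print(laserRows.dropna(axis=1, how='all'))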