Title keywords for MPG publications

This notebook calculates keyword scores for publication titles in each Sektion. Using the temporal information, the aim is to trace how word usage evolves over time.

English and German stopwords are excluded from the titles. Only words tagged as nouns or adjectives by NLTK are considered as keyword candidates.

Imports

import pandas as pd
import os
from tqdm import tqdm
import numpy as np
from collections import Counter
from functools import reduce


from string import punctuation
import nltk
# Run once to fetch the required NLTK resources:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords

# Combined English and German stopword list; using a set avoids shadowing
# the stopwords module and speeds up membership tests.
stop_words = set(stopwords.words('english') + stopwords.words('german'))

def cleanText(inputString):
    """Lowercase a title and drop stopwords, punctuation and non-alphanumeric tokens."""
    res = []
    for token in nltk.word_tokenize(inputString):
        if token.lower() not in stop_words and token not in punctuation and token.isalnum():
            res.append(token.lower())
    return ' '.join(res)
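
As an illustration (matching row 2 of the sample output below), cleaning a German title drops stopwords such as "die", "aus" and "der" and lowercases the remaining tokens:

cleanText('Die Gewinnung radioaktiver Indikatoren aus der Uranspaltung')
# -> 'gewinnung radioaktiver indikatoren uranspaltung'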

Read data from files

The corpus is created by applying a regex routine to the corpus file for every year; see Notebook 5.

dataPath = '../data/processedData/'
outPath = dataPath + 'mpgWordScores/'
os.makedirs(outPath, exist_ok=True)  # make sure the output folder exists
dfMPGall = pd.read_csv(dataPath + 'MPGPubSektionen_11112020.tsv',sep='\t')
cleanedText = dfMPGall.title.apply(cleanText)
dfMPGall.insert(0,'cleanText',cleanedText)
dfMPGall.head(3)
   cleanText | foundMPIs | sektion | year | pubID | title | lang | journalID | journalName | authors | affName
0  kondensationsvorgänge überhitztem arsenikdampf | Fritz-Haber-Institut der MPG;Fritz-Haber-Insti... | CPTS;CPTS | 1946 | sg:pub.10.1007/bf00643799 | Kondensationsvorgänge aus überhitztem Arsenikd... | ['de'] | sg:journal.1018189 | The Science of Nature | sg:person.013544521043.99;Korb_A. | https://www.grid.ac/institutes/grid.418028.7;h...
1  uber phosphorylase leukocyten | NaN | None | 1946 | sg:pub.10.1007/bf00624523 | Uber die Phosphorylase der Leukocyten | ['en'] | sg:journal.1018189 | The Science of Nature | Rohdewald_Margarete | https://www.grid.ac/institutes/grid.418441.c
2  gewinnung radioaktiver indikatoren uranspaltung | NaN | None | 1946 | sg:pub.10.1007/bf00590077 | Die Gewinnung radioaktiver Indikatoren aus der... | ['de'] | sg:journal.1018189 | The Science of Nature | Seelmann-Eggebert_W. | https://www.grid.ac/institutes/grid.419509.0

Scoring pipeline

The scoring routine follows the algorithm detailed in

Abe H., Tsumoto S. (2011) Evaluating a Temporal Pattern Detection Method for Finding Research Keys in Bibliographical Data. In: Peters J.F. et al. (eds) Transactions on Rough Sets XIV. Lecture Notes in Computer Science, vol 6600. Springer, Berlin, Heidelberg. https://doi.org/10.1007/978-3-642-21563-6_1
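
In the implementation below, the score of an ngram t = (w_1, ..., w_L) combines its corpus frequency with the variety of direct neighbours of each of its words (closely related to Nakagawa's FLR measure):

score(t) = freq(t) * ( prod_{i=1..L} (|left_i| + 1) * (|right_i| + 1) )^(1 / (2L))

where left_i and right_i are the sets of distinct words appearing immediately before and after w_i in the corpus bigrams. For example, a bigram occurring 3 times whose two words have (2, 1) distinct left and (1, 2) distinct right neighbours scores 3 * ((2+1)(1+1)(1+1)(2+1))^(1/4) = 3 * 36^(1/4) ≈ 7.35.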

class getScores(object):
    """Compute ngram scores for publication titles, per Sektion or for the full corpus."""

    def __init__(self, sourceDataframe):
        self.baseDF = sourceDataframe

        self.textCol = 'cleanText'
        self.pubIDCol = 'pubID'
        self.outputDict = {}
        self.useSektionen = True
        if self.useSektionen:
            self.sektionen = ['BMS','CPTS','GSHS','MPG']
            self.resultCorpus = {x:[] for x in self.sektionen}
            self.allNGrams = {x:[] for x in self.sektionen}
            self.allgramslist = {x:[] for x in self.sektionen}
            self.counts = {x:[] for x in self.sektionen}
            self.uniqueNGrams = {x:[] for x in self.sektionen}
            self.sektionDataframes = {x:'' for x in self.sektionen}
            for sek in self.sektionen:
                self.sektionDataframes[sek] = self.baseDF[self.baseDF.sektion.str.contains(sek).fillna(False)]
    
    def getTermPatterns(self, sektion=False, ngramEnd=5):
        allNGrams = {x:[] for x in range(1, ngramEnd + 1, 1)}
        pos_tag = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS']
        if self.useSektionen:
            for idx, row in tqdm(self.sektionDataframes[sektion].iterrows()):
                tokens = nltk.word_tokenize(row[self.textCol])
                pos = nltk.pos_tag(tokens)
                nnJJtokens = [x[0].lower() for x in pos if x[1] in pos_tag]
                self.outputDict[row[self.pubIDCol]] = nnJJtokens
                self.resultCorpus[sektion].append(nnJJtokens)
                for i in range(1, ngramEnd + 1, 1):
                    val = allNGrams[i]
                    val.extend(nltk.ngrams(nnJJtokens, i))
                    allNGrams.update({i:val})
            self.allNGrams[sektion] = allNGrams
            allgrams = [gram for grams in allNGrams.values() for gram in grams]  # flatten the per-length ngram lists
            self.allgramslist[sektion] = allgrams
            self.counts[sektion] = Counter(allgrams)
            self.uniqueNGrams[sektion] = set(allgrams)
        else:
            for idx, row in tqdm(self.baseDF.iterrows()):
                tokens = nltk.word_tokenize(row[self.textCol])
                pos = nltk.pos_tag(tokens)
                nnJJtokens = [x[0].lower() for x in pos if x[1] in pos_tag]
                #self.resultCorpus[sektion].append(nnJJtokens)
                tempNGram = []
                for i in range(1, ngramEnd + 1, 1):
                    val = allNGrams[i]
                    newngrams = list(nltk.ngrams(nnJJtokens, i))
                    val.extend(newngrams)
                    tempNGram.extend(newngrams)
                    allNGrams.update({i:val})
                self.outputDict[row[self.pubIDCol]] = tempNGram
            self.allNGrams = allNGrams
            allgrams = [gram for grams in allNGrams.values() for gram in grams]  # flatten the per-length ngram lists
            self.allgramslist = allgrams
            self.counts = Counter(allgrams)
            self.uniqueNGrams = set(allgrams)
        return 

    def getScore(self, target, sektion=False):
        if self.useSektionen:
            meta={'target':target, 'counts': self.counts[sektion][target], 'corpusL':len(self.allgramslist[sektion]), 'maxL':len(target)}
        else:
            meta={'target':target, 'counts': self.counts[target], 'corpusL':len(self.allgramslist), 'maxL':len(target)}
        res = {f'l_{x}':[] for x in range(1, meta['maxL'] + 1, 1)}
        resr = {f'r_{x}':[] for x in range(1, meta['maxL'] + 1, 1)}
        res.update(resr)
        
        # For each word of the target ngram, collect the distinct words seen
        # directly to its left and right in the corpus bigrams.
        for key, subgram in enumerate(target, start=1):
            if self.useSektionen:
                tupList = self.allNGrams[sektion][2]
            else:
                tupList = self.allNGrams[2]
            for tup in tupList:
                if tup[1] == subgram:
                    res[f'l_{key}'].append(tup[0])
                elif tup[0] == subgram:
                    res[f'r_{key}'].append(tup[1])

        valueList = []
        for L in range(1,meta['maxL']+1,1):
            leftkey = f'l_{L}'
            rightkey = f'r_{L}'
            valueList.append(((len(list(set(res[leftkey])))+1)*(len(list(set(res[rightkey])))+1)))
        return {target: meta['counts']*(np.prod(valueList))**(1/(2.*meta['maxL']))}
    
    def run(self,task='scores'):
        if task == 'scores':
            scores = {}
            for sek in self.sektionen:
                print(f'Calculating score for {sek}.')
                tmpScores = {}
                self.getTermPatterns(sektion=sek)
                for target in tqdm(self.uniqueNGrams[sek]):
                    tmpScores.update(self.getScore(target, sektion=sek))
                scores[sek] = tmpScores
            return scores
        elif task == 'scoresNet':
            scores = {}
            self.useSektionen = False
            print('Calculating score net.')
            self.getTermPatterns()
            for target in tqdm(self.uniqueNGrams):
                scores.update(self.getScore(target))
            for key,val in self.outputDict.items():
                tmpList = []
                for elem in val:
                    tmpList.append([elem,scores[elem]])
                self.outputDict.update({key:tmpList})
            return scores, self.outputDict
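
A minimal usage sketch on hypothetical toy data (the pipeline below runs this on the yearly groups of dfMPGall):

toyDF = pd.DataFrame({
    'cleanText': ['quantum field theory', 'quantum chemistry'],
    'pubID': ['pub1', 'pub2'],
    'sektion': ['CPTS', 'CPTS'],
})
toyScorer = getScores(sourceDataframe=toyDF)
toyScoresNet, toyKeyPub = toyScorer.run(task='scoresNet')  # ngram scores and per-publication [ngram, score] lists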

Calculate score network

For each year, calculate the distribution of unique ngrams, assign a score to each ngram, and build the list of ngrams occurring in each publication title. The score of an ngram determines the weight of the edge between the ngram and the publication. The results are saved as TSV files, which are later read to create multilayer networks.

yearKeyPubMap = {}
for year,df in dfMPGall.groupby('year'):
    print(f'Processing data for {year}')
    sekScores = getScores(sourceDataframe=df)
    yearSc, yearKeyPub = sekScores.run(task='scoresNet')
    yearKeyPubMap[year] = yearKeyPub
Processing data for 1946
Calculating score net.
Processing data for 1947
Calculating score net.
[... interleaved tqdm progress output for the remaining years up to 2019 omitted ...]
Processing data for 2019
Calculating score net.
for year in yearKeyPubMap.keys():
    res = []
    for pubID, ngramScores in yearKeyPubMap[year].items():
        for ngram, score in ngramScores:
            res.append([pubID, ngram, score])
    dfTemp = pd.DataFrame(res, columns=['pubID', 'ngram', 'score'])
    dfTemp.to_csv(outPath + str(year) + '.tsv', sep='\t', index=False)
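
Each per-year file can then be loaded as a weighted bipartite publication-ngram network. A minimal sketch, assuming networkx is available (it is not used elsewhere in this notebook) and using the column names written above:

import networkx as nx

dfYear = pd.read_csv(outPath + '1950.tsv', sep='\t')
B = nx.Graph()
# Publications and ngrams form the two node sets; the ngram score is the edge weight.
# Note: the ngram column holds stringified tuples after the CSV round-trip.
B.add_nodes_from(dfYear['pubID'].unique(), bipartite='publication')
B.add_nodes_from(dfYear['ngram'].unique(), bipartite='ngram')
B.add_weighted_edges_from(dfYear[['pubID', 'ngram', 'score']].itertuples(index=False, name=None))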
    

Calculate scores

Alternatively, calculate scores for each Sektion to later compare the temporal evolution of word usage. The scores are computed separately for each Sektion and saved as CSV files covering all years.

yearScores = {}
for year,df in dfMPGall.groupby('year'):
    print(f'Processing data for {year}')
    sekScores = getScores(sourceDataframe=df)
    yearSc = sekScores.run(task='scores')
    yearScores[year] = yearSc
allDF = {x: [] for x in ['BMS','CPTS','GSHS','MPG']}
for year in yearScores.keys():
    for sek in ['BMS','CPTS','GSHS','MPG']:
        # One column per year, indexed by ngram
        df = pd.DataFrame([yearScores[year][sek]]).transpose().rename(columns={0: year})
        allDF[sek].append(df)
dfAllYearList = {}
for sek in ['BMS','CPTS','GSHS','MPG']:
    # Outer-join the yearly score columns on the ngram index
    dfAllYear = reduce(lambda df1, df2: pd.merge(df1, df2, left_index=True, right_index=True, how='outer'), allDF[sek])
    dfAllYearList[sek] = dfAllYear
for sek in ['BMS','CPTS','GSHS','MPG']:
    dfAllYearList[sek].to_csv(outPath + f'allWordsYears_{sek}.csv')
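
To inspect the temporal evolution of word usage, the saved tables can be reloaded and ranked per year. An illustrative sketch (reading one of the files written above; index_col=0 restores the ngrams as the index):

dfBMS = pd.read_csv(outPath + 'allWordsYears_BMS.csv', index_col=0)
for year in dfBMS.columns[:3]:
    print(year, dfBMS[year].nlargest(5).index.tolist())  # top five scoring ngrams for each year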