Languages in the corpus

For later analysis, the language of texts is of importance. We therefore need statistics of the languages, to later focus on english and german texts.

import pandas as pd
import datetime
import os
import re
import random
import numpy as np
from tqdm import tqdm
from functools import partial
from multiprocessing import  Pool, cpu_count

from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt

Setup paths

yearBasePath = '/media/arbeit/b88b17b8-b5d7-4c4e-86bc-ff4b67458842/sciGraph/processedData/yeardata/'
yearLinkData = sorted([x for x in os.listdir(yearBasePath) if x.endswith('csv')])
def getNAuthors(row):
    if row:
        return len(row.split(';'))
    else:
        return 0
res = []
for year in tqdm(yearLinkData):
    try:
        dfTemp = pd.read_csv(yearBasePath + year,sep='\t',header=None, low_memory=False)
        dfTemp = dfTemp.rename(columns={0:'year',1:'publicationID',2:'title',3:'language',4:'journalID',5:'journal',6:'hasMPG',7:'authors',8:'affiliation'})
        dfLang = dfTemp.language.value_counts().to_frame().reset_index()
        langCounts = dfLang[dfLang["index"].isin(["['en']", "['de']","['fr']","['it']","['nl']","['ru']"])].set_index('index').to_dict()['language']
        langCounts.update(
            {'year': dfTemp.year.unique()[0],
             'N_pub':dfTemp.shape[0],
             'N_no_aut':dfTemp[dfTemp['authors'].isna()].shape[0],
             'N_no_aff':dfTemp[dfTemp['affiliation'].isna()].shape[0],
             'median authors':dfTemp.authors.fillna('').apply(lambda row: getNAuthors(row)).median() 
            })
        res.append(langCounts)
    except pd.errors.ParserError:
        print(year)
    
100%|██████████| 189/189 [02:00<00:00,  1.57it/s]
df = pd.DataFrame(res)
df.shape
(189, 11)
other = df.fillna(0).apply(lambda row: row['N_pub'] - row["['en']"] - row["['de']"] - row["['ru']"] - row["['fr']"] - row["['it']"],axis=1)
notDeEng = df.fillna(0).apply(lambda row: row['N_pub'] - row["['en']"] - row["['de']"],axis=1)
df.insert(0,'other',other)
df.insert(0,'not_de/eng',notDeEng)
df = df.set_index('year')
df.head(3)
not_de/eng other ['en'] N_pub N_no_aut N_no_aff median authors ['fr'] ['it'] ['de'] ['nl'] ['ru']
year
1832 0.0 0.0 50 50 10 42 1.0 NaN NaN NaN NaN NaN
1833 0.0 0.0 62 62 16 54 1.0 NaN NaN NaN NaN NaN
1834 1.0 1.0 55 56 14 43 1.0 NaN NaN NaN NaN NaN
1835 3.0 2.0 59 62 14 54 1.0 1.0 NaN NaN NaN NaN
1836 6.0 3.0 56 62 12 46 1.0 NaN 3.0 NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
2016 24105.0 19819.0 436896 486830 27854 50266 4.0 1683.0 2596.0 25829.0 7609.0 7.0
2017 30052.0 24435.0 551611 610996 36302 60672 4.0 1902.0 3704.0 29333.0 7410.0 11.0
2018 30017.0 25263.0 633293 694995 39056 80009 4.0 967.0 3778.0 31685.0 7073.0 9.0
2019 5821.0 4942.0 285185 302938 9249 45126 4.0 210.0 665.0 11932.0 1482.0 4.0
NaN 1.0 1.0 44 45 6 33 3.0 NaN NaN NaN NaN NaN

189 rows × 12 columns

Plots

What is the temporal distribution of languages other then english in the corpus?

  • The rise of netherlands language after 1996 is due to miss-detection of english short texts like book presentations in e.g. Nature.

  • The same might be true for italian.

with PdfPages('../results/Fig2_MinorLanguages.pdf') as pdffigure:
    fig, ax1 = plt.subplots()

    color = 'tab:red'
    ax1.set_xlabel('year')
    ax1.set_ylabel('Publications in languages', color=color)
    plt1 = df.iloc[113:][['other',"['nl']","['ru']","['it']","['fr']","['de']"]].plot(ax=ax1)
    ax1.tick_params(axis='y', labelcolor=color)
    ax2 = ax1.twinx()  
    ax1.legend(loc=0)
    color = 'tab:gray'
    ax2.set_ylabel('Number of publications', color=color)  
    plt2 = df.iloc[113:][['N_pub']].plot(ax=ax2,color=color, style='.')
    ax2.tick_params(axis='y', labelcolor=color)
    ax2.legend(loc=9)

    fig.tight_layout()  
    pdffigure.savefig(bbox_inches="tight", dpi=400)
    plt.show()
    plt.close()
/home/arbeit/Dokumente/gwdgGitlab/GMPG/gmpg-notebooks/env/lib/python3.8/site-packages/pandas/plotting/_matplotlib/core.py:1235: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(xticklabels)
../_images/4_Languages_15_1.png

Compare contribution of german to english texts in the corpus, with all other languages as addition.

with PdfPages('../results/Fig3_MajorLanguages.pdf') as pdffigure:
    fig, ax1 = plt.subplots()

    color = 'tab:red'
    ax1.set_xlabel('year')
    ax1.set_ylabel('Publications in languages', color=color)
    plt1 = df.iloc[113:][['not_de/eng',"['en']","['de']"]].plot(ax=ax1)
    ax1.tick_params(axis='y', labelcolor=color)
    ax2 = ax1.twinx()  
    ax1.legend(loc=0)
    color = 'tab:gray'
    ax2.set_ylabel('Number of publications', color=color)  
    plt2 = df.iloc[113:][['N_pub']].plot(ax=ax2,color=color, style='.')
    ax2.tick_params(axis='y', labelcolor=color)
    ax2.legend(loc=6)

    fig.tight_layout()
    pdffigure.savefig(bbox_inches="tight", dpi=400)
    plt.show()
    plt.close()
/home/arbeit/Dokumente/gwdgGitlab/GMPG/gmpg-notebooks/env/lib/python3.8/site-packages/pandas/plotting/_matplotlib/core.py:1235: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(xticklabels)
../_images/4_Languages_17_1.png
with PdfPages('../results/Fig4_MedianNAuthors.pdf') as pdffigure:
    fig, ax1 = plt.subplots()

    color = 'tab:red'
    ax1.set_xlabel('year')
    ax1.set_ylabel('Median Number of authors', color=color)
    plt1 = df.iloc[113:][['median authors']].plot(ax=ax1, color=color)
    ax1.tick_params(axis='y', labelcolor=color)
    ax2 = ax1.twinx()  
    ax1.legend(loc=0)
    color = 'tab:gray'
    ax2.set_ylabel('Number of publications', color=color) 
    plt2 = df.iloc[113:][['N_pub']].plot(ax=ax2,color=color, style='.')
    ax2.tick_params(axis='y', labelcolor=color)
    ax2.legend(loc=4)

    fig.tight_layout()  
    pdffigure.savefig(bbox_inches="tight", dpi=400)
    plt.show()
    plt.close()
/home/arbeit/Dokumente/gwdgGitlab/GMPG/gmpg-notebooks/env/lib/python3.8/site-packages/pandas/plotting/_matplotlib/core.py:1235: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(xticklabels)
../_images/4_Languages_18_1.png