import pandas as pd
import numpy as np
import time
import re
import seaborn as sns
import plotly.express as px
t = time.time()
table_1 = pd.read_csv('data/table_1.csv')
table_2 = pd.read_csv('data/table_2.csv')
cities = pd.read_csv('data/cities_loc.csv')
time.time() - t
t = time.time()
table_1 = table_1.dropna(subset = ['pmid', 'doi'])
time.time() - t
t = time.time()
table_2 = table_2[table_2.year_pub > 1975].dropna(subset = ['pmid'])
time.time() - t
t = time.time()
final_table = table_1.merge(table_2, how = 'left', on = 'pmid').dropna()
time.time() - t
final_table
t = time.time()
final_table = final_table.assign(nb_aut = final_table.authors.map(lambda aut: len(re.findall('<AUTHOR>',aut))))
time.time() - t
final_table[['pmid','nb_aut']]
t = time.time()
final_table = final_table[final_table.nb_aut > 0]
time.time() - t
# distplot is deprecated in recent seaborn releases; kdeplot draws the same density curve
sns.kdeplot(np.log(final_table.nb_aut + 1), linewidth = 3)
How many papers contain 'deep learning', 'machine learning' or 'neural network' (also with an 's' for 'neural networks') in their title? Create a binary variable to save this information. What is the mean number of authors for ML papers and for non-ML papers?
Transform has_data and oa into binary variables as well. What is the share of ML papers that are open access (oa)?
t = time.time()
final_table['title'] = final_table.title.str.lower()
final_table = final_table.assign(
    # ML = 1 if the title mentions deep learning, machine learning or neural network(s)
    ML = np.where(final_table.title.str.contains('deep learning|machine learning|neural networks?'),1,0),
    # recode the 'Y'/'N' flags into 1/0
    has_data = np.where(final_table.has_data == 'Y',1,0),
    oa = np.where(final_table.oa == 'Y',1,0))
time.time() - t
final_table
np.sum(final_table.ML)
np.sum(final_table.oa)
np.sum(final_table.has_data)
final_table.groupby('ML').agg({'nb_aut': 'mean',
                               'oa': 'mean'})
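The oa mean in the ML = 1 row above is already the share of ML papers that are open access. As a quick cross-check, the same number can be computed directly from the binary columns created earlier (a minimal sketch using only the existing ML and oa columns):
# share of ML papers that are open access
final_table.loc[final_table.ML == 1, 'oa'].mean()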
t = time.time()
# keep the first publication type listed in pubtype; the raw string avoids invalid-escape warnings
final_table['pub_type'] = final_table.pubtype.map(lambda pubtype: re.findall(r'\["(.*?)"',pubtype)[0])
time.time() - t
final_table[['pubtype','pub_type']]
final_table.groupby('pub_type').agg({'cited': 'mean'}).sort_values('cited',ascending = False)
final_table.groupby('pub_type').agg({'cited': lambda x: np.std(x, ddof = 1)}).sort_values('cited',ascending = False)
countries = list(cities.country.str.lower().drop_duplicates())
countries_pd = final_table[['pmid', 'authors']]
t = time.time()
countries_pd = countries_pd.assign(
    # one row per author: split on the <AUTHOR> separator, then explode the resulting lists
    authors=countries_pd.authors.str.split('<AUTHOR>')).explode('authors')
time.time() - t
countries_pd
t = time.time()
# drop empty author strings and authors without an affiliation; boolean negation needs ~, not -
countries_pd = countries_pd[(countries_pd.authors != '')
                            & (~countries_pd.authors.str.contains('<AFFILIATION>None'))]
time.time() - t
t = time.time()
countries_pd = countries_pd.assign(
    # keep only the affiliation part of each author string
    authors = countries_pd.authors.map(lambda aff: aff.split('<AFFILIATION>')[1])).drop_duplicates()
time.time() - t
countries_pd
t = time.time()
# expand common abbreviations so they match the country names in cities_loc, then lowercase
for abr, name in zip([' USA', ' UK', ' Korea'], ['United States', 'United Kingdom', 'South Korea']):
    countries_pd.authors = countries_pd.authors.map(lambda aff: re.sub(abr,name,aff))
countries_pd.authors = countries_pd.authors.str.lower()
time.time() - t
t = time.time()
# keep only affiliations that mention a known country, then record the first matching country
countries_pd = countries_pd[countries_pd.authors.map(lambda aff: any(country in aff for country in countries))]
countries_matched = countries_pd.authors.map(lambda aff: [country for country in countries if country in aff][0])
countries_pd = countries_pd.assign(country = countries_matched).drop('authors', axis = 1)
time.time() - t
t = time.time()
countries_pd = countries_pd[countries_pd.country != ''].drop_duplicates()
time.time() - t
countries_pd
t = time.time()
countries_year = countries_pd.merge(final_table[['pmid','year_pub']], how = 'left', on = 'pmid')
time.time() - t
countries_year
countries_year = countries_year.groupby(['country', 'year_pub']).agg(n = ('pmid', len)).sort_values('n', ascending = False)
countries_year = countries_year.reset_index()
countries_year
wide_countries = countries_year.pivot(index="year_pub", columns="country", values="n").reset_index()
wide_countries = wide_countries[wide_countries.year_pub > 2000]
# 25 countries with the largest total number of papers
countries_to_plot = list(wide_countries.fillna(0).drop('year_pub',axis = 1).sum().sort_values()[-25:].index)
fig = px.bar(wide_countries, x="year_pub", y=countries_to_plot)
fig.update_yaxes(type="log")
fig.show()