# Python
import pandas as pd
import re
covid_papers = pd.read_csv("data/covid_papers.csv")
covid_papers
## _id pmid pmcid ... cited has_data oa
## 0 5f69d6fa65a250f0edc4ee5c 32837036.0 PMC7424135 ... 0 N Y
## 1 5f69d6fa65a250f0edc4ee5d 32196410.0 PMC7170333 ... 61 N Y
## 2 5f69d6fa65a250f0edc4ee5e 32778421.0 PMC7402212 ... 0 N Y
## 3 5f69d6fa65a250f0edc4ee5f 32730205.0 PMC7382925 ... 0 N Y
## 4 5f69d6fa65a250f0edc4ee60 NaN NaN ... 0 N N
## ... ... ... ... ... ... ... ..
## 148911 5f69f29165a250f0edc7340b 31314321.0 PMC7121204 ... 0 N Y
## 148912 5f69f29165a250f0edc7340c 31314312.0 PMC7123335 ... 0 N Y
## 148913 5f69f29165a250f0edc7340d 32091757.0 PMC7124085 ... 0 N Y
## 148914 5f69f29165a250f0edc7340e 28590697.0 PMC7193726 ... 0 N Y
## 148915 5f69f29165a250f0edc7340f 28590695.0 PMC7193709 ... 0 N Y
##
## [148916 rows x 19 columns]
covid_papers.sample(frac = 0.001).shape
## (149, 19)
# Python
covid_papers.doi
## 0 10.1007/s12291-020-00919-0
## 1 10.1080/22221751.2020.1746199
## 2 10.1016/j.jiph.2020.07.011
## 3 10.1016/j.ijsu.2020.07.032
## 4 10.20944/preprints202008.0312.v1
## ...
## 148911 NaN
## 148912 NaN
## 148913 NaN
## 148914 NaN
## 148915 NaN
## Name: doi, Length: 148916, dtype: object
covid_papers[['doi','title']]
## doi title
## 0 10.1007/s12291-020-00919-0 The Pathophysiology, Diagnosis and Treatment o...
## 1 10.1080/22221751.2020.1746199 Hypothesis for potential pathogenesis of SARS-...
## 2 10.1016/j.jiph.2020.07.011 The outbreak of the novel severe acute respira...
## 3 10.1016/j.ijsu.2020.07.032 A comparative overview of COVID-19, MERS and S...
## 4 10.20944/preprints202008.0312.v1 Virion Structure and Mechanism of Propagation ...
## ... ... ...
## 148911 NaN BMT Settings, Infection and Infection Control
## 148912 NaN Early and Acute Complications and the Principl...
## 148913 NaN Viral Infections
## 148914 NaN International Collaboration for Global Public ...
## 148915 NaN Resource Allocation and Priority Setting
##
## [148916 rows x 2 columns]
covid_papers.drop(['doi','creation'],axis = 1)
## _id pmid pmcid ... cited has_data oa
## 0 5f69d6fa65a250f0edc4ee5c 32837036.0 PMC7424135 ... 0 N Y
## 1 5f69d6fa65a250f0edc4ee5d 32196410.0 PMC7170333 ... 61 N Y
## 2 5f69d6fa65a250f0edc4ee5e 32778421.0 PMC7402212 ... 0 N Y
## 3 5f69d6fa65a250f0edc4ee5f 32730205.0 PMC7382925 ... 0 N Y
## 4 5f69d6fa65a250f0edc4ee60 NaN NaN ... 0 N N
## ... ... ... ... ... ... ... ..
## 148911 5f69f29165a250f0edc7340b 31314321.0 PMC7121204 ... 0 N Y
## 148912 5f69f29165a250f0edc7340c 31314312.0 PMC7123335 ... 0 N Y
## 148913 5f69f29165a250f0edc7340d 32091757.0 PMC7124085 ... 0 N Y
## 148914 5f69f29165a250f0edc7340e 28590697.0 PMC7193726 ... 0 N Y
## 148915 5f69f29165a250f0edc7340f 28590695.0 PMC7193709 ... 0 N Y
##
## [148916 rows x 16 columns]
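# The columns keyword is an equivalent, more explicit spelling of the same drop:
covid_papers.drop(columns = ['doi','creation'])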
# Python
import numpy as np
# rename the first three columns to V1, V2, V3
covid_papers = covid_papers.rename({name: replace for name, replace in zip(covid_papers.columns[:3], 'V' + np.char.array(['1','2','3']))}, axis = 1)
# filter columns using regex
covid_papers.filter(regex=("V[12]"))
covid_papers.filter(regex=("V.*"))
covid_papers.filter(regex=("3$"))
covid_papers.filter(regex=("^V"))
# Python
covid_papers.loc[3:4, :]  # select rows by label (inclusive on both ends)
covid_papers.drop(range(3,8))
covid_papers[covid_papers.year_pub > 2018]
covid_papers[covid_papers.title.isin(["Deep convolutional approaches for the analysis of Covid-19 using chest X-Ray images from portable devices",
"Evaluating the effect of city lock-down on controlling COVID-19 propagation through deep learning and network science models."])]
covid_papers[covid_papers.year_pub == 2018 & covid_papers.title.isin(["Deep convolutional approaches for the analysis of Covid-19 using chest X-Ray images from portable devices",
"Evaluating the effect of city lock-down on controlling COVID-19 propagation through deep learning and network science models."])]
covid_papers = covid_papers.dropna(subset = ['title'])
covid_papers[covid_papers.title.str.contains('deep learning')][:5]
covid_papers.drop_duplicates()
covid_papers.drop_duplicates(subset=['V1', 'title'])
# Python
covid_papers.sort_values('year_pub')
## V1 V2 V3 ... cited has_data oa
## 148865 5f69f28f65a250f0edc733dd NaN PMC5613494 ... 0 N Y
## 148799 5f69f28c65a250f0edc7339b NaN PMC5639501 ... 0 N Y
## 148793 5f69f28b65a250f0edc73395 29919123.0 PMC5100527 ... 0 N Y
## 148888 5f69f29065a250f0edc733f4 29917937.0 PMC5082205 ... 0 N Y
## 148867 5f69f28f65a250f0edc733df 29918028.0 PMC5086967 ... 0 N Y
## ... ... ... ... ... ... ... ..
## 138520 5f69f09565a250f0edc70b74 NaN PMC7120167 ... 0 N Y
## 139464 5f69f0c265a250f0edc70f24 NaN PMC7123719 ... 0 N Y
## 142783 5f69f16265a250f0edc71c1b NaN PMC7121730 ... 0 N Y
## 145952 5f69f1fb65a250f0edc7287c NaN PMC7153414 ... 0 N Y
## 147551 5f69f24a65a250f0edc72ebb NaN PMC7123278 ... 0 N Y
##
## [148872 rows x 18 columns]
covid_papers.sort_values('year_pub',ascending=False)
## V1 V2 V3 ... cited has_data oa
## 118345 5f69ecb165a250f0edc6bca5 32835130.0 PMC7332907 ... 0 N Y
## 92675 5f69e7a965a250f0edc6585f NaN PMC7258713 ... 0 N Y
## 125910 5f69ee2465a250f0edc6da32 32904425.0 PMC7455151 ... 0 N Y
## 50368 5f69dfcb65a250f0edc5b31c 32834290.0 PMC7396951 ... 0 N Y
## 16091 5f69d9b565a250f0edc52d37 32921917.0 NaN ... 0 N N
## ... ... ... ... ... ... ... ..
## 138520 5f69f09565a250f0edc70b74 NaN PMC7120167 ... 0 N Y
## 139464 5f69f0c265a250f0edc70f24 NaN PMC7123719 ... 0 N Y
## 142783 5f69f16265a250f0edc71c1b NaN PMC7121730 ... 0 N Y
## 145952 5f69f1fb65a250f0edc7287c NaN PMC7153414 ... 0 N Y
## 147551 5f69f24a65a250f0edc72ebb NaN PMC7123278 ... 0 N Y
##
## [148872 rows x 18 columns]
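# Several sort keys can be combined, each with its own direction:
covid_papers.sort_values(['year_pub','cited'], ascending = [False, True])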
# Python
covid_papers.agg({'cited':sum})
## cited 1241806
## dtype: int64
covid_papers.agg({'cited':[sum,np.std]})
## cited
## sum 1.241806e+06
## std 3.883440e+01
# Python
covid_papers.assign(oa = covid_papers.oa.map(lambda oa: 1 if oa=='Y' else 0),
                    log_cited = np.log(covid_papers.cited+1))
## V1 V2 ... oa log_cited
## 0 5f69d6fa65a250f0edc4ee5c 32837036.0 ... 1 0.000000
## 1 5f69d6fa65a250f0edc4ee5d 32196410.0 ... 1 4.127134
## 2 5f69d6fa65a250f0edc4ee5e 32778421.0 ... 1 0.000000
## 3 5f69d6fa65a250f0edc4ee5f 32730205.0 ... 1 0.000000
## 4 5f69d6fa65a250f0edc4ee60 NaN ... 0 0.000000
## ... ... ... ... .. ...
## 148911 5f69f29165a250f0edc7340b 31314321.0 ... 1 0.000000
## 148912 5f69f29165a250f0edc7340c 31314312.0 ... 1 0.000000
## 148913 5f69f29165a250f0edc7340d 32091757.0 ... 1 0.000000
## 148914 5f69f29165a250f0edc7340e 28590697.0 ... 1 0.000000
## 148915 5f69f29165a250f0edc7340f 28590695.0 ... 1 0.000000
##
## [148872 rows x 19 columns]
np.where( covid_papers.oa=='Y',1,0)
## array([1, 1, 1, ..., 1, 1, 1])
covid_papers.title.map(lambda x: str(x).lower())
## 0 the pathophysiology, diagnosis and treatment o...
## 1 hypothesis for potential pathogenesis of sars-...
## 2 the outbreak of the novel severe acute respira...
## 3 a comparative overview of covid-19, mers and s...
## 4 virion structure and mechanism of propagation ...
## ...
## 148911 bmt settings, infection and infection control
## 148912 early and acute complications and the principl...
## 148913 viral infections
## 148914 international collaboration for global public ...
## 148915 resource allocation and priority setting
## Name: title, Length: 148872, dtype: object
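# The vectorized string accessor is an equivalent, usually faster, alternative to map:
covid_papers.title.str.lower()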
# Python
covid_papers.groupby('year_pub').language.nunique()
## year_pub
## 1820.0 1
## 1821.0 1
## 1830.0 1
## 1832.0 1
## 1833.0 1
## ..
## 2017.0 12
## 2018.0 8
## 2019.0 10
## 2020.0 21
## 2021.0 2
## Name: language, Length: 137, dtype: int64
covid_papers.groupby('year_pub').agg({'cited':sum})
## cited
## year_pub
## 1820.0 0
## 1821.0 0
## 1830.0 0
## 1832.0 0
## 1833.0 0
## ... ...
## 2017.0 24850
## 2018.0 17139
## 2019.0 10151
## 2020.0 147528
## 2021.0 2
##
## [137 rows x 1 columns]
# first obs for each group
covid_papers.groupby('source').first()
## V1 V2 V3 ... cited has_data oa
## source ...
## AGR 5f69dfd965a250f0edc5b44c NaN None ... 0 N N
## CBA 5f69e54965a250f0edc626c8 NaN None ... 0 N N
## CTX 5f69ebc365a250f0edc6a9dc NaN None ... 0 N N
## ETH 5f69e80565a250f0edc65f8e NaN None ... 0 N N
## HIR 5f69e5dd65a250f0edc63360 NaN None ... 0 N N
## MED 5f69d6fa65a250f0edc4ee5c 32837036.0 PMC7424135 ... 0 N Y
## PAT 5f69e34f65a250f0edc5fdbb NaN None ... 0 N N
## PMC 5f69d6fa65a250f0edc4ee61 NaN PMC7480804 ... 0 N Y
## PPR 5f69d6fa65a250f0edc4ee60 NaN None ... 0 N N
##
## [9 rows x 17 columns]
covid_papers.groupby('source').last()
## V1 V2 V3 ... cited has_data oa
## source ...
## AGR 5f69f16f65a250f0edc71d24 NaN None ... 0 N N
## CBA 5f69f0cf65a250f0edc7103d NaN None ... 0 N N
## CTX 5f69f00f65a250f0edc700b7 NaN None ... 0 N N
## ETH 5f69ee9365a250f0edc6e2d7 NaN None ... 0 N N
## HIR 5f69e5dd65a250f0edc63360 NaN None ... 0 N N
## MED 5f69f29165a250f0edc7340f 28590695.0 PMC7193709 ... 0 N Y
## PAT 5f69f1e565a250f0edc726c3 NaN None ... 0 N N
## PMC 5f69f29065a250f0edc733f1 NaN PMC2297516 ... 0 N N
## PPR 5f69eaec65a250f0edc698de NaN None ... 0 N N
##
## [9 rows x 17 columns]
# Add a column with the number of observations for each group
covid_papers['grp_count'] = covid_papers.groupby('source')['source'].transform('count')
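# value_counts() gives the same per-group counts as a standalone Series:
covid_papers.source.value_counts()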
# Python
covid_papers.groupby('source').apply(lambda df: df.iloc[:3])
covid_papers.groupby('source').apply(lambda df: df.iloc[-5:])
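# head() and tail() on a GroupBy are idiomatic equivalents that avoid apply:
covid_papers.groupby('source').head(3)
covid_papers.groupby('source').tail(5)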
# Python
# By several groups
covid_papers.groupby(['source','year_pub']).ngroups  # number of groups
# Python
covid_papers.groupby(['source','year_pub']).size()
## source year_pub
## AGR 1987.0 2
## 1991.0 1
## 1993.0 1
## 1994.0 1
## 1995.0 10
## ...
## PPR 2016.0 6
## 2017.0 15
## 2018.0 24
## 2019.0 34
## 2020.0 13726
## Length: 301, dtype: int64
# Python
covid_papers.groupby(['source','year_pub']).agg({'cited':np.mean,'V1':'count'})
## cited V1
## source year_pub
## AGR 1987.0 0.500000 2
## 1991.0 1.000000 1
## 1993.0 3.000000 1
## 1994.0 0.000000 1
## 1995.0 11.600000 10
## ... ... ...
## PPR 2016.0 0.000000 6
## 2017.0 0.533333 15
## 2018.0 0.541667 24
## 2019.0 0.147059 34
## 2020.0 0.120866 13726
##
## [301 rows x 2 columns]
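# Named aggregation is a modern equivalent that also labels the output columns:
covid_papers.groupby(['source','year_pub']).agg(cited_mean = ('cited','mean'), n = ('V1','count'))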
# Python
# Summarise several columns
covid_papers.agg({var:func for var, func in zip(['cited','year_pub'],[np.mean]*2)})
## cited 8.341434
## year_pub 2015.348434
## dtype: float64
# Summarise several columns by group
covid_papers.groupby(['source','year_pub']).agg({var:func for var, func in zip(['cited','year_pub'],[np.mean]*2)})
## cited year_pub
## source year_pub
## AGR 1987.0 0.500000 1987.0
## 1991.0 1.000000 1991.0
## 1993.0 3.000000 1993.0
## 1994.0 0.000000 1994.0
## 1995.0 11.600000 1995.0
## ... ... ...
## PPR 2016.0 0.000000 2016.0
## 2017.0 0.533333 2017.0
## 2018.0 0.541667 2018.0
## 2019.0 0.147059 2019.0
## 2020.0 0.120866 2020.0
##
## [301 rows x 2 columns]
# using patterns (regex)
cols = list(filter(lambda x: x.startswith("V"),covid_papers.columns.tolist()))
covid_papers.groupby(['source','year_pub']).agg({var:func for var, func in zip(cols,['count']*len(cols))})
## V1 V2 V3
## source year_pub
## AGR 1987.0 2 0 0
## 1991.0 1 0 0
## 1993.0 1 0 0
## 1994.0 1 0 0
## 1995.0 10 0 0
## ... ... .. ..
## PPR 2016.0 6 0 0
## 2017.0 15 0 0
## 2018.0 24 0 0
## 2019.0 34 0 0
## 2020.0 13726 0 0
##
## [301 rows x 3 columns]
# Summarise with more than one function by group
covid_papers.groupby(['source']).agg({var: func for var, func in zip(['year_pub','cited'],[np.mean,np.median])})
## year_pub cited
## source
## AGR 2003.371429 2
## CBA 2004.014870 0
## CTX 2001.333333 0
## ETH 2004.666667 0
## HIR 2004.000000 0
## MED 2015.179733 0
## PAT 2005.511289 0
## PMC 2013.610694 0
## PPR 2019.988194 0
def tobinary(x):
    if str(x) == 'Y':
        return 1
    else:
        return 0
# Modify several columns (dropping the others)
covid_papers.agg({var:func for var, func in zip(['oa','has_data'],[tobinary]*2)})
## oa has_data
## 0 1 0
## 1 1 0
## 2 1 0
## 3 1 0
## 4 0 0
## ... .. ...
## 148911 1 0
## 148912 1 0
## 148913 1 0
## 148914 1 0
## 148915 1 0
##
## [148872 rows x 2 columns]
# Modify several columns (keeping the others)
for var in ['oa','has_data']:
    covid_papers[var] = covid_papers[var].map(tobinary)
# Modify columns using a condition
for var in ['oa','has_data']:
    if covid_papers[var].dtypes == 'O':
        covid_papers[var] = covid_papers[var].map(tobinary)
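# Equivalent without an explicit loop: unpack a dict comprehension into assign
# (a sketch; run it instead of, not after, the loops above, since tobinary maps
# already-binarized 0/1 values to 0):
covid_papers.assign(**{var: covid_papers[var].map(tobinary) for var in ['oa','has_data']})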
# Python
grp_size = covid_papers.groupby('year_pub').size().to_frame()
grp_size.columns = ['share_en']
# is_en flags English-language papers; summing per year and dividing by the group
# size gives the share (oa was already binarized above, so it needs no converting here)
covid_papers.assign(is_en = np.where(covid_papers.language == 'eng',1,0)).groupby('year_pub').agg(
    share_en = ('is_en','sum')).div(grp_size).sort_values('share_en')
## share_en
## year_pub
## 1956.0 0.666667
## 1964.0 0.750000
## 1966.0 0.750000
## 1967.0 0.785714
## 2020.0 0.796331
## ... ...
## 1935.0 1.000000
## 1968.0 1.000000
## 1936.0 1.000000
## 1965.0 1.000000
## 1820.0 1.000000
##
## [137 rows x 1 columns]
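# Since is_en is binary, its per-group mean is the share directly, so the separate division by group size can be skipped:
covid_papers.assign(is_en = np.where(covid_papers.language == 'eng',1,0)).groupby('year_pub').agg(
    share_en = ('is_en','mean')).sort_values('share_en')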
When a variable stores multiple categories in a single cell, as the 'pubtype' variable does in our dataset, you may want to create a separate column for each category. The opposite is also possible: several columns whose values you want to collapse into two columns, one holding the factor and the other the values.
To clean this variable and separate the string in each cell, you can combine pandas string methods with reshaping: str.split() breaks a single column on a delimiter, explode() turns the resulting lists into one row per category, and pivot() (or unstack()) spreads the long-format data back out into one column per category. This is what we use below to create separate rows, and then separate columns, for each pubtype.
# Python
covid_papers.pubtype.loc[1]
## '{"pubType": ["review-article", "Journal Article"]}'
def clean_pubtype(str_):
    return re.sub('[{]|[}]|pubType|: |[[]|[]]', '', str_)
clean_pubtype(covid_papers.pubtype.loc[1])
## '"""review-article", "Journal Article"'
covid_papers['pubtype'] = covid_papers.pubtype.map(lambda pt: clean_pubtype(str(pt)))
pubtype_cp = covid_papers.assign(pubtype = covid_papers.pubtype.str.split('", "')).explode('pubtype')
pubtype_cp['pubtype'] = pubtype_cp.pubtype.map(lambda pubtype: re.sub('"', '', pubtype))
pubtype_cp
## V1 V2 V3 ... oa grp_count
## 0 5f69d6fa65a250f0edc4ee5c 32837036.0 PMC7424135 ... 1 113919
## 0 5f69d6fa65a250f0edc4ee5c 32837036.0 PMC7424135 ... 1 113919
## 0 5f69d6fa65a250f0edc4ee5c 32837036.0 PMC7424135 ... 1 113919
## 1 5f69d6fa65a250f0edc4ee5d 32196410.0 PMC7170333 ... 1 113919
## 1 5f69d6fa65a250f0edc4ee5d 32196410.0 PMC7170333 ... 1 113919
## ... ... ... ... ... .. ...
## 148913 5f69f29165a250f0edc7340d 32091757.0 PMC7124085 ... 1 113919
## 148914 5f69f29165a250f0edc7340e 28590697.0 PMC7193726 ... 1 113919
## 148914 5f69f29165a250f0edc7340e 28590697.0 PMC7193726 ... 1 113919
## 148915 5f69f29165a250f0edc7340f 28590695.0 PMC7193709 ... 1 113919
## 148915 5f69f29165a250f0edc7340f 28590695.0 PMC7193709 ... 1 113919
##
## [308408 rows x 19 columns]
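# pd.crosstab is a shortcut that counts the (V2, pubtype) pairs directly on the
# exploded frame above, matching the counted-and-pivoted result built below:
pd.crosstab(pubtype_cp.V2, pubtype_cp.pubtype)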
# count papers per (V2, pubtype) pair; keep an indexed version for unstack()
pubtype_cp_i = pubtype_cp[['V2','pubtype']].groupby(['V2','pubtype']).size().to_frame('n')
pubtype_cp = pubtype_cp_i.reset_index()
# Python
pubtype_cp_i.unstack().fillna(0)
## n ...
## pubtype AIM Adaptive Clinical Trial ... review-article systematic-review
## V2 ...
## 2142.0 0.0 0.0 ... 0.0 0.0
## 2145.0 0.0 0.0 ... 0.0 0.0
## 12235.0 0.0 0.0 ... 0.0 0.0
## 30881.0 0.0 0.0 ... 0.0 0.0
## 36849.0 0.0 0.0 ... 0.0 0.0
## ... ... ... ... ... ...
## 32931118.0 0.0 0.0 ... 0.0 0.0
## 32931136.0 0.0 0.0 ... 0.0 0.0
## 32931146.0 0.0 0.0 ... 0.0 0.0
## 32931161.0 0.0 0.0 ... 0.0 0.0
## 32931162.0 0.0 0.0 ... 0.0 0.0
##
## [113919 rows x 118 columns]
casted_pubtype_cp = pubtype_cp.pivot(index="V2", columns="pubtype", values="n").fillna(0)
casted_pubtype_cp.shape
## (113919, 118)
# Python
casted_pubtype_cp.stack()
## V2 pubtype
## 2142.0 AIM 0.0
## Adaptive Clinical Trial 0.0
## Address 0.0
## Addresses 0.0
## Autobiography 0.0
## ...
## 32931162.0 report 0.0
## research-article 0.0
## retraction 0.0
## review-article 0.0
## systematic-review 0.0
## Length: 13442442, dtype: float64
casted_pubtype_cp.stack().to_frame('n').reset_index()
## V2 pubtype n
## 0 2142.0 AIM 0.0
## 1 2142.0 Adaptive Clinical Trial 0.0
## 2 2142.0 Address 0.0
## 3 2142.0 Addresses 0.0
## 4 2142.0 Autobiography 0.0
## ... ... ... ...
## 13442437 32931162.0 report 0.0
## 13442438 32931162.0 research-article 0.0
## 13442439 32931162.0 retraction 0.0
## 13442440 32931162.0 review-article 0.0
## 13442441 32931162.0 systematic-review 0.0
##
## [13442442 rows x 3 columns]
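# melt() is the column-based route back to long format, equivalent to the stack()/reset_index() chain above:
casted_pubtype_cp.reset_index().melt(id_vars = 'V2', var_name = 'pubtype', value_name = 'n')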
# Python
list(covid_papers.groupby('oa'))
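# get_group() pulls a single group's sub-frame without materializing every group
# (oa was converted to 0/1 earlier, so the group keys here are 0 and 1):
covid_papers.groupby('oa').get_group(1)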
# Python
covid_papers[['doi1','doi2']] = covid_papers.doi.str.split("/",expand=True).loc[:,:1]
covid_papers[['doi','doi1','doi2']]
## doi doi1 doi2
## 0 10.1007/s12291-020-00919-0 10.1007 s12291-020-00919-0
## 1 10.1080/22221751.2020.1746199 10.1080 22221751.2020.1746199
## 2 10.1016/j.jiph.2020.07.011 10.1016 j.jiph.2020.07.011
## 3 10.1016/j.ijsu.2020.07.032 10.1016 j.ijsu.2020.07.032
## 4 10.20944/preprints202008.0312.v1 10.20944 preprints202008.0312.v1
## ... ... ... ...
## 148911 NaN NaN NaN
## 148912 NaN NaN NaN
## 148913 NaN NaN NaN
## 148914 NaN NaN NaN
## 148915 NaN NaN NaN
##
## [148872 rows x 3 columns]
covid_papers.doi.str.split('/').explode()
## 0 10.1007
## 0 s12291-020-00919-0
## 1 10.1080
## 1 22221751.2020.1746199
## 2 10.1016
## ...
## 148911 NaN
## 148912 NaN
## 148913 NaN
## 148914 NaN
## 148915 NaN
## Name: doi, Length: 279799, dtype: object
# Python
covid_papers = covid_papers.set_index('V2')
covid_papers.loc[32837036,'doi']
## '10.1007/s12291-020-00919-0'
covid_papers.iloc[2,5]
## 2020.0
# Python
df1 = pd.DataFrame({'Id': ['A','B','C','C'],
'X1': [1,3,5,7],
'X2': ['x2','x4','x6','x8']}).set_index('Id')
df2 = pd.DataFrame({'Id': ['A','B','B','D'],
'Y1': [1,3,5,7],
'Y2': ['y1','y3','y5','y7']}).set_index('Id')
# Python
df1.join(df2)
## X1 X2 Y1 Y2
## Id
## A 1 x2 1.0 y1
## B 3 x4 3.0 y3
## B 3 x4 5.0 y5
## C 5 x6 NaN NaN
## C 7 x8 NaN NaN
pd.merge(df1, df2, how= 'left', on = 'Id')
## X1 X2 Y1 Y2
## Id
## A 1 x2 1.0 y1
## B 3 x4 3.0 y3
## B 3 x4 5.0 y5
## C 5 x6 NaN NaN
## C 7 x8 NaN NaN
# Python
df2.join(df1)
## Y1 Y2 X1 X2
## Id
## A 1 y1 1.0 x2
## B 3 y3 3.0 x4
## B 5 y5 3.0 x4
## D 7 y7 NaN NaN
pd.merge(df1, df2, how= 'right', on = 'Id')
## X1 X2 Y1 Y2
## Id
## A 1.0 x2 1 y1
## B 3.0 x4 3 y3
## B 3.0 x4 5 y5
## D NaN NaN 7 y7
# Python
pd.merge(df1, df2, how= 'inner', on = 'Id')
## X1 X2 Y1 Y2
## Id
## A 1 x2 1 y1
## B 3 x4 3 y3
## B 3 x4 5 y5
# Python
pd.merge(df1, df2, how= 'outer', on = 'Id')
## X1 X2 Y1 Y2
## Id
## A 1.0 x2 1.0 y1
## B 3.0 x4 3.0 y3
## B 3.0 x4 5.0 y5
## C 5.0 x6 NaN NaN
## C 7.0 x8 NaN NaN
## D NaN NaN 7.0 y7
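# indicator = True adds a _merge column flagging where each row came from, useful for auditing joins:
pd.merge(df1, df2, how = 'outer', on = 'Id', indicator = True)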
# Python
df1[df1.index.isin(df2.index)]   # semi join: keep rows of df1 whose Id appears in df2
## X1 X2
## Id
## A 1 x2
## B 3 x4
# Python
df1[~df1.index.isin(df2.index)]   # anti join: rows of df1 with no match in df2
## X1 X2
## Id
## C 5 x6
## C 7 x8
# Python
df1 = pd.DataFrame({'Id': ['A','B','C'],
'X1': [1,3,5],
'X2': ['x2','x4','x6']}).set_index('Id')
df2 = pd.DataFrame({'Id': ['A','B','C'],
'Y1': [1,3,5],
'Y2': ['y1','y3','y5']}).set_index('Id')
pd.concat([df1,df2])
## X1 X2 Y1 Y2
## Id
## A 1.0 x2 NaN NaN
## B 3.0 x4 NaN NaN
## C 5.0 x6 NaN NaN
## A NaN NaN 1.0 y1
## B NaN NaN 3.0 y3
## C NaN NaN 5.0 y5
# Python
df3 = pd.concat([df1,df2],axis =1)
df3
## X1 X2 Y1 Y2
## Id
## A 1 x2 1 y1
## B 3 x4 3 y3
## C 5 x6 5 y5
# Python
df3.shift(periods = 1)
## X1 X2 Y1 Y2
## Id
## A NaN NaN NaN NaN
## B 1.0 x2 1.0 y1
## C 3.0 x4 3.0 y3
df3.shift(periods = 1, axis= 1)
## X1 X2 Y1 Y2
## Id
## A NaN 1 x2 1
## B NaN 3 x4 3
## C NaN 5 x6 5
# Python
covid_papers = pd.read_csv("data/covid_papers.csv")
covid_papers
## _id pmid pmcid ... cited has_data oa
## 0 5f69d6fa65a250f0edc4ee5c 32837036.0 PMC7424135 ... 0 N Y
## 1 5f69d6fa65a250f0edc4ee5d 32196410.0 PMC7170333 ... 61 N Y
## 2 5f69d6fa65a250f0edc4ee5e 32778421.0 PMC7402212 ... 0 N Y
## 3 5f69d6fa65a250f0edc4ee5f 32730205.0 PMC7382925 ... 0 N Y
## 4 5f69d6fa65a250f0edc4ee60 NaN NaN ... 0 N N
## ... ... ... ... ... ... ... ..
## 148911 5f69f29165a250f0edc7340b 31314321.0 PMC7121204 ... 0 N Y
## 148912 5f69f29165a250f0edc7340c 31314312.0 PMC7123335 ... 0 N Y
## 148913 5f69f29165a250f0edc7340d 32091757.0 PMC7124085 ... 0 N Y
## 148914 5f69f29165a250f0edc7340e 28590697.0 PMC7193726 ... 0 N Y
## 148915 5f69f29165a250f0edc7340f 28590695.0 PMC7193709 ... 0 N Y
##
## [148916 rows x 19 columns]
# Python
covid_papers.to_csv("data/covid_papers.csv")
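# Pass index = False if the row index should not be written out as an extra column:
covid_papers.to_csv("data/covid_papers.csv", index = False)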
Using first dplyr and the tidyverse, use table_1.csv, table_2.csv and loc_cities.csv; we will need information from all three datasets. A starter sketch for the first steps follows the list.
Import the datasets.
Remove articles with no pmid and no DOI, and all articles published before 1975.
Merge the two datasets; pmid is unique for each paper.
Create a new variable with the number of authors for each paper.
Plot the distribution of log(number of authors + 1).
How many papers contain 'deep learning' or 'machine learning' and 'neural network' (also matching the plural 'neural networks') in their title? Create a binary variable to save this information. What is the mean number of authors for ML papers and for non-ML papers?
Transform has_data and oa into binary variables as well. What is the share of ML papers that are oa?
Clean up pubtype; for simplicity, just keep the first type.
Which pub type has the highest mean/sd of citations? (use cited and the cleaned pub_type)
Which are the most represented countries by year? You may want to separate the rows for each author to get all the countries involved in a paper; if an author has multiple affiliations, take the first one. Store the result in another tibble, keep only pmid and authors, and get the country for each author from loc_cities.csv.
Select the top 25 countries involved in coronavirus research since 2001, and plot the evolution on a bar chart with plot_ly.
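A minimal pandas sketch of the first three steps; the file locations and the column names pmid, doi and year_pub are assumptions read off the exercise text, not guarantees about the files.
# Python
table_1 = pd.read_csv("data/table_1.csv")
table_2 = pd.read_csv("data/table_2.csv")
# drop articles where both pmid and doi are missing, then keep papers from 1975 onwards
papers = table_1.dropna(subset = ['pmid','doi'], how = 'all')
papers = papers[papers.year_pub >= 1975]
# pmid is unique for each paper, so it is a safe merge key
papers = pd.merge(papers, table_2, how = 'left', on = 'pmid')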