Statistics cheatsheet

In the Python code import pandas as pd has been run

Basics

Operation STATA Pandas Base R
Create new dataset from values
input a b
1 4
2 5
3 6
end
d = {'a' : [1,2,3], 'b' : [4,5,6]}

df = pd.DataFrame(d)
df <- data.frame(a=1:3, b=4:6)
Create new dataset from csv file
import delim mydata.csv, delimiters(",")
df = pd.read_csv('mydata.csv', sep=',')
df <- read.csv('my_data.csv', sep=',')
Print observations
list
df
df
Print observations of variable x
list x
df['x']
df$x
Select only variable x
keep x
df = df['x']
df <- df$x
Select only variables x and y
keep x y
df = df[['x', 'y']]
df <- df[c(‘x’, ‘y’)]
Drop variable x
drop x
df = df.drop('x', axis=1)
df$x <- NULL
Generate new variable
gen z = x + y
df['z'] = df['x'] + df['y']
df$z <- df$x + df$y
Rename variable
rename x y
df.rename(columns = {'x' : 'y'})
names(df)[names(df) == ‘x’] <- ‘y’
Sort by variable
sort x
df.sort_values('x')
df[order(df$x), ]

Filtering data

Operation STATA Pandas Base R
Conditionally print observations
list if x > 1
df[df['x'] > 1]
subset(df, x == 1)
Conditionally print observations with ‘or’ operator
list if x > 1 | y < 0
df[(df['x'] > 1) | (df['y'] < 0)]
subset(df, x == 1 | y < 0)
Conditionally print observations with ‘and’ operator
list if x < 1 & y > 5
df[(df['x'] > 1) & (df['y'] < 0)]
subset(df, x == 1 & y < 0)
Print subset of observations based on location
list in 1/3
df[0:3]
df[1:3, ]
Print observations with missing values in x
list if missing(x)
df[df['x'].isnull()]
subset(df, is.na(x))

Summarizing data

Description STATA Pandas Base R
Print summary statistics
summarize
df.describe()
summary(df)
Print information about variables and data types
describe
df.info()
str(df)
Print aggregation of variable
mean x
df['x'].mean()
mean(df$x)
Group data by variable and summarize
bysort x: summarize
df.groupby('x').describe()
aggregate(. ~ x, df, summary)
Print frequency table
tab x
df['x'].value_counts()
table(df$x)
Print cross-tabulation
tab x y
pd.crosstab(df['x'], df['y'])
table(df$x, df$y)
Create bins based on values in x in new column ‘bins’
egen bins = cut x, group(3)
df['bins'] = pd.cut(df['x'], 3)
df$bins <- cut(df$x, 3)

Reshaping data

Operation STATA Pandas Base R
Reshape data from wide to long panel
reshape long x, i(i) j(j)
pd.wide_to_long(df, ['x'], i='i', j='j')
reshape(df, direction=’long’, varying=grep(‘x’, names(df), value=TRUE), sep='')
Reshape data from long to wide panel
reshape wide
df.unstack()

# returns hierarchical columns
reshape(df, direction='wide', varying=x, value=TRUE), sep='')

Merging data

Operation STATA Pandas Base R
Vertically concatenate datasets
append using y
pd.concat([x, y])
rbind(x, y)

# note that columns must be the same for each dataset
Merge datasets on key
merge 1:1 key using y
pd.merge(x, y, on='key', how='inner')
merge(x, y, by='key')

Plotting

Operation STATA Pandas Base R
Scatter plot
plot x y
df.plot.scatter('x', 'y')
plot(df$x, df$y)
Line plot
line x y
df.plot('x', 'y')
lines(df$x, df$y)
Histogram
hist x
df.hist('x')
hist(df$x)
Boxplot
graph box x
df.boxplot('x')
boxplot(df$x)