Statistics cheatsheet

In the Python code import pandas as pd has been run

Basics

Operation

STATA

Pandas

Base R

Create new dataset from values

input a b
1 4
2 5
3 6
end
d = {'a' : [1,2,3], 'b' : [4,5,6]}

df = pd.DataFrame(d)
df <- data.frame(a=1:3, b=4:6)

Create new dataset from csv file

import delim mydata.csv, delimiters(",")
df = pd.read_csv('mydata.csv', sep=',')
df <- read.csv('my_data.csv', sep=',')

Print observations

list
df
df

Print observations of variable x

list x
df['x']
df$x

Select only variable x

keep x
df = df['x']
df <- df$x

Select only variables x and y

keep x y
df = df[['x', 'y']]
df <- df[c(x,y)]

Drop variable x

drop x
df = df.drop('x', axis=1)
df$x <- NULL

Generate new variable

gen z = x + y
df['z'] = df['x'] + df['y']
df$z <- df$x + df$y

Rename variable

rename x y
df.rename(columns = {'x' : 'y'})
names(df)[names(df) ==x] <-y

Sort by variable

sort x
df.sort_values('x')
df[order(df$x), ]

Filtering data

Operation

STATA

Pandas

Base R

Conditionally print observations

list if x > 1
df[df['x'] > 1]
subset(df, x == 1)

Conditionally print observations with 'or' operator

list if x > 1 | y < 0
df[(df['x'] > 1) | (df['y'] < 0)]
subset(df, x == 1 | y < 0)

Conditionally print observations with 'and' operator

list if x < 1 & y > 5
df[(df['x'] > 1) & (df['y'] < 0)]
subset(df, x == 1 & y < 0)

Print subset of observations based on location

list in 1/3
df[0:3]
df[1:3, ]

Print observations with missing values in x

list if missing(x)
df[df['x'].isnull()]
subset(df, is.na(x))

Summarizing data

Description

STATA

Pandas

Base R

Print summary statistics

summarize
df.describe()
summary(df)

Print information about variables and data types

describe
df.info()
str(df)

Print aggregation of variable

mean x
df['x'].mean()
mean(df$x)

Group data by variable and summarize

bysort x: summarize
df.groupby('x').describe()
aggregate(. ~ x, df, summary)

Print frequency table

tab x
df['x'].value_counts()
table(df$x)

Print cross-tabulation

tab x y
pd.crosstab(df['x'], df['y'])
table(df$x, df$y)

Create bins based on values in x in new column 'bins'

egen bins = cut x, group(3)
df['bins'] = pd.cut(df['x'], 3)
df$bins <- cut(df$x, 3)

Reshaping data

Operation

STATA

Pandas

Base R

Reshape data from wide to long panel

reshape long x, i(i) j(j)
pd.wide_to_long(df, ['x'], i='i', j='j')
reshape(df, direction='long', varying=grep('j', names(df), value=TRUE), sep='')

Reshape data from long to wide panel

reshape wide
df.unstack()

# returns hierarchical columns
reshape(df, timevar='x', idvar='i', direction='wide')

Merging data

Operation

STATA

Pandas

Base R

Vertically concatenate datasets

append using y
pd.concat([x, y])
rbind(x, y)

# note that columns must be the same for each dataset

Merge datasets on key

merge 1:1 key using y
pd.merge(x, y, on='key', how='inner')
merge(x, y, by='key')

Plotting

Operation

STATA

Pandas

Base R

Scatter plot

plot x y
df.plot.scatter('x', 'y')
plot(df$x, df$y)

Line plot

line x y
df.plot('x', 'y')
lines(df$x, df$y)

Histogram

hist x
df.hist('x')
hist(df$x)

Boxplot

graph box x
df.boxplot('x')
boxplot(df$x)