There is no shortcut to knowledge, and there are no worthwhile data without preprocessing. In the first three sections of this chapter, we discuss situations that necessitate data preprocessing and how to handle them. In the final section we discuss how to manipulate data in general; specifically, how to manipulate data in R using the reshape2 and plyr packages, and in Python using the pandas module.
Note: The examples below are abridged; the book contains more details.
The code below analyzes the dataframe movies from the ggplot2movies package, which contains some missing values:
# Missing-value exploration on the movies dataframe.
library(ggplot2movies)  # provides the movies dataframe
library(ggplot2)        # provides qplot / stat_smooth (not loaded above)
names(movies)
movies[9000:9020, 1:6] # display 20 rows, 6 first columns
mean(movies$length)               # no NAs here: returns a number
mean(movies$budget)               # budget contains NAs: returns NA
mean(movies$budget, na.rm = TRUE) # ignore NAs when averaging
mean(is.na(movies$budget))        # proportion of missing budget values
# Drop every row containing at least one NA (complete-case analysis)
moviesNoNA <- na.omit(movies)
qplot(rating, budget, data = moviesNoNA, size = I(1.2)) +
  stat_smooth(color = "red", size = I(2), se = FALSE)
qplot(rating, votes, data = moviesNoNA, size = I(1.2))
The code below defines a small dataframe with missing values and demonstrates the isnull, dropna, and fillna methods:
import numpy as np
import pandas as pd

# A 3x3 toy dataframe with two missing entries (rows 0 and 1).
data = pd.DataFrame(
    [[1, 2, np.nan],
     [3, np.nan, 4],
     [1, 2, 3]]
)

# Example 1 - the isnull method: boolean mask marking missing entries
print("data:\n" + str(data))
print("pd.isnull(data):")
print(data.isnull())

# Example 2 - the dropna method
# Dropping rows: any row containing a NaN is removed
print("data.dropna():")
print(data.dropna())
# Dropping columns: any column containing a NaN is removed
print("data.dropna(axis = 1):")
print(data.dropna(axis=1))

# Example 3 - the fillna method: replace each NaN with its column mean
print("data.fillna(data.mean()):")
print(data.fillna(data.mean()))
# Example 1 - the Black Monday stock crash on October 19, 1987
library(Ecdat)    # provides the SP500 daily log-return data
library(ggplot2)  # provides qplot (not loaded by Ecdat)
data(SP500, package = 'Ecdat')
# Histogram of the daily log returns; the crash shows as an extreme
# left-tail outlier
qplot(r500,
      main = "Histogram of log(P(t)/P(t-1)) for SP500 (1981-91)",
      xlab = "log returns",
      data = SP500)
# Log returns in trading-day order; seq_along() is the modern, safe
# spelling of the legacy seq(along = ...) idiom
qplot(seq_along(r500),
      r500,
      data = SP500,
      geom = "line",
      xlab = "trading days since January 1981",
      ylab = "log returns",
      main = "log(P(t)/P(t-1)) for SP500 (1981-91)")
# Example 2 - remove outliers: trim the extremes, then keep only values
# within mean +/- 5 sd computed from the trimmed (robust) sample
set.seed(42)  # make the random sample reproducible
original_data <- rnorm(20)
original_data[1] <- 1000  # plant an extreme outlier
sorted_data <- sort(original_data)
# Drop the two smallest and two largest values; deriving the indices
# from the length (rather than hard-coding 3:18) survives a size change
n <- length(sorted_data)
filtered_data <- sorted_data[3:(n - 2)]
lower_limit <- mean(filtered_data) - 5 * sd(filtered_data)
upper_limit <- mean(filtered_data) + 5 * sd(filtered_data)
not_outlier_ind <- (lower_limit < original_data) &
  (original_data < upper_limit)
print(not_outlier_ind)
data_w_no_outliers <- original_data[not_outlier_ind]
# Example 3 - winsorize data containing an outlier: extreme values are
# clamped to data-driven limits rather than removed
library(robustHD)
set.seed(42)  # make the random sample reproducible
original_data <- c(1000, rnorm(10))
print(original_data)
print(winsorize(original_data))
# Example 1 - the diamonds dataset (shipped with ggplot2)
library(ggplot2)  # provides diamonds and qplot
head(diamonds)
summary(diamonds)
# Example 2 - the price histogram of a random subset of 1000 diamonds
set.seed(42)  # reproducible subset
diamondsSubset <- diamonds[sample(nrow(diamonds), 1000), ]
qplot(price, data = diamondsSubset)
# Example 3 - a transformed histogram: log compresses the right tail
qplot(log(price), size = I(1), data = diamondsSubset)
# Example 4 - transforming one variable, then the other, then both
qplot(carat,
      price,
      size = I(1),
      data = diamondsSubset)
qplot(carat,
      log(price),
      size = I(1),
      data = diamondsSubset)
qplot(log(carat),
      price,
      size = I(1),
      data = diamondsSubset)
qplot(log(carat),
      log(price),
      size = I(1),
      data = diamondsSubset)
# Example 5 - display the original un-transformed variables on
# logarithmic axes (same shape as the log-log plot, original units)
qplot(carat,
      price,
      log = "xy",
      size = I(1),
      data = diamondsSubset)
# Example 1 - the Animals dataset: brain and body weights of animals
library(MASS)     # provides Animals
library(ggplot2)  # provides qplot (not loaded by MASS)
Animals
qplot(brain, body, data = Animals)
# Example 2 - a power transformation: log scales on both axes reveal
# the approximately linear log-log relationship
qplot(brain, body, log = "xy", data = Animals)
# Example 1 - binning: replace each value by the center of its bin
import numpy as np
import pandas as pd

data = [23, 13, 5, 3, 41, 33]
bin_boundaries = [0, 10, 20, 30, 40, 50]  # bin edges (left-open intervals)
bin_values = [5, 15, 25, 35, 45]          # center of each bin
cut_data = pd.cut(data, bin_boundaries)
print("labels:", cut_data.codes)
# Vectorized lookup: the integer codes index directly into the bin
# centers, replacing the original element-by-element loop.
# dtype=float keeps the result identical to the np.zeros-based version.
binned_data = np.asarray(bin_values, dtype=float)[cut_data.codes]
print("binned_data:", binned_data)

# Example 2 - quantile binning: bins hold equal counts, not equal widths
original_data = [23, 13, 5, 3, 41, 33]
cut_data = pd.qcut(original_data, 2)  # note: rebinds cut_data
print("C.labels:", cut_data.codes)

# Indicator (one-hot) encoding of the two quantile bins
indicator_values = pd.get_dummies(pd.qcut(data, 2))
print("indicator_values:", indicator_values)
# Example 1 - sample 3 of the values 1..4 without replacement
set.seed(42)  # reproducible sampling throughout
sampled_row_indices <- sample(1:4, 3, replace = FALSE)
print(sampled_row_indices)
# Example 2 - use the sampled indices to select rows of a 4x5 array
d <- array(data = 1:20, dim = c(4, 5))
d_sampled <- d[sampled_row_indices, ]
print(d_sampled)
# Example 3 - partition the rows into two disjoint sets of sizes
# 75% and 25%; sizes are derived from nrow(d) instead of hard-coded
d <- array(data = 1:20, dim = c(4, 5))
print(d)
n <- nrow(d)
rand_perm <- sample(n, n)  # a random permutation of the row indices
train_size <- floor(n * 0.75)
first_set_of_indices <- rand_perm[1:train_size]
second_set_of_indices <- rand_perm[(train_size + 1):n]
print(d[first_set_of_indices, ])
print(d[second_set_of_indices, ])
# Example 4 - shuffling: reorder all rows with a random permutation
d <- array(data = 1:20, dim = c(4, 5))
print(d)
d_shuffled <- d[sample(n, n), ]
print(d_shuffled)
import numpy as np
import pandas as pd

# Example 1 - concatenate two dataframes with identical columns
data1 = {"ID": ["2134", "4524"],
         "name": ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID": ["9423", "3483"],
         "name": ["Bob Jones", "Mary Smith"]}
df2 = pd.DataFrame(data2)
df3 = pd.concat([df1, df2])
print(f"concatenation of df1 and df2:\n{df3}")

# Example 2 - concatenating dataframes with non identical columns;
# entries missing from either frame become NaN
data1 = {"ID": ["2134", "4524"],
         "name": ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID": ["9423", "3483"],
         "nick name": ["Bobby", "Abby"]}
df2 = pd.DataFrame(data2)
df3 = pd.concat([df1, df2])
print(f"concatenation of df1 and df2:\n{df3}")

# Example 3 - inner join: keep only keys present in both frames
data1 = {"ID": ["2134", "4524"],
         "name": ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID": ["6325", "2134"],
         "age": [25, 35],
         "tenure": [3, 8]}
df2 = pd.DataFrame(data2)
df3 = df1.merge(df2, on="ID", how="inner")
print(f"inner join of df1 and df2:\n{df3}")

# Example 3 - outer join (union of keys) and left join (keys of df1)
data1 = {"ID": ["2134", "4524"],
         "name": ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID": ["6325", "2134"],
         "age": [25, 35],
         "tenure": [3, 8]}
df2 = pd.DataFrame(data2)
df3 = df1.merge(df2, on="ID", how="outer")
print(f"outer join of df1 and df2:\n{df3}")
df4 = df1.merge(df2, on="ID", how="left")
print(f"left join of df1 and df2:\n{df4}")

# Example 4 - outer join with multiple records sharing the same key;
# every matching pair of rows appears in the result
data1 = {"ID": ["2134", "4524", "2134"],
         "name": ["John Smith", "Jane Doe", "JOHN SMITH"]}
df1 = pd.DataFrame(data1)
data2 = {"ID": ["6325", "2134"],
         "age": [25, 35],
         "tenure": [3, 8]}
df2 = pd.DataFrame(data2)
df3 = df1.merge(df2, on="ID", how="outer")
print(f"outer join of df1 and df2:\n{df3}")

# Example 5 - outer join with attribute name suffix: the shared
# non-key column f1 is disambiguated as f1_x / f1_y
data1 = {"ID": ["2134", "4524"],
         "f1": ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID": ["6325", "2134"],
         "f1": [25, 35],
         "f2": [3, 8]}
df2 = pd.DataFrame(data2)
df3 = df1.merge(df2, on="ID", how="outer")
print(f"outer join of df1 and df2:\n{df3}")
# Example 1 - the melt function: wide -> long reshaping
library(reshape2)  # provides smiths, tips, melt, dcast
library(ggplot2)   # provides qplot (not loaded by reshape2)
# toy (wide) dataframe in the reshape2 package
smiths
# columns 2, 3, 4, 5 are measurements, 1 is key
melt(smiths, id = 1)
# columns 3, 4, 5 are measurements, 1,2 are key
melt(smiths, id = c(1, 2))
# Example 2 - the tips dataset
# Plot from a copy: adding total.bill to tips itself (as the original
# code did) would leave a duplicate of total_bill in the dataframe,
# which would then leak into the melt below as a spurious third
# measurement variable and distort every dcast aggregation.
tipsPlot <- tips
tipsPlot$total.bill <- tipsPlot$total_bill
qplot(total.bill,
      tip,
      facets = sex~time,
      size = I(1.5),
      data = tipsPlot)
# Example 3 - analyzing tips and bills
head(tips) # first six rows
# Everything except the id columns (i.e. total_bill and tip) becomes
# a (variable, value) pair in the long frame
tipsm <- melt(tips,
              id = c("sex", "smoker", "day", "time", "size"))
head(tipsm) # first six rows
tail(tipsm) # last six rows
# Mean of measurement variables broken by sex.
# Note the role of mean as the aggregating function.
dcast(tipsm,
      sex ~ variable,
      fun.aggregate = mean)
# Number of occurrences for measurement variables broken by sex.
# Note the role of length as the aggregating function.
dcast(tipsm,
      sex ~ variable,
      fun.aggregate = length)
# Average total bill and tip for different times
dcast(tipsm,
      time ~ variable,
      fun.aggregate = mean)
# Similar to above with breakdown for sex and time:
dcast(tipsm,
      sex + time ~ variable,
      fun.aggregate = length)
# Similar to above, but with mean and added margins
dcast(tipsm,
      sex + time ~ variable,
      fun.aggregate = mean,
      margins = TRUE)
# Example 1 - the ddply function: split-apply-combine on dataframes
library(plyr)     # provides baseball, ozone, ddply, aaply
library(ggplot2)  # provides qplot / geom_smooth (not loaded by plyr)
head(baseball)
# Count number of players recorded for each year
bbPerYear <- ddply(baseball, "year", "nrow")
head(bbPerYear)
qplot(x = year,
      y = nrow,
      data = bbPerYear,
      geom = "line",
      ylab = "number of player seasons")
# Example 2 - another example of the ddply function
# Compute mean rbi (batting attempt resulting in runs)
# for all years. summarise is the apply function, which
# takes as argument a function that computes the rbi mean
bbMod <- ddply(baseball,
               "year",
               summarise,
               mean.rbi = mean(rbi, na.rm = TRUE))
head(bbMod)
qplot(x = year,
      y = mean.rbi,
      data = bbMod,
      geom = "line",
      ylab = "mean RBI")
# Example 3 - adding a new variable
# Add a column career.year which measures the number of years
# passed since each player started batting
bbMod2 <- ddply(baseball,
                "id",
                transform,
                career.year = year - min(year) + 1)
# Sample a random subset of 3000 rows to avoid over-plotting
set.seed(42)  # reproducible subset
bbSubset <- bbMod2[sample(nrow(bbMod2), 3000), ]
qplot(career.year,
      rbi, data = bbSubset,
      size = I(0.8),
      geom = "jitter",
      ylab = "RBI",
      xlab = "years of playing") +
  geom_smooth(color = "red", se = FALSE, size = 1.5)
# Example 4 - the aaply function: apply a function over array margins
dim(ozone)
latitude.mean <- aaply(ozone, 1, mean)   # average over margin 1
longitude.mean <- aaply(ozone, 2, mean)  # average over margin 2
time.mean <- aaply(ozone, 3, mean)       # average over margin 3
longitude <- seq_along(longitude.mean)
qplot(x = longitude,
      y = longitude.mean,
      ylab = "mean ozone level")
latitude <- seq_along(latitude.mean)
qplot(x = latitude,
      y = latitude.mean,
      ylab = "mean ozone level",
      geom = "line")
months <- seq_along(time.mean)
qplot(x = months,
      y = time.mean,
      geom = "line",
      ylab = "mean ozone level",
      xlab = "months since January 1985")