| title | Reproducible Research: Peer Assessment 1 | ||||
|---|---|---|---|---|---|
| output |
|
The number of steps taken in each 5-minute interval over a two-month period is provided in the file activity.csv.
library(dplyr)
library(lubridate)
# Read the raw activity log; keep `date` as plain character strings so it can
# be converted explicitly later on.
stepsDf <- read.csv(file = "activity.csv", stringsAsFactors = FALSE)

# Peek at the first six rows (the leading rows of `steps` are NA).
head(stepsDf, n = 6L)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
# Wrap the raw data frame as a tibble. tbl_df() is deprecated in current
# dplyr; as_tibble() is the supported equivalent and is re-exported by dplyr.
steps <- as_tibble(stepsDf)

# Total number of steps per day. With na.rm = TRUE a day whose readings are
# all NA sums to 0 rather than NA, so such days still count towards the
# histogram, mean and median below.
stepsByDay <- steps %>%
  group_by(date) %>%
  summarise(stepTotal = sum(steps, na.rm = TRUE))

hist(
  stepsByDay$stepTotal,
  breaks = 10,
  main = "Histogram of Total Number of Steps Taken Daily",
  xlab = "Total Steps"
)

# Mean and median of the daily totals.
mean(stepsByDay$stepTotal, na.rm = TRUE)
## [1] 9354.23
median(stepsByDay$stepTotal, na.rm = TRUE)
## [1] 10395
# Average number of steps for each 5-minute interval, across all days.
# `interval` encodes the clock time as HHMM without leading zeros (0, 5, ...,
# 835, ...), so it is zero-padded to four digits, rebuilt as "HH:MM:00" and
# parsed by lubridate::hms() into a time of day for plotting.
stepsByInterval <- steps %>%
  group_by(interval) %>%
  summarise(stepsMean = mean(steps, na.rm = TRUE)) %>%
  mutate(
    t = sprintf("%04d", interval),
    interval_t = paste(substr(t, 1, 2), substr(t, 3, 4), "00", sep = ":"),
    tod = hms(interval_t)
  )

# Time-series plot of the average day. The default x axis is suppressed and
# replaced with hourly tick marks (positions are seconds since midnight).
with(
  stepsByInterval,
  plot(
    x = tod,
    y = stepsMean,
    xaxt = "n",
    type = "l",
    main = "Average Daily Activity",
    ylab = "Average Steps",
    xlab = "Time of Day"
  )
)
axis(side = 1, at = seq(0, 86400, 3600), labels = paste0(seq(0, 24, 1), ":00"))

# Interval with the highest average step count. which.max() returns the first
# maximum, so this stays a single value even if two intervals were to tie.
stepsByInterval$interval[which.max(stepsByInterval$stepsMean)]
## [1] 835
# Number of rows whose step count is missing (`steps` is the first column).
sum(is.na(stepsDf$steps))
## [1] 2304
Each missing value in the original dataset is replaced with the mean number of steps for the same 5-minute interval, computed across the days on which that interval was not missing.
# Attach each interval's mean to every observation, then fill the gaps:
# where `steps` is NA the interval mean is used, otherwise the observed value.
mergedSteps <- merge(steps, stepsByInterval, by = "interval")
fullSteps <- mutate(
  mergedSteps,
  newsteps = ifelse(is.na(steps), stepsMean, steps)
)

# Reduce to the original three columns (with the imputed values taking the
# place of `steps`) and restore chronological order, which merge() does not keep.
imputedCols <- select(fullSteps, steps = newsteps, date, interval)
newSteps <- arrange(imputedCols, date, interval)

head(newSteps)
## steps date interval
## 1 1.7169811 2012-10-01 0
## 2 0.3396226 2012-10-01 5
## 3 0.1320755 2012-10-01 10
## 4 0.1509434 2012-10-01 15
## 5 0.0754717 2012-10-01 20
## 6 2.0943396 2012-10-01 25
# Recompute the daily totals from the imputed data. This overwrites the
# earlier `stepsByDay`, which was built from the raw data.
stepsByDay <- newSteps %>%
  group_by(date) %>%
  summarise(stepTotal = sum(steps, na.rm = TRUE))

hist(
  stepsByDay$stepTotal,
  breaks = 10,
  main = "Histogram of Total Number of Steps Taken Daily (Missing Values Imputed)",
  xlab = "Total Steps"
)

# Mean and median of the daily totals after imputation.
mean(stepsByDay$stepTotal, na.rm = TRUE)
## [1] 10766.19
median(stepsByDay$stepTotal, na.rm = TRUE)
## [1] 10766.19
While it may appear unusual that the mean and median are identical, there are a number of days on which the steps value was missing for every interval. With this imputation strategy, each of those days receives the sum of the interval means as its total, which is exactly the mean daily total. How many days does this affect?
# Count the days whose total equals the overall mean daily total. Both sides
# are doubles built from sums of averages, so they are compared with near()
# (tolerance-based) rather than ==, which can miss matches through rounding.
sum(near(stepsByDay$stepTotal, mean(stepsByDay$stepTotal, na.rm = TRUE)))
## [1] 8
# Average steps per interval, split into weekdays ("week") and weekends.
# The day type is derived from format(..., "%u"), the weekday number with
# Monday = 1 and Sunday = 7. Unlike the day names returned by weekdays(),
# this does not depend on the session locale.
stepdays <- newSteps %>%
  mutate(
    dow = factor(
      ifelse(format(as.Date(date), "%u") %in% c("6", "7"), "weekend", "week"),
      levels = c("week", "weekend")
    )
  ) %>%
  group_by(interval, dow) %>%
  summarise(stepsMean = mean(steps, na.rm = TRUE))

# Convert to a plain data frame (dropping the grouping) and add a time-of-day
# column: `interval` is HHMM without leading zeros, so it is zero-padded,
# rebuilt as "HH:MM:00" and parsed with lubridate::hms().
stepsAsDf <- as.data.frame(stepdays) %>%
  mutate(
    t = sprintf("%04d", interval),
    interval_t = paste(substr(t, 1, 2), substr(t, 3, 4), "00", sep = ":"),
    tod = hms(interval_t)
  )
# Two stacked panels (weekdays above, weekends below) that share one x axis.
# Inner margins are zero so the panels touch; titles and axis labels go in the
# outer margin. The previous graphics settings are saved so they can be
# restored once the figure is complete.
old_par <- par(mfrow = c(2, 1), mar = c(0, 0, 0, 0), oma = c(4, 4, 4, 0.5))

# Top panel: weekdays. No x axis is drawn here; the hourly axis appears only
# under the bottom panel.
with(
  stepsAsDf[which(stepsAsDf$dow == "week"), ],
  plot(
    x = tod,
    y = stepsMean,
    type = "l",
    xaxt = "n",
    ylab = "Average Steps",
    xlab = "Time of Day"
  )
)
mtext("week", side = 3, line = -1, adj = 0.1)

# Bottom panel: weekends, with hourly ticks (positions are seconds since
# midnight).
with(
  stepsAsDf[which(stepsAsDf$dow == "weekend"), ],
  plot(
    x = tod,
    y = stepsMean,
    type = "l",
    xaxt = "n",
    ylab = "Average Steps",
    xlab = "Time of Day"
  )
)
mtext("weekend", side = 3, line = -1, adj = 0.1)
axis(side = 1, at = seq(0, 86400, 3600), labels = paste0(seq(0, 24, 1), ":00"))

# Shared labels and overall title, written into the outer margin.
mtext("Time of Day", side = 1, outer = TRUE, line = 2.2)
mtext("Steps Mean", side = 2, outer = TRUE, line = 2.2)
mtext("Average Daily Activity", side = 3, outer = TRUE, line = 2.2)

# Restore the layout and margins so later plots are not affected.
par(old_par)


