title | output | ||||
---|---|---|---|---|---|
Reproducible Research: Peer Assessment 1 |
|
# Request the US English UTF-8 locale for all categories so that
# locale-dependent text (for example the day names returned by weekdays())
# is produced in English.
Sys.setlocale(category = "LC_ALL", locale = "en_US.utf8")
## [1] "LC_CTYPE=en_US.utf8;LC_NUMERIC=C;LC_TIME=en_US.utf8;LC_COLLATE=en_US.utf8;LC_MONETARY=en_US.utf8;LC_MESSAGES=zh_CN.UTF-8;LC_PAPER=zh_CN.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=zh_CN.UTF-8;LC_IDENTIFICATION=C"
library(dplyr)
library(lattice)
# Extract the raw data from the archive and load it.
unzip("activity.zip")
activity <- read.csv("activity.csv")
# as_tibble() replaces tbl_df(), which is deprecated in current dplyr.
activity_df <- as_tibble(activity)

# Total steps per day. sum() is called without na.rm, so a day that contains
# any missing observation gets a total of NA.
activity_perday <- summarise(group_by(activity_df, date), steps = sum(steps))

# One bar per day, labelled with the date.
barplot(
  activity_perday$steps,
  names.arg = activity_perday$date,
  ylab = "Number of Steps",
  main = "Total Number of Steps Taken per Day"
)
The mean of the above distribution is $1.0766 \times 10^{4}$, while the median is 10765.
# Mean number of steps for each interval, ignoring missing observations.
activity_interval <- activity_df %>%
  group_by(interval) %>%
  summarise(steps = mean(steps, na.rm = TRUE))

# Line chart of the per-interval means.
plot(
  x = activity_interval$interval,
  y = activity_interval$steps,
  type = "l",
  main = "Average Daily Steps by Time(5 Mins per interval)",
  xlab = "Time Intervals",
  ylab = "Number of Steps"
)
Averaged across all days, the maximum average number of steps is 206.1698, which occurs in interval 835.
The total number of rows with NAs is 2304. Each missing value is filled with the average number of steps for its interval.
# Fill every missing step count with the mean for the same interval, taken
# from activity_interval.
#
# This is done in one vectorised lookup instead of a row-by-row loop: match()
# finds, for each missing row, the position of its interval in the table of
# interval means. Assigning through the extracted column lets R promote the
# integer step counts to double; writing a fractional mean into an integer
# column one tibble row at a time may be rejected by recent tibble versions
# (NOTE(review): confirm against the installed tibble version).
activity_filled <- activity_df
missing_rows <- is.na(activity_filled$steps)
mean_position <- match(
  activity_filled$interval[missing_rows],
  activity_interval$interval
)
activity_filled$steps[missing_rows] <- activity_interval$steps[mean_position]
rm(missing_rows, mean_position)
Below is the histogram of the newly filled dataset.
# Total steps per day, computed from the filled data set.
activity_filled_perday <- activity_filled %>%
  group_by(date) %>%
  summarise(steps = sum(steps))

# One bar per day, labelled with the date.
barplot(
  height = activity_filled_perday$steps,
  names.arg = activity_filled_perday$date,
  main = "Total Number of Steps Taken per Day(Filled)",
  ylab = "Number of Steps"
)
The mean of the above distribution is $1.0766 \times 10^{4}$, while the median is $1.0766 \times 10^{4}$. There should be no big change from the original dataset.
# Label every observation as "weekend" (Saturday or Sunday) or "weekday".
#
# The day of the week is read from POSIXlt's numeric wday field
# (0 = Sunday ... 6 = Saturday), so the result does not depend on the locale's
# day-name abbreviations, and the whole column is computed in one vectorised
# step. Dates that fail to parse yield NA, which %in% treats as not a
# weekend, so those rows are labelled "weekday".
activity_week <- cbind(activity_filled, week = NA)
day_of_week <- as.POSIXlt(
  as.Date(as.character(activity_week$date), format = "%Y-%m-%d")
)$wday
activity_week$week <- ifelse(day_of_week %in% c(0L, 6L), "weekend", "weekday")
rm(day_of_week)
# Mean steps for each interval, computed separately for each value of week.
activity_week_interval <- activity_week %>%
  group_by(interval, week) %>%
  summarise(steps = mean(steps))

# One lattice panel per value of week, stacked in a single column.
xyplot(
  steps ~ interval | week,
  data = activity_week_interval,
  type = "l",
  layout = c(1, 2),
  main = "Average Daily Steps by Time(5 Mins per interval)",
  xlab = "Time Intervals",
  ylab = "Number of Steps"
)
The chart above suggests there are indeed some differences between weekdays and weekends.