# getdata_project/run_analysis.R at master · linuxfranz/getdata_project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
library(dplyr)

# Download and unzip the raw data if necessary.
# Both steps are skipped when their output already exists in the working
# directory, so re-running the script does not hit the network again.
url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
file <- "Dataset.zip"
dir <- "UCI HAR Dataset"
if (!file.exists(file)) {
    # No explicit method: forcing "curl" requires an external curl binary on
    # the PATH, which is not available on every platform. mode = "wb" keeps
    # the zip archive byte-exact on systems that distinguish text/binary.
    download.file(url, destfile = file, mode = "wb")
}
if (!file.exists(dir)) {
    unzip(file)
}

# Column-class mask for the 561 feature columns of the X_* files.
# "numeric" marks a column that is read, "NULL" makes the reader skip it.
# Every feature group keeps only its leading columns (presumably the
# mean/std measurements, judging by the label clean-up further down).
keep_leading <- function(n_keep, n_total) {
    rep(c("numeric", "NULL"), times = c(n_keep, n_total - n_keep))
}
cols_3d <- keep_leading(6, 40)   # 40-column group, first 6 kept
cols_3df <- keep_leading(6, 79)  # 79-column group, first 6 kept
cols_2d <- keep_leading(2, 13)   # 13-column group, first 2 kept
cols <- c(
    rep(cols_3d, times = 5),     # columns   1-200
    rep(cols_2d, times = 5),     # columns 201-265
    rep(cols_3df, times = 3),    # columns 266-502
    rep(cols_2d, times = 4),     # columns 503-554
    rep("NULL", times = 7)       # columns 555-561, all skipped
)
# Logical version of the same mask, used to pick the matching feature labels.
cols_labels <- cols == "numeric"

# Read the subject ids, the feature matrix (X) and the activity codes (y).
# Each exists once under test/ and once under train/; the two parts are
# stacked with the test rows first, then the train rows.

# Read "<dir>/<part>/<stem>_<part>.txt" as a whitespace-separated table
# without a header line; extra arguments are passed on to read.csv().
read_part <- function(part, stem, ...) {
    path <- file.path(dir, part, paste0(stem, "_", part, ".txt"))
    read.csv(path, sep = "", header = FALSE, ...)
}

# Stack the test and train parts of one table (test first).
read_both <- function(stem, ...) {
    rbind(read_part("test", stem, ...), read_part("train", stem, ...))
}

subjects <- read_both("subject")
names(subjects) <- "subject"

# colClasses = cols drops every feature column marked "NULL" while reading.
data_x <- read_both("X", colClasses = cols)
data_y <- read_both("y")

# Read the feature labels, keep those of the selected columns, and rewrite
# them into compact names before using them as column names of data_x.
# Only the second column of features.txt (the label text) is read.
feats <- read.csv(file.path(dir, "features.txt"),
                  sep = "", header = FALSE,
                  colClasses = c("NULL", "character"))

# Ordered rewrite rules: name = regular expression, value = replacement.
# They are applied top to bottom, e.g. a label of the form
# "tBodyAcc-mean()-X" ends up as "TimeBodyAccMeanX".
label_rules <- c(
    "\\(\\)"   = "",
    "BodyBody" = "Body",
    "^t"       = "Time",
    "^f"       = "Frequency",
    "-mean"    = "Mean",
    "-std"     = "Std",
    "-X"       = "X",
    "-Y"       = "Y",
    "-Z"       = "Z"
)

features_fixed <- feats[cols_labels, 1]
for (pattern in names(label_rules)) {
    features_fixed <- gsub(pattern, label_rules[[pattern]], features_fixed)
}

names(data_x) <- features_fixed
names(data_y) <- "activitynr"

# Lookup table from activity number ("nr") to activity label ("activity").
activities <- read.csv(file.path(dir, "activity_labels.txt"),
                       sep = "", header = FALSE,
                       col.names = c("nr", "activity"))

# Put subject, activity number and features side by side, then attach the
# activity label by joining on the activity number.
data <- merge(cbind(subjects, data_y, data_x), activities,
              by.x = "activitynr", by.y = "nr")

# The numeric code is redundant once the label is present, so drop it.
data <- data[, names(data) != "activitynr", drop = FALSE]

# Average every feature column per (subject, activity) pair.
# across(everything(), mean) replaces the deprecated summarise_each(funs(mean));
# everything() excludes the grouping columns, so only the features are averaged.
# .groups = "drop" returns an ungrouped table, so nothing downstream inherits
# a leftover grouping by subject.
result <- data %>%
    group_by(subject, activity) %>%
    summarise(across(everything(), mean), .groups = "drop")

# Write the result to result.txt in the working directory, without row names.
# The argument is spelled out as row.names instead of relying on partial
# argument matching of "row.name".
write.table(result, "result.txt", row.names = FALSE)