Skip to content

Commit 5672ad5

Browse files
committed
end_of_day
1 parent f0ae3e3 commit 5672ad5

19 files changed

+359
-48
lines changed

DESCRIPTION

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ Imports:
3535
hellno (>= 0.0.1),
3636
magrittr (>= 1.5),
3737
digest (>= 0.6.9),
38+
dplyr (>= 0.5.0),
39+
data.table (>= 1.9.6),
40+
dtplyr (>= 0.0.1),
41+
Rcpp (>= 0.12.6),
3842
stats,
3943
graphics
4044
Suggests:
@@ -45,4 +49,5 @@ BugReports: https://github.com/petermeissner/diffrprojects/issues
4549
URL: https://github.com/petermeissner/diffrprojects
4650
RoxygenNote: 5.0.1
4751
VignetteBuilder: knitr
48-
Additional_repositories: http://petermeissner.github.io/drat
52+
LinkingTo:
53+
Rcpp

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
export("%>%")
44
export(diffrproject)
55
export(dp_text_base_data)
6+
export(moc_helper_trivial_matches)
67
import(hellno)
78
import(rtext)
89
import(stringb)
910
importFrom(R6,R6Class)
1011
importFrom(magrittr,"%>%")
12+
useDynLib(diffrprojects)

R/RcppExports.R

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# This file was generated by Rcpp::compileAttributes
2+
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3+
4+
#' (function to calculate distance matrix of integers)
#'
#' Thin R wrapper around a compiled routine: takes a vector of size n and a
#' vector of size m and gives back a matrix of n rows and m columns.
#' Presumably the entries are absolute pairwise differences (the computation
#' lives in the C++ code registered as 'diffrprojects_dist_mat_absolute') —
#' confirm against the C++ source.
#'
#' NOTE(review): this file is generated by Rcpp::compileAttributes; edit the
#' roxygen header in the C++ source, not here, or changes will be overwritten.
#'
#' @param x a vector of type numeric
#' @param y a vector of type numeric
#' @keywords internal
dist_mat_absolute <- function(x, y) {
    .Call('diffrprojects_dist_mat_absolute', PACKAGE = 'diffrprojects', x, y)
}
12+
13+
#' (function to calculate minimum and position of minimum)
#'
#' Thin R wrapper around a compiled routine: takes vector of size n and vector
#' of size m and gives back a list with vectors of size n (minimum distance
#' and location of the minimum in y). The actual computation lives in the C++
#' code registered as 'diffrprojects_which_dist_min_absolute'.
#'
#' NOTE(review): this file is generated by Rcpp::compileAttributes; edit the
#' roxygen header in the C++ source, not here, or changes will be overwritten.
#'
#' @param x a vector of type integer
#' @param y a vector of type integer
#' @keywords internal
which_dist_min_absolute <- function(x, y) {
    .Call('diffrprojects_which_dist_min_absolute', PACKAGE = 'diffrprojects', x, y)
}
22+

R/imports.R

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#' @import hellno
44
#' @import stringb
55
#' @import rtext
6+
#' @useDynLib diffrprojects
67
dummyimport <- function(){
78
R6::R6Class()
89
1 %>% magrittr::add(1)

R/imports.r

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#' imports
#'
#' Dummy function whose sole purpose is to carry the package-level roxygen
#' import tags below. The body references the imported packages' namespaces
#' so that R CMD check does not flag the declared dependencies as unused;
#' the function is never meant to be called for its result.
#'
#' @importFrom R6 R6Class
#' @import hellno
#' @import stringb
#' @import rtext
#' @useDynLib diffrprojects
dummyimport <- function(){
  R6::R6Class()
  1 %>% magrittr::add(1)
}

#' re-export of the magrittr pipe operator
#' @importFrom magrittr %>%
#' @export
magrittr::`%>%`

R/moc_helper.R

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#' trivial matches
#'
#' Method-of-comparison helper function: pairs up tokens of two tokenized
#' texts by exact token text and keeps only the unambiguous 1:1 matches,
#' i.e. rows whose token index occurs exactly once on each side.
#'
#' @param tt1 tokenized text number 1 (data.frame with columns token, token_i)
#' @param tt2 tokenized text number 2 (data.frame with columns token, token_i)
#' @return a data.table with columns token, token_i_1, token_i_2 — one row
#'   per unambiguous match, ordered by token_i_1 then token_i_2
#' @export
moc_helper_trivial_matches <- function(tt1, tt2){
  # preparation: keep only the columns needed for joining, key on token text
  tt1 <- subset( tt1, TRUE, c(token, token_i))
  tt1 <- data.table::as.data.table(tt1)
  data.table::setkey(tt1, token)

  tt2 <- subset( tt2, TRUE, c(token, token_i))
  tt2 <- data.table::as.data.table(tt2)
  data.table::setkey(tt2, token)

  # merge / join: all candidate pairs of equal tokens
  matches <-
    suppressWarnings(dplyr::inner_join(tt1, tt2, by = "token"))
  data.table::setkey(matches, token_i.x, token_i.y)

  # clean up names: rename the join suffixes explicitly via setnames()
  # (bug fix: the previous blanket text_replace of "x"/"y" would corrupt any
  #  column name containing those letters, and `names<-` on a data.table
  #  bypasses data.table's internal bookkeeping)
  data.table::setnames(
    matches,
    c("token_i.x", "token_i.y"),
    c("token_i_1", "token_i_2")
  )

  # keep unique matches only: a trivial match is a token index taking part in
  # exactly one candidate pair on both sides, so drop every index that occurs
  # more than once (bug fix: the previous code used unique token_i *values*
  # as row indices, but those are token positions, not row numbers)
  keep <- !( duplicated(matches$token_i_1) |
             duplicated(matches$token_i_1, fromLast = TRUE) )
  matches <- matches[keep, ]
  keep <- !( duplicated(matches$token_i_2) |
             duplicated(matches$token_i_2, fromLast = TRUE) )
  matches <- matches[keep, ]

  # return
  return(matches)
}

R/tools.R

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
#' which are minima in vector
#'
#' Flags every element of x that equals the vector's minimum. With
#' unique = TRUE, repeated values keep only their first occurrence flagged
#' (via duplicated()), so duplicate minima after the first are FALSE.
#'
#' @param x vector to check
#' @param unique defaults to FALSE; if TRUE, de-duplicate flagged minima
#' @return a logical vector the same length as x
#' @keywords internal
is_minimum <- function(x, unique = FALSE){
  at_minimum <- x == min(x)
  if ( unique ) {
    at_minimum & !duplicated(x)
  } else {
    at_minimum
  }
}
16+
117
#' checking if value is unique in set
218
#' @param x vector to check
319
#' @keywords internal
@@ -16,7 +32,7 @@ is_duplicate <- function(x){
1632
#' @param l list
1733
#' @param item name or index of item to extract
1834
#' @param unlist defaults to TRUE, whether to unlist results or leave as list
19-
#' @keywords internal
35+
#' @keywords internal
2036
get_list_item <- function(l, item, unlist=TRUE){
2137
tmp <- lapply(l, `[`, item)
2238
index <- vapply(tmp, is.null, TRUE)

dev.R

Lines changed: 82 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,77 +1,115 @@
11
#### ---------------------------------------------------------------------------
22

33
library(diffrprojects)
4+
is_unique <- diffrprojects:::is_unique
5+
is_minimum <- diffrprojects:::is_minimum
6+
dim1 <- diffrprojects:::dim1
7+
which_dist_min_absolute <- diffrprojects:::which_dist_min_absolute
48

59

6-
#### ---------------------------------------------------------------------------
10+
library(dplyr)
11+
library(data.table)
12+
library(dtplyr)
13+
library(Rcpp)
714

815

9-
if( !length(grepl("Windows", as.list(Sys.getenv())$OS))==0 ){
10-
text_path <- "/users/peter/Dropbox/IDEP_Database/rawdata/AUT/txts"
11-
}else{
12-
text_path <- "~/Dropbox/IDEP_Database/rawdata/AUT/txts"
13-
}
1416

15-
text_files <- list.files(text_path, pattern = "txt", full.names = TRUE)
16-
dp <- diffrproject$new()
17-
dp$text_add(text_files, encoding="latin1")
18-
dp$texts_link()
1917

2018

21-
length(dp$texts)
22-
names(dp$texts)
23-
dp$text_data()
19+
#### ---------------------------------------------------------------------------
20+
21+
text_path <- "~/Dropbox/IDEP_Database/rawdata/AUT/txts"
22+
23+
text_files <- list.files(text_path, pattern = "txt", full.names = TRUE)
2424

25+
text1 <- rtext$new(text_file=text_files[13], encoding="latin1")$text_get()
26+
text2 <- rtext$new(text_file=text_files[14], encoding="latin1")$text_get()
2527

28+
text1 <- rtext$new(text_file=stringb:::test_file("rc_2.txt"))$text_get()
29+
text2 <- rtext$new(text_file=stringb:::test_file("rc_3.txt"))$text_get()
2630

27-
text1 <- rtext$new(text_file=text_files[13], encoding="latin1")
28-
text2 <- rtext$new(text_file=text_files[14], encoding="latin1")
31+
tokenizer <- text_tokenize_words
32+
clean <- function(x){x}
33+
distance <- function(x,y){matrix(runif(length(x)*length(y), 0, 100), nrow=length(x), ncol=length(y))}
2934

30-
text1 <- rtext$new(text_file=stringb:::test_file("rc_2.txt"))
31-
text2 <- rtext$new(text_file=stringb:::test_file("rc_3.txt"))
35+
#### ---------------------------------------------------------------------------
3236

33-
moc_linewise_bow <- function(text1, text2, ...){
3437

35-
# tokenize
36-
text1_tokenized <- text_tokenize(text1$text_get(), "\n")
37-
text2_tokenized <- text_tokenize(text2$text_get(), "\n")
38+
moc <- function(
39+
text1 = NULL,
40+
text2 = NULL,
41+
tokenizer = text_tokenize_lines,
42+
clean = function(x){x},
43+
distance = NULL
44+
){}
3845

46+
# tokenize
47+
message(" - tokenizing")
48+
text1_tokenized <- tokenizer(text1)[1:3]
3949
text1_tokenized$token_i <- seq_along(text1_tokenized$token)
50+
51+
text2_tokenized <- tokenizer(text2)[1:3]
4052
text2_tokenized$token_i <- seq_along(text2_tokenized$token)
4153

4254
# clean
43-
# ...
55+
message(" - cleaning")
56+
text1_tokenized$token <- clean(text1_tokenized$token)
57+
text2_tokenized$token <- clean(text2_tokenized$token)
58+
59+
60+
# alignment and distances
61+
62+
#### trivial matches -- unique 1:1 matches
63+
message(" - trivial matching")
64+
res <- moc_helper_trivial_matches( tt1 = text1_tokenized, tt2 = text2_tokenized )
65+
66+
#### matching text1 tokens and text2 tokens
67+
message(" - easy matching")
68+
tt1 <- text1_tokenized %>% subset( !(token_i %in% res$token_i_1) ) %>% data.table::as.data.table() %>% data.table::setkey(token)
69+
tt2 <- text2_tokenized %>% subset( !(token_i %in% res$token_i_2) ) %>% data.table::as.data.table() %>% data.table::setkey(token)
70+
71+
dist <- which_dist_min_absolute(tt1$token_i, res$token_i_1)
72+
tt1$min_dist_1 <- dist$minimum
73+
tmp <- subset(res[dist$location, ], TRUE, c(token_i_1, token_i_2))
74+
names(tmp) <- paste0("res_",names(tmp))
75+
tt1_tmp <- cbind(subset(tt1,TRUE, c(token, token_i, min_dist_1)), tmp)
76+
tt1_tmp <- suppressWarnings( left_join(tt1_tmp, subset(tt2, TRUE, c(token, token_i)), by="token") )
77+
names(tt1_tmp)[names(tt1_tmp)=="token_i.x"] <- "token_i_1"
78+
names(tt1_tmp)[names(tt1_tmp)=="token_i.y"] <- "token_i_2"
79+
80+
tt1_tmp[, token := NULL]
81+
tt1_tmp[, res_token_i_1 := NULL]
82+
83+
tt1_tmp$min_dist_2 <- 0L
84+
tt1_tmp$min_dist_2 <- abs(tt1_tmp$res_token_i_2 - tt1_tmp$token_i_2)
85+
86+
tt1_tmp[, token := NULL]
87+
tt1_tmp[, res_token_i_2 := NULL]
88+
89+
setorder(tt1_tmp, min_dist_1, min_dist_2, token_i_1, token_i_2)
90+
91+
tt1_tmp <- subset(tt1_tmp, TRUE, c(token_i_1, token_i_2))
92+
tt1_tmp <- tt1_tmp[!is.na(tt1_tmp$token_i_2),]
93+
tt1_tmp[, min_dist_1 := NULL]
94+
tt1_tmp[, min_dist_2 := NULL]
95+
96+
97+
98+
99+
100+
101+
102+
103+
44104

45-
# trivial matches
46-
matches <- merge(text1_tokenized, text2_tokenized, by="token")
47-
matches <-
48-
matches[
49-
is_unique(matches$token_i.x) & is_unique(matches$token_i.y) ,
50-
]
51-
matches <- matches[order(matches$token_i.x, matches$token_i.y),]
52105

53106

54-
# statistics
55-
sum(!(text1_tokenized$token_i %in% matches$token_i.x))
56-
sum(!(text2_tokenized$token_i %in% matches$token_i.y))
57107

58-
# data
59-
data.frame(
60-
text1 = substr(text1_tokenized$token, 1,15),
61-
line1 = seq_along(text1_tokenized$from),
62-
line2 = seq_along(text2_tokenized$from)[m1],
63-
text2 = substr(text2_tokenized$token[m1], 1,15)
64-
)
65108

66109

67-
}
68110

69111

70-
# moc_template <- function(text1, text2, tokenize, clean, distance){
71-
#
72-
# }
73112

74113

75114

76-
#### ---------------------------------------------------------------------------
77115

man/dist_mat_absolute.Rd

Lines changed: 20 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/dummyimport.Rd

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)