#### ---------------------------------------------------------------------------

library(diffrprojects)

# internal helpers from the diffrprojects namespace, pulled into the global
# environment for interactive use below
is_unique               <- diffrprojects:::is_unique
is_minimum              <- diffrprojects:::is_minimum
dim1                    <- diffrprojects:::dim1
which_dist_min_absolute <- diffrprojects:::which_dist_min_absolute
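# sanity note (illustrative, not from the original script): the code further
# down uses which_dist_min_absolute() as if, for each value of its first
# argument, it returned the absolute distance to the nearest value of the
# second argument ($minimum) together with that value's index ($location), e.g.
# which_dist_min_absolute(c(1, 5, 9), c(4, 10))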

library(dplyr)
library(data.table)
library(dtplyr)
library(Rcpp)

#### ---------------------------------------------------------------------------

text_path  <- "~/Dropbox/IDEP_Database/rawdata/AUT/txts"

text_files <- list.files(text_path, pattern = "txt", full.names = TRUE)

text1 <- rtext$new(text_file = text_files[13], encoding = "latin1")$text_get()
text2 <- rtext$new(text_file = text_files[14], encoding = "latin1")$text_get()

# small test files shipped with stringb -- these overwrite the two texts above
text1 <- rtext$new(text_file = stringb:::test_file("rc_2.txt"))$text_get()
text2 <- rtext$new(text_file = stringb:::test_file("rc_3.txt"))$text_get()

tokenizer <- text_tokenize_words
clean     <- function(x){ x }

# dummy distance: a random matrix standing in for a real token distance measure
distance  <- function(x, y){
  matrix(runif(length(x) * length(y), 0, 100), nrow = length(x), ncol = length(y))
}

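# a minimal sketch of a real distance (not part of the original script):
# base R's utils::adist() already returns the length(x) by length(y) matrix
# of edit distances that the random stub above only mimics
distance_adist <- function(x, y){
  adist(x, y)
}
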
#### ---------------------------------------------------------------------------

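# moc_helper_trivial_matches() is not defined in this file; the following is a
# sketch of what it plausibly does, reconstructed from the earlier line-wise
# prototype: merge both token tables on the token text and keep only tokens
# that occur exactly once on each side (unique 1:1 matches)
moc_helper_trivial_matches <- function(tt1, tt2){
  matches <- merge(tt1, tt2, by = "token")
  matches <- matches[ is_unique(matches$token_i.x) & is_unique(matches$token_i.y), ]
  matches <- matches[ order(matches$token_i.x, matches$token_i.y), ]
  data.frame(
    token_i_1 = matches$token_i.x,
    token_i_2 = matches$token_i.y
  )
}
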
moc <- function(
  text1     = NULL,
  text2     = NULL,
  tokenizer = text_tokenize_lines,
  clean     = function(x){ x },
  distance  = NULL
){

  # tokenize
  message(" - tokenizing")
  # keep the first three tokenizer columns and add a running token index
  text1_tokenized <- tokenizer(text1)[1:3]
  text1_tokenized$token_i <- seq_along(text1_tokenized$token)

  text2_tokenized <- tokenizer(text2)[1:3]
  text2_tokenized$token_i <- seq_along(text2_tokenized$token)

  # clean
  message(" - cleaning")
  text1_tokenized$token <- clean(text1_tokenized$token)
  text2_tokenized$token <- clean(text2_tokenized$token)


  # alignment and distances

  #### trivial matches -- unique 1:1 matches
  message(" - trivial matching")
  res <- moc_helper_trivial_matches( tt1 = text1_tokenized, tt2 = text2_tokenized )

  #### matching remaining text1 tokens against remaining text2 tokens
  message(" - easy matching")
  tt1 <-
    text1_tokenized %>%
    subset( !(token_i %in% res$token_i_1) ) %>%
    data.table::as.data.table() %>%
    data.table::setkey(token)
  tt2 <-
    text2_tokenized %>%
    subset( !(token_i %in% res$token_i_2) ) %>%
    data.table::as.data.table() %>%
    data.table::setkey(token)

  # for each unmatched text1 token: distance to the nearest trivial match ...
  dist <- which_dist_min_absolute(tt1$token_i, res$token_i_1)
  tt1$min_dist_1 <- dist$minimum

  # ... and the anchor pair that nearest trivial match belongs to
  tmp <- subset(res[dist$location, ], TRUE, c(token_i_1, token_i_2))
  names(tmp) <- paste0("res_", names(tmp))

  tt1_tmp <- cbind(subset(tt1, TRUE, c(token, token_i, min_dist_1)), tmp)
  tt1_tmp <- suppressWarnings( left_join(tt1_tmp, subset(tt2, TRUE, c(token, token_i)), by = "token") )
  names(tt1_tmp)[names(tt1_tmp) == "token_i.x"] <- "token_i_1"
  names(tt1_tmp)[names(tt1_tmp) == "token_i.y"] <- "token_i_2"

  tt1_tmp[, token := NULL]
  tt1_tmp[, res_token_i_1 := NULL]

  # distance of each candidate text2 token to the anchor's text2 position
  tt1_tmp$min_dist_2 <- abs(tt1_tmp$res_token_i_2 - tt1_tmp$token_i_2)
  tt1_tmp[, res_token_i_2 := NULL]

  # prefer candidates that sit close to an anchor in both texts
  setorder(tt1_tmp, min_dist_1, min_dist_2, token_i_1, token_i_2)

  tt1_tmp <- tt1_tmp[!is.na(token_i_2), ]
  tt1_tmp[, min_dist_1 := NULL]
  tt1_tmp[, min_dist_2 := NULL]

  # the distance argument is not used yet -- distance-based matching of the
  # remaining tokens is still to be done; returning the combined trivial and
  # easy matches is an assumption about the intended result
  rbind(res, as.data.frame(tt1_tmp))
}

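# usage sketch: run the prototype on the texts, tokenizer, and stubs defined
# above; `matches` is assumed to be a two-column table of matched token indices
matches <- moc(text1, text2, tokenizer = tokenizer, clean = clean, distance = distance)
head(matches)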