#### ---------------------------------------------------------------------------

library(diffrprojects)

# internal helpers from the diffrprojects namespace, pulled into the global
# environment for interactive use below
is_unique               <- diffrprojects:::is_unique
is_minimum              <- diffrprojects:::is_minimum
dim1                    <- diffrprojects:::dim1
which_dist_min_absolute <- diffrprojects:::which_dist_min_absolute
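# sanity note (illustrative, not from the original script): the code further
# down uses which_dist_min_absolute() as if, for each value of its first
# argument, it returned the absolute distance to the nearest value of the
# second argument ($minimum) together with that value's index ($location), e.g.
# which_dist_min_absolute(c(1, 5, 9), c(4, 10))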

library(dplyr)
library(data.table)
library(dtplyr)
library(Rcpp)

#### ---------------------------------------------------------------------------

text_path  <- "~/Dropbox/IDEP_Database/rawdata/AUT/txts"

text_files <- list.files(text_path, pattern = "txt", full.names = TRUE)

text1 <- rtext$new(text_file = text_files[13], encoding = "latin1")$text_get()
text2 <- rtext$new(text_file = text_files[14], encoding = "latin1")$text_get()

# small test files shipped with stringb -- these overwrite the two texts above
text1 <- rtext$new(text_file = stringb:::test_file("rc_2.txt"))$text_get()
text2 <- rtext$new(text_file = stringb:::test_file("rc_3.txt"))$text_get()

tokenizer <- text_tokenize_words
clean     <- function(x){ x }

# dummy distance: a random matrix standing in for a real token distance measure
distance  <- function(x, y){
  matrix(runif(length(x) * length(y), 0, 100), nrow = length(x), ncol = length(y))
}

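# a minimal sketch of a real distance (not part of the original script):
# base R's utils::adist() already returns the length(x) by length(y) matrix
# of edit distances that the random stub above only mimics
distance_adist <- function(x, y){
  adist(x, y)
}
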
#### ---------------------------------------------------------------------------

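# moc_helper_trivial_matches() is not defined in this file; the following is a
# sketch of what it plausibly does, reconstructed from the earlier line-wise
# prototype: merge both token tables on the token text and keep only tokens
# that occur exactly once on each side (unique 1:1 matches)
moc_helper_trivial_matches <- function(tt1, tt2){
  matches <- merge(tt1, tt2, by = "token")
  matches <- matches[ is_unique(matches$token_i.x) & is_unique(matches$token_i.y), ]
  matches <- matches[ order(matches$token_i.x, matches$token_i.y), ]
  data.frame(
    token_i_1 = matches$token_i.x,
    token_i_2 = matches$token_i.y
  )
}
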
moc <- function(
  text1     = NULL,
  text2     = NULL,
  tokenizer = text_tokenize_lines,
  clean     = function(x){ x },
  distance  = NULL
){

  # tokenize
  message(" - tokenizing")
  # keep the first three tokenizer columns and add a running token index
  text1_tokenized <- tokenizer(text1)[1:3]
  text1_tokenized$token_i <- seq_along(text1_tokenized$token)

  text2_tokenized <- tokenizer(text2)[1:3]
  text2_tokenized$token_i <- seq_along(text2_tokenized$token)

  # clean
  message(" - cleaning")
  text1_tokenized$token <- clean(text1_tokenized$token)
  text2_tokenized$token <- clean(text2_tokenized$token)


  # alignment and distances

  #### trivial matches -- unique 1:1 matches
  message(" - trivial matching")
  res <- moc_helper_trivial_matches( tt1 = text1_tokenized, tt2 = text2_tokenized )

  #### matching remaining text1 tokens against remaining text2 tokens
  message(" - easy matching")
  tt1 <-
    text1_tokenized %>%
    subset( !(token_i %in% res$token_i_1) ) %>%
    data.table::as.data.table() %>%
    data.table::setkey(token)
  tt2 <-
    text2_tokenized %>%
    subset( !(token_i %in% res$token_i_2) ) %>%
    data.table::as.data.table() %>%
    data.table::setkey(token)

  # for each unmatched text1 token: distance to the nearest trivial match ...
  dist <- which_dist_min_absolute(tt1$token_i, res$token_i_1)
  tt1$min_dist_1 <- dist$minimum

  # ... and the anchor pair that nearest trivial match belongs to
  tmp <- subset(res[dist$location, ], TRUE, c(token_i_1, token_i_2))
  names(tmp) <- paste0("res_", names(tmp))

  tt1_tmp <- cbind(subset(tt1, TRUE, c(token, token_i, min_dist_1)), tmp)
  tt1_tmp <- suppressWarnings( left_join(tt1_tmp, subset(tt2, TRUE, c(token, token_i)), by = "token") )
  names(tt1_tmp)[names(tt1_tmp) == "token_i.x"] <- "token_i_1"
  names(tt1_tmp)[names(tt1_tmp) == "token_i.y"] <- "token_i_2"

  tt1_tmp[, token := NULL]
  tt1_tmp[, res_token_i_1 := NULL]

  # distance of each candidate text2 token to the anchor's text2 position
  tt1_tmp$min_dist_2 <- abs(tt1_tmp$res_token_i_2 - tt1_tmp$token_i_2)
  tt1_tmp[, res_token_i_2 := NULL]

  # prefer candidates that sit close to an anchor in both texts
  setorder(tt1_tmp, min_dist_1, min_dist_2, token_i_1, token_i_2)

  tt1_tmp <- tt1_tmp[!is.na(token_i_2), ]
  tt1_tmp[, min_dist_1 := NULL]
  tt1_tmp[, min_dist_2 := NULL]

  # the distance argument is not used yet -- distance-based matching of the
  # remaining tokens is still to be done; returning the combined trivial and
  # easy matches is an assumption about the intended result
  rbind(res, as.data.frame(tt1_tmp))
}

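# usage sketch: run the prototype on the texts, tokenizer, and stubs defined
# above; `matches` is assumed to be a two-column table of matched token indices
matches <- moc(text1, text2, tokenizer = tokenizer, clean = clean, distance = distance)
head(matches)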