Commit 9a34f5b

restructuring rtext
1 parent e9f14b1 commit 9a34f5b

29 files changed: +33 additions, −908 deletions

DESCRIPTION

Lines changed: 5 additions & 6 deletions

@@ -1,7 +1,7 @@
 Package: diffrprojects
 Title: Using diffr for more than two files
-Date: 2016-07-06
-Version: 0.1.0.90000
+Date: 2016-08-01
+Version: 0.1.1.90000
 Authors@R: c(
     person(
       "Peter", "Meissner",
@@ -32,8 +32,9 @@ Imports:
     R6 (>= 2.1.2),
     hellno (>= 0.0.1),
     magrittr (>= 1.5),
-    Rcpp (>= 0.12.5),
     digest (>= 0.6.9),
+    stringb (>= 0.1.0),
+    rtext (>= 0.1.0),
     stats,
     graphics
 Suggests:
@@ -43,7 +44,5 @@ Suggests:
 BugReports: https://github.com/petermeissner/diffrprojects/issues
 URL: https://github.com/petermeissner/diffrprojects
 RoxygenNote: 5.0.1
-VignetteBuilder: knitr
-LinkingTo:
-    Rcpp
+
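The dependency swap above trades the compiled Rcpp linkage for two plain R packages, stringb and rtext, so the package no longer needs compilation at install time. A hypothetical install sketch for the two new dependencies, assuming they live under the author's GitHub account as the URL field suggests:

# install the two new dependencies from GitHub (repository names assumed)
devtools::install_github("petermeissner/stringb")  # string helpers: text_c(), text_extract(), ...
devtools::install_github("petermeissner/rtext")    # the rtext R6 text class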

NAMESPACE

Lines changed: 2 additions & 26 deletions

@@ -1,45 +1,21 @@
 # Generated by roxygen2: do not edit by hand

-S3method(plot,rtext)
-S3method(text_collapse,data.frame)
-S3method(text_collapse,default)
-S3method(text_collapse,list)
-S3method(text_tokenize,default)
-S3method(text_tokenize,rtext)
 export(classes)
 export(diffrproject)
 export(dim1)
 export(dim2)
-export(dp_hash)
 export(dp_ls)
 export(dp_text_base_data)
 export(dp_tf)
 export(get_vector_element)
 export(is_between)
-export(load_into)
 export(modus)
 export(rbind_fill)
-export(rtext)
-export(rtext_get_character)
-export(rtext_tokenizer_list)
 export(seq_dim1)
 export(shift)
-export(text_collapse)
-export(text_eval)
-export(text_extract)
-export(text_extract_all)
-export(text_length)
-export(text_nchar)
-export(text_read)
-export(text_show)
-export(text_snippet)
-export(text_tokenize)
-export(text_tokenize_words)
-export(vector_delete)
 export(which_token)
-export(which_token_worker)
 import(hellno)
+import(rtext)
+import(stringb)
 importFrom(R6,R6Class)
-importFrom(Rcpp,sourceCpp)
 importFrom(magrittr,"%>%")
-useDynLib(diffrprojects)
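The dropped exports moved with the split rather than disappearing: the rtext class now lives in the new rtext package and the text_* helpers in stringb, both imported wholesale via import(). A minimal sketch of the new call sites, assuming rtext$new() also accepts a literal text argument in addition to the text_file form shown in dev.R below:

library(rtext)    # rtext R6 class and its char_*/text_* methods
library(stringb)  # text_c(), text_extract(), text_read(), text_snippet(), ...

rt <- rtext$new(text = "A first example text.")  # `text =` argument assumed
rt$text_get()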

NEWS.md

Lines changed: 6 additions & 89 deletions

@@ -1,119 +1,36 @@
 NEWS diffrprojects
 ==========================================================================

-version 0.5.0 // 2016-06-09 ...
+version 0.1.2 // 2016-06-09 ...
 --------------------------------------------------------------------------

 * BUGFIXES
-  - fixing which_token supressing results error

-* FEATURE
-  - rtext : char_code() -> char_data_set()
-  - rtext : char_get_code() -> char_data_get()
-  - rtext : introducing tokens and token_data
-  - introducing Rcpp to speed up: which_token_worker, which_token
-
-
-version 0.4.2 // 2016-06-07 ...
---------------------------------------------------------------------------
-
-* BUGFIXES


 * FEATURE
-  - rtext : char_code()
-  - rtext : char_length()
-
-
-version 0.4.1 // 2016-06-06 ...
---------------------------------------------------------------------------
-
-* BUGFIXES
-  - rtext : text_get() and char_get() would return text decently encoded as UTF-8 nut fail to tell Windows about that
-
-* FEATURE
-
-
-version 0.4.0 // 2016-05-14 ...
---------------------------------------------------------------------------
-
-* BUGFIXES
-

-* FEATURE
-  - rtext : save()
-  - rtext : load()
-  - rtext : text is tokenized into characters and then stored in characters
-  - rtext : char_add()
-  - rtext : char_delete()
-  - rtext : char_replace()
-  - rtext : text_hash()
-  - rtext : hash_text()

+* DEVELOPMENT



-version 0.3.4 // 2016-05-13 ...
+version 0.1.1 // 2016-06-07 ...
 --------------------------------------------------------------------------

 * BUGFIXES
-  - rtext : getting tokenization on init right
-
-* FEATURE
-  - dp_text : !!! rename to rtext :-) !!!
-

-version 0.3.3 // 2016-05-09 ...
---------------------------------------------------------------------------
-
-* BUGFIXES
-  - fixing documentation and minor build check complaints

 * FEATURE
-  - dp_text() : add tokenization to initializetion stage
-
-
-version 0.3.2 // 2016-05-09 ...
---------------------------------------------------------------------------

-* FEATURES
-  - tools : text_tokenize()
-  - tools : text_tokenize_words()
-
-
-version 0.2.0 // 2016-04-28 ...
---------------------------------------------------------------------------
-
-* FEATURES
-  - dp_text : show_text()
-  - dp_text : info()
-  - dp_text : get_text()
-
-
-version 0.1.0 // 2016-04-27 ...
---------------------------------------------------------------------------

-* FEATURES
-  - dp_text : an object for text (basic layout)
-  - tools : text_read() function for reading text
-  - tools : text_snippet() function for getting snippet of text
+
+* DEVELOPMENT
+  - big big restructuring: putting rtext into separate package




-version 0.0.1 // 2016-04-26 ...
+version 0.1.0 // 2016-04-26 ...
 --------------------------------------------------------------------------

 * START of development

R/diffrproject.R

Lines changed: 2 additions & 2 deletions

@@ -68,12 +68,12 @@ diffrproject <-
   # doing-duty-to-do
   if( is.null(name) ){
     next_num <- max(c(as.numeric(text_extract(names, "\\d+")),0))+1
-    name <- text_collapse( "noname_", next_num)
+    name <- text_c( "noname_", next_num)
   }
   self$texts[[name]] <- rtext
   i <- 0
   while( rtext$id %in% ids ){
-    rtext$id <- text_collapse(id, "_", i)
+    rtext$id <- text_c(id, "_", i)
     i <- i+1
   }
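The old bundled text_collapse() is replaced by stringb's text_c(). A small sketch of what this naming code path now produces, assuming text_c() concatenates its arguments paste0-style:

library(stringb)

# with two texts already registered, the next fallback name is "noname_3"
names    <- c("noname_1", "noname_2")
next_num <- max(c(as.numeric(text_extract(names, "\\d+")), 0)) + 1
text_c("noname_", next_num)
# expected: "noname_3"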

R/imports.r

Lines changed: 5 additions & 3 deletions

@@ -2,11 +2,13 @@
 #' @importFrom R6 R6Class
 #' @import hellno
 #' @importFrom magrittr %>%
+#' @import stringb
+#' @import rtext
 dummyimport <- function(){
   R6::R6Class()
   1 %>% magrittr::add(1)
 }

-#' @useDynLib diffrprojects
-#' @importFrom Rcpp sourceCpp
-NULL
+# #' @useDynLib diffrprojects
+# #' @importFrom Rcpp sourceCpp
+# NULL
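These roxygen directives are what generate the NAMESPACE changes above: `#' @import stringb` becomes `import(stringb)`, and commenting out the @useDynLib block removes the useDynLib()/sourceCpp entries. After editing them, the NAMESPACE is rebuilt with, for example:

# regenerate NAMESPACE (and man/ pages) from the roxygen comments;
# devtools::document() wraps roxygen2::roxygenise()
devtools::document()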

dev.R

Lines changed: 13 additions & 107 deletions

@@ -5,6 +5,9 @@ library(magrittr)
 library(dplyr)
 library(hellno)

+library(stringb)
+library(rtext)
+
 #### ---------------------------------------------------------------------------


@@ -13,115 +16,18 @@ text_files <- list.files(text_path, pattern = "txt", full.names = TRUE)


 dp <- diffrproject$new()
-dp$text_add(rtext = rtext$new(text_file=text_files[1], encoding="latin1", tokenize_by="\n"),name = basename(text_files[1]))
-dp$text_add(rtext = rtext$new(text_file=text_files[1], encoding="latin1", tokenize_by="\n"),name = basename(text_files[2]))
-
-
-#### ---------------------------------------------------------------------------
-
-
-rtext_tokenizer_data <- function(rt, tokenize_by){
+dp$text_add(
+  rtext =
+    rtext$new(text_file=text_files[1], encoding="latin1")
+)

-}
+dp$texts

-tokenize_data = function(...){
-  # datanize tokens
-  update_token_data <- function(...){
-    # tokenize if necessary
-    private$tokenize()
-    if( !is.null(private$char_data$i) ){
-      # datanize tokens
-      token_i <- which_token( private$char_data$i, private$token$from, private$token$to )
-      if( "FUN" %in% names(as.list(match.call())) ){
-        # user supplied functions and otpions
-        private$token_data <-
-          private$char_data[,-1] %>%
-          stats::aggregate(by = list( token_i=token_i ), ... )
-      }else{
-        # standard
-        private$token_data <-
-          private$char_data[,-1] %>%
-          stats::aggregate(
-            by = list( token_i=token_i ),
-            FUN="modus",
-            multimodal=NA,
-            warn=FALSE
-          )
-      }
-      names(private$token_data)[-1] <- names(private$char_data)[-1]
-    }
-    # store hashes
-    private$token_store$tok_hashed_data <- private$hashed_data
-    private$token_store$tok_hashed_call <- dp_hash(as.list(match.call()))
-  }
-  # deciding when to re-datanize tokens
-  if( # no datanization has been done so far
-    length(private$hashed_text)==0 |
-    length(private$token_store$tok_hashed_text)==0 |
-    length(private$hashed_data)==0 |
-    length(private$token_store$tok_hashed_data)==0 |
-    length(private$token_store$tok_hashed_call)==0
-  ){
-    self$message("datanizing tokens")
-    update_token_data(...)
-  }else if( # text / data / call has changed
-    private$hashed_text != private$token_store$tok_hashed_text |
-    identical(private$hashed_text, character(0)) |
-    private$hashed_text != private$token_store$tok_hashed_data |
-    identical(private$hashed_data, character(0)) |
-    dp_hash(as.list(match.call())) != private$token_store$tok_hashed_call
-  ){
-    self$message("datanizing tokens")
-    update_token_data(...)
-  }
-}
+dp$text_add(
+  rtext = rtext$new(text_file=text_files[1], encoding="latin1"),
+  name = basename(text_files[2])
+)


-# token_get
-token_get = function(){
-  # tokenize text if necessary else take cache
-  private$tokenize()
-  # return tokens
-  data.frame( private$token, token_i=seq_len(dim1(private$token)) )
-},
-token_data_get = function(...){
-  # tokenize text / gen token data if necessary else take cache
-  private$tokenize_data(...)
-  # return token data
-  private$token_data
-},
-
-
-token_store =
-  list(
-    tok_hashed_text = character(0),
-    tok_hashed_data = character(0),
-    tok_hashed_call = character(0)
-  ),
-
+#### ---------------------------------------------------------------------------

-# get text line information
-text_lines = function(){
-  lengths <- nchar(self$text_get(split="\n"))+1
-  lengths[length(lengths)] <- lengths[length(lengths)]-1
-  res <-
-    data.frame(
-      line_i = seq_along(lengths),
-      from = c(0, cumsum(lengths)[seq_len(length(lengths)-1)] )+1,
-      to = cumsum(lengths),
-      nchar = lengths
-    )
-  return(res)
-},
-text_lines_get = function(lines, nl=FALSE){
-  res <- character(length(lines))
-  lines <- self$text_lines()[lines,]
-  from <- lines$from
-  to <- lines$to
-  for( i in seq_along(from) ){
-    res[i] <- self$text_get(from=from[i], to=to[i] - ifelse(!nl&from[i]<to[i],1,0))
-  }
-  return(res)
-},
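With the tokenizer drafts deleted, dev.R reduces to exercising the project API. A hedged usage sketch of the slimmed-down workflow, assuming rtext$new() also accepts a literal `text =` argument (the commit itself only shows the text_file form):

library(diffrprojects)
library(rtext)

dp <- diffrproject$new()

# name is optional; R/diffrproject.R above falls back to "noname_<n>"
dp$text_add(rtext = rtext$new(text = "first version of some text"))   # `text =` assumed
dp$text_add(rtext = rtext$new(text = "second version of some text"), name = "v2")

dp$texts   # list of registered rtext objects, keyed by name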

man/dp_hash.Rd

Lines changed: 0 additions & 15 deletions
This file was deleted.
