
Commit 7d73e58: "end of day"

1 parent: 6d37241

23 files changed, +505 -481 lines

DESCRIPTION

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 Package: diffrprojects
 Title: Using diffr for more than two files
 Date: 2016-07-06
-Version: 0.5.1.90000
+Version: 0.6.0.90000
 Authors@R: c(
     person(
         "Peter", "Meissner",

NAMESPACE

Lines changed: 45 additions & 43 deletions

@@ -1,43 +1,45 @@
-# Generated by roxygen2: do not edit by hand
-
-S3method(plot,rtext)
-S3method(text_collapse,data.frame)
-S3method(text_collapse,default)
-S3method(text_collapse,list)
-export(classes)
-export(diffrproject)
-export(dim1)
-export(dim2)
-export(dp_hash)
-export(dp_ls)
-export(dp_text_base_data)
-export(dp_tf)
-export(get_vector_element)
-export(is_between)
-export(load_into)
-export(modus)
-export(rbind_fill)
-export(rtext)
-export(rtext_get_character)
-export(rtext_tokenizer)
-export(seq_dim1)
-export(shift)
-export(text_collapse)
-export(text_eval)
-export(text_extract)
-export(text_extract_all)
-export(text_length)
-export(text_nchar)
-export(text_read)
-export(text_show)
-export(text_snippet)
-export(text_tokenize)
-export(text_tokenize_words)
-export(vector_delete)
-export(which_token)
-export(which_token_worker)
-import(hellno)
-importFrom(R6,R6Class)
-importFrom(Rcpp,sourceCpp)
-importFrom(magrittr,"%>%")
-useDynLib(diffrprojects)
+# Generated by roxygen2: do not edit by hand
+
+S3method(plot,rtext)
+S3method(text_collapse,data.frame)
+S3method(text_collapse,default)
+S3method(text_collapse,list)
+S3method(text_tokenize,default)
+S3method(text_tokenize,rtext)
+export(classes)
+export(diffrproject)
+export(dim1)
+export(dim2)
+export(dp_hash)
+export(dp_ls)
+export(dp_text_base_data)
+export(dp_tf)
+export(get_vector_element)
+export(is_between)
+export(load_into)
+export(modus)
+export(rbind_fill)
+export(rtext)
+export(rtext_get_character)
+export(rtext_tokenizer_list)
+export(seq_dim1)
+export(shift)
+export(text_collapse)
+export(text_eval)
+export(text_extract)
+export(text_extract_all)
+export(text_length)
+export(text_nchar)
+export(text_read)
+export(text_show)
+export(text_snippet)
+export(text_tokenize)
+export(text_tokenize_words)
+export(vector_delete)
+export(which_token)
+export(which_token_worker)
+import(hellno)
+importFrom(R6,R6Class)
+importFrom(Rcpp,sourceCpp)
+importFrom(magrittr,"%>%")
+useDynLib(diffrprojects)
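
The NAMESPACE now registers two S3 methods, text_tokenize.default and text_tokenize.rtext, moving tokenization onto a generic. Below is a minimal, self-contained sketch of that dispatch mechanism; only the generic and method names come from the diff, the bodies (including the assumed text_get() accessor on rtext objects) are illustrative, not the package's actual implementations:

# Sketch only: illustrates the S3 dispatch the NAMESPACE now registers.
# Method bodies here are assumptions, not the package's real code.
text_tokenize <- function(x, ...) UseMethod("text_tokenize")

text_tokenize.default <- function(x, regex = "\n", non_token = TRUE, ...) {
  # stand-in tokenizer: split a plain character string at the regex
  data.frame(token = strsplit(x, regex)[[1]], stringsAsFactors = FALSE)
}

text_tokenize.rtext <- function(x, ...) {
  # hypothetical: delegate to the default method on the object's text
  text_tokenize(x$text_get(), ...)
}

text_tokenize("line one\nline two")  # dispatches to the default method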

R/rtext.R

Lines changed: 0 additions & 150 deletions

@@ -81,8 +81,6 @@ rtext <-
       tmp = NULL,
       char = character(0),
       char_data = data.frame(),
-      token = data.frame(),
-      token_data = data.frame(),
 
       hashed_all = character(0),
       hashed_text = character(0),
@@ -102,94 +100,9 @@
         private$hashed_data <- dp_hash(private$char_data)
         private$hashed_text <- dp_hash(private$char)
         private$hashed_all <- dp_hash(list(private$hashed_data, private$hashed_text))
-      },
-
-      token_store =
-        list(
-          tok_hashed_text = character(0),
-          tok_hashed_data = character(0),
-          tok_hashed_call = character(0)
-        ),
-      tokenize = function(){
-        # helper functions
-        update_token <- function(){
-          # tokenize
-          private$token <-
-            self$tokenizer(private$text()) %>%
-            dp_arrange("from","to")
-          #Encoding(private$token$token) <- "UTF-8"
-          # store text hash
-          private$token_store$tok_hashed_text <- private$hashed_text
-        }
-        # deciding when to re-tokenize
-        if( # no tokenization done so far
-          length(private$hashed_text)==0 |
-          length(private$token_store$tok_hashed_text)==0
-        ){
-          self$message("tokenizing")
-          update_token()
-        }else if( # text has changed
-          private$hashed_text != private$token_store$tok_hashed_text |
-          identical(private$hashed_text, character(0))
-        ){
-          self$message("tokenizing")
-          update_token()
-        }
-      },
-      tokenize_data = function(...){
-        # datanize tokens
-        update_token_data <- function(...){
-          # tokenize if necessary
-          private$tokenize()
-          if( !is.null(private$char_data$i) ){
-            # datanize tokens
-            token_i <- which_token( private$char_data$i, private$token$from, private$token$to )
-            if( "FUN" %in% names(as.list(match.call())) ){
-              # user supplied functions and options
-              private$token_data <-
-                private$char_data[,-1] %>%
-                stats::aggregate(by = list( token_i=token_i ), ... )
-            }else{
-              # standard
-              private$token_data <-
-                private$char_data[,-1] %>%
-                stats::aggregate(
-                  by = list( token_i=token_i ),
-                  FUN="modus",
-                  multimodal=NA,
-                  warn=FALSE
-                )
-            }
-            names(private$token_data)[-1] <- names(private$char_data)[-1]
-          }
-          # store hashes
-          private$token_store$tok_hashed_data <- private$hashed_data
-          private$token_store$tok_hashed_call <- dp_hash(as.list(match.call()))
-        }
-        # deciding when to re-datanize tokens
-        if( # no datanization has been done so far
-          length(private$hashed_text)==0 |
-          length(private$token_store$tok_hashed_text)==0 |
-          length(private$hashed_data)==0 |
-          length(private$token_store$tok_hashed_data)==0 |
-          length(private$token_store$tok_hashed_call)==0
-        ){
-          self$message("datanizing tokens")
-          update_token_data(...)
-        }else if( # text / data / call has changed
-          private$hashed_text != private$token_store$tok_hashed_text |
-          identical(private$hashed_text, character(0)) |
-          private$hashed_text != private$token_store$tok_hashed_data |
-          identical(private$hashed_data, character(0)) |
-          dp_hash(as.list(match.call())) != private$token_store$tok_hashed_call
-        ){
-          self$message("datanizing tokens")
-          update_token_data(...)
-        }
       }
     ),
 
-
   #### public ==================================================================
   public = list(
 
@@ -209,10 +122,8 @@ rtext <-
     function(
       text = NULL,
       text_file = NULL,
-      tokenizer = function(x){text_tokenize(x, "\n", non_token = TRUE)},
       encoding = "UTF-8",
       id = NULL,
-      tokenize_by = NULL,
       save_file = NULL,
       verbose = TRUE
     )
@@ -251,24 +162,6 @@
       Encoding(private$char) <- "UTF-8"
       self$encoding <- "UTF-8"
 
-      #### Tokenizer
-      # assign tokenizer
-      self$tokenizer <- tokenizer
-      if( !is.null(tokenize_by) ){
-        self$tokenizer <-
-          function(x){
-            text_tokenize(x, regex = tokenize_by, non_token = TRUE)
-          }
-      }
-      # check if tokenizer is valid
-      stopifnot( "data.frame" %in% class(self$tokenizer("")) )
-      stopifnot( dim2(self$tokenizer(""))==4 )
-
-
-      #### Tokenize
-      private$tokenize()
-
-
       ##### ID
       if( is.null(id) ){
         self$id <- dp_hash(self)
@@ -313,7 +206,6 @@
       list(
         text_file = self$text_file,
         character = length(private$char),
-        token = sum(self$token_get()$is_token),
         encoding = self$encoding,
         sourcetype = self$sourcetype
       )
@@ -334,29 +226,6 @@
       }
       return(res)
     },
-    # get text line information
-    text_lines = function(){
-      lengths <- nchar(self$text_get(split="\n"))+1
-      lengths[length(lengths)] <- lengths[length(lengths)]-1
-      res <-
-        data.frame(
-          line_i = seq_along(lengths),
-          from = c(0, cumsum(lengths)[seq_len(length(lengths)-1)] )+1,
-          to = cumsum(lengths),
-          nchar = lengths
-        )
-      return(res)
-    },
-    text_lines_get = function(lines, nl=FALSE){
-      res <- character(length(lines))
-      lines <- self$text_lines()[lines,]
-      from <- lines$from
-      to <- lines$to
-      for( i in seq_along(from) ){
-        res[i] <- self$text_get(from=from[i], to=to[i] - ifelse(!nl&from[i]<to[i],1,0))
-      }
-      return(res)
-    },
     # char_get
     char_get = function(length=Inf, from=NULL, to=NULL, raw=FALSE){
       if(raw | identical(length, TRUE) ){
@@ -497,10 +366,7 @@
         text_file = self$text_file,
         encoding = self$encoding,
         save_file = self$save_file,
-        tokenizer = self$tokenizer,
         sourcetype = self$sourcetype,
-        token = private$token,
-        token_data = private$token_data,
         session_info = list(
           dp_version=packageVersion("diffrprojects"),
           r_version=paste(version$major, version$minor, sep="."),
@@ -544,36 +410,20 @@
       # setting public
       self$id <- tmp$id
       self$text_file <- tmp$text_file
-      self$tokenizer <- tmp$tokenizer
       self$encoding <- tmp$encoding
       self$sourcetype <- tmp$sourcetype
       self$save_file <- tmp$save_file
 
       # setting private
       private$char <- tmp$char
       private$char_data <- tmp$char_data
-      private$token <- tmp$token
-      private$token_data <- tmp$token_data
 
       # updating rest
       private$hash_all()
 
       # return for piping
       invisible(self)
     },
-    # token_get
-    token_get = function(){
-      # tokenize text if necessary else take cache
-      private$tokenize()
-      # return tokens
-      data.frame( private$token, token_i=seq_len(dim1(private$token)) )
-    },
-    token_data_get = function(...){
-      # tokenize text / gen token data if necessary else take cache
-      private$tokenize_data(...)
-      # return token data
-      private$token_data
-    },
     # save_as
     export = function(){
       message("TBD")

R/rtext_tools.R

Lines changed: 1 addition & 3 deletions

@@ -5,7 +5,7 @@ dp_storage <- new.env(parent = emptyenv())
 
 #' list of ready to use functions for rtext initialization and tokenization
 #' @export
-rtext_tokenizer <- list(
+rtext_tokenizer_list <- list(
   words  = function(x){text_tokenize_words(x, non_token = TRUE )},
  words2 = function(x){text_tokenize_words(x, non_token = FALSE)},
   lines  = function(x){text_tokenize(x, "\n", non_token = TRUE)}

@@ -201,5 +201,3 @@ plot.rtext <-
 
 
 
-
-
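
With the rename, callers pick a prebuilt tokenizer out of rtext_tokenizer_list by name; the three entries are exactly those defined in the hunk above. The removed constructor check (dim2(self$tokenizer(""))==4) and the removed class code's accesses to from, to, token, and is_token suggest each tokenizer returns a four-column data frame. A brief usage sketch:

# assumes library(diffrprojects) is loaded
rtext_tokenizer_list$lines("first line\nsecond line")  # tokenize at newlines
rtext_tokenizer_list$words("one two three")            # word tokenizer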
