@@ -81,8 +81,6 @@ rtext <-
       tmp = NULL,
       char = character(0),
       char_data = data.frame(),
-      token = data.frame(),
-      token_data = data.frame(),

       hashed_all = character(0),
       hashed_text = character(0),
@@ -102,94 +100,9 @@ rtext <-
         private$hashed_data <- dp_hash(private$char_data)
         private$hashed_text <- dp_hash(private$char)
         private$hashed_all <- dp_hash(list(private$hashed_data, private$hashed_text))
-      },
-
-      token_store =
-        list(
-          tok_hashed_text = character(0),
-          tok_hashed_data = character(0),
-          tok_hashed_call = character(0)
-        ),
-      tokenize = function(){
-        # helper functions
-        update_token <- function(){
-          # tokenize
-          private$token <-
-            self$tokenizer(private$text()) %>%
-            dp_arrange("from", "to")
-          # Encoding(private$token$token) <- "UTF-8"
-          # store text hash
-          private$token_store$tok_hashed_text <- private$hashed_text
-        }
-        # deciding when to re-tokenize
-        if( # no tokenization done so far
-          length(private$hashed_text) == 0 |
-          length(private$token_store$tok_hashed_text) == 0
-        ){
-          self$message("tokenizing")
-          update_token()
-        }else if( # text has changed
-          private$hashed_text != private$token_store$tok_hashed_text |
-          identical(private$hashed_text, character(0))
-        ){
-          self$message("tokenizing")
-          update_token()
-        }
-      },
-      tokenize_data = function(...){
-        # datanize tokens
-        update_token_data <- function(...){
-          # tokenize if necessary
-          private$tokenize()
-          if( !is.null(private$char_data$i) ){
-            # datanize tokens
-            token_i <- which_token(private$char_data$i, private$token$from, private$token$to)
-            if( "FUN" %in% names(as.list(match.call())) ){
-              # user supplied functions and options
-              private$token_data <-
-                private$char_data[,-1] %>%
-                stats::aggregate(by = list(token_i = token_i), ...)
-            }else{
-              # standard
-              private$token_data <-
-                private$char_data[,-1] %>%
-                stats::aggregate(
-                  by = list(token_i = token_i),
-                  FUN = "modus",
-                  multimodal = NA,
-                  warn = FALSE
-                )
-            }
-            names(private$token_data)[-1] <- names(private$char_data)[-1]
-          }
-          # store hashes
-          private$token_store$tok_hashed_data <- private$hashed_data
-          private$token_store$tok_hashed_call <- dp_hash(as.list(match.call()))
-        }
-        # deciding when to re-datanize tokens
-        if( # no datanization has been done so far
-          length(private$hashed_text) == 0 |
-          length(private$token_store$tok_hashed_text) == 0 |
-          length(private$hashed_data) == 0 |
-          length(private$token_store$tok_hashed_data) == 0 |
-          length(private$token_store$tok_hashed_call) == 0
-        ){
-          self$message("datanizing tokens")
-          update_token_data(...)
-        }else if( # text / data / call has changed
-          private$hashed_text != private$token_store$tok_hashed_text |
-          identical(private$hashed_text, character(0)) |
-          private$hashed_data != private$token_store$tok_hashed_data |
-          identical(private$hashed_data, character(0)) |
-          dp_hash(as.list(match.call())) != private$token_store$tok_hashed_call
-        ){
-          self$message("datanizing tokens")
-          update_token_data(...)
-        }
       }
     ),

-
     #### public ==================================================================
     public = list(

@@ -209,10 +122,8 @@ rtext <-
       function(
         text = NULL,
         text_file = NULL,
-        tokenizer = function(x){text_tokenize(x, "\n", non_token = TRUE)},
         encoding = "UTF-8",
         id = NULL,
-        tokenize_by = NULL,
         save_file = NULL,
         verbose = TRUE
       )
@@ -251,24 +162,6 @@ rtext <-
         Encoding(private$char) <- "UTF-8"
         self$encoding <- "UTF-8"

-        #### Tokenizer
-        # assign tokenizer
-        self$tokenizer <- tokenizer
-        if( !is.null(tokenize_by) ){
-          self$tokenizer <-
-            function(x){
-              text_tokenize(x, regex = tokenize_by, non_token = TRUE)
-            }
-        }
-        # check if tokenizer is valid
-        stopifnot( "data.frame" %in% class(self$tokenizer("")) )
-        stopifnot( dim2(self$tokenizer("")) == 4 )
-
-
-        #### Tokenize
-        private$tokenize()
-
-
         ##### ID
         if( is.null(id) ){
           self$id <- dp_hash(self)
@@ -313,7 +206,6 @@ rtext <-
         list(
           text_file = self$text_file,
           character = length(private$char),
-          token = sum(self$token_get()$is_token),
           encoding = self$encoding,
           sourcetype = self$sourcetype
         )
@@ -334,29 +226,6 @@ rtext <-
         }
         return(res)
       },
-      # get text line information
-      text_lines = function(){
-        lengths <- nchar(self$text_get(split = "\n")) + 1
-        lengths[length(lengths)] <- lengths[length(lengths)] - 1
-        res <-
-          data.frame(
-            line_i = seq_along(lengths),
-            from = c(0, cumsum(lengths)[seq_len(length(lengths) - 1)]) + 1,
-            to = cumsum(lengths),
-            nchar = lengths
-          )
-        return(res)
-      },
-      text_lines_get = function(lines, nl = FALSE){
-        res <- character(length(lines))
-        lines <- self$text_lines()[lines, ]
-        from <- lines$from
-        to <- lines$to
-        for( i in seq_along(from) ){
-          res[i] <- self$text_get(from = from[i], to = to[i] - ifelse(!nl & from[i] < to[i], 1, 0))
-        }
-        return(res)
-      },
       # char_get
       char_get = function(length = Inf, from = NULL, to = NULL, raw = FALSE){
         if( raw | identical(length, TRUE) ){
@@ -497,10 +366,7 @@ rtext <-
           text_file = self$text_file,
           encoding = self$encoding,
           save_file = self$save_file,
-          tokenizer = self$tokenizer,
           sourcetype = self$sourcetype,
-          token = private$token,
-          token_data = private$token_data,
           session_info = list(
             dp_version = packageVersion("diffrprojects"),
             r_version = paste(version$major, version$minor, sep = "."),
@@ -544,36 +410,20 @@ rtext <-
         # setting public
         self$id <- tmp$id
         self$text_file <- tmp$text_file
-        self$tokenizer <- tmp$tokenizer
         self$encoding <- tmp$encoding
         self$sourcetype <- tmp$sourcetype
         self$save_file <- tmp$save_file

         # setting private
         private$char <- tmp$char
         private$char_data <- tmp$char_data
-        private$token <- tmp$token
-        private$token_data <- tmp$token_data

         # updating rest
         private$hash_all()

         # return for piping
         invisible(self)
       },
-      # token_get
-      token_get = function(){
-        # tokenize text if necessary else take cache
-        private$tokenize()
-        # return tokens
-        data.frame(private$token, token_i = seq_len(dim1(private$token)))
-      },
-      token_data_get = function(...){
-        # tokenize text / gen token data if necessary else take cache
-        private$tokenize_data(...)
-        # return token data
-        private$token_data
-      },
       # save_as
       export = function(){
         message("TBD")