@@ -5,6 +5,9 @@ library(magrittr)
 library(dplyr)
 library(hellno)
 
+library(stringb)
+library(rtext)
+
 #### ---------------------------------------------------------------------------
 
 
@@ -13,115 +16,18 @@ text_files <- list.files(text_path, pattern = "txt", full.names = TRUE)
 
 
 dp <- diffrproject$new()
-dp$text_add(rtext = rtext$new(text_file = text_files[1], encoding = "latin1", tokenize_by = "\n"), name = basename(text_files[1]))
-dp$text_add(rtext = rtext$new(text_file = text_files[1], encoding = "latin1", tokenize_by = "\n"), name = basename(text_files[2]))
-
-
-#### ---------------------------------------------------------------------------
-
-
-rtext_tokenizer_data <- function(rt, tokenize_by){
+dp$text_add(
+  rtext =
+    rtext$new(text_file = text_files[1], encoding = "latin1")
+)
 
-}
+dp$texts
 
-tokenize_data = function(...){
-  # datanize tokens
-  update_token_data <- function(...){
-    # tokenize if necessary
-    private$tokenize()
-    if( !is.null(private$char_data$i) ){
-      # datanize tokens
-      token_i <- which_token(private$char_data$i, private$token$from, private$token$to)
-      if( "FUN" %in% names(as.list(match.call())) ){
-        # user supplied functions and options
-        private$token_data <-
-          private$char_data[, -1] %>%
-          stats::aggregate(by = list(token_i = token_i), ...)
-      }else{
-        # standard
-        private$token_data <-
-          private$char_data[, -1] %>%
-          stats::aggregate(
-            by = list(token_i = token_i),
-            FUN = "modus",
-            multimodal = NA,
-            warn = FALSE
-          )
-      }
-      names(private$token_data)[-1] <- names(private$char_data)[-1]
-    }
-    # store hashes
-    private$token_store$tok_hashed_data <- private$hashed_data
-    private$token_store$tok_hashed_call <- dp_hash(as.list(match.call()))
-  }
-  # deciding when to re-datanize tokens
-  if( # no datanization has been done so far
-    length(private$hashed_text) == 0 |
-    length(private$token_store$tok_hashed_text) == 0 |
-    length(private$hashed_data) == 0 |
-    length(private$token_store$tok_hashed_data) == 0 |
-    length(private$token_store$tok_hashed_call) == 0
-  ){
-    self$message("datanizing tokens")
-    update_token_data(...)
-  }else if( # text / data / call has changed
-    private$hashed_text != private$token_store$tok_hashed_text |
-    identical(private$hashed_text, character(0)) |
-    private$hashed_data != private$token_store$tok_hashed_data |
-    identical(private$hashed_data, character(0)) |
-    dp_hash(as.list(match.call())) != private$token_store$tok_hashed_call
-  ){
-    self$message("datanizing tokens")
-    update_token_data(...)
-  }
-}
+dp$text_add(
+  rtext = rtext$new(text_file = text_files[1], encoding = "latin1"),
+  name = basename(text_files[2])
+)
 
 
-# token_get
-token_get = function(){
-  # tokenize text if necessary else take cache
-  private$tokenize()
-  # return tokens
-  data.frame(private$token, token_i = seq_len(dim1(private$token)))
-},
-token_data_get = function(...){
-  # tokenize text / gen token data if necessary else take cache
-  private$tokenize_data(...)
-  # return token data
-  private$token_data
-},
-
-
-
-
-token_store =
-  list(
-    tok_hashed_text = character(0),
-    tok_hashed_data = character(0),
-    tok_hashed_call = character(0)
-  ),
-
+#### ---------------------------------------------------------------------------
 
-# get text line information
-text_lines = function(){
-  lengths <- nchar(self$text_get(split = "\n")) + 1
-  lengths[length(lengths)] <- lengths[length(lengths)] - 1
-  res <-
-    data.frame(
-      line_i = seq_along(lengths),
-      from = c(0, cumsum(lengths)[seq_len(length(lengths) - 1)]) + 1,
-      to = cumsum(lengths),
-      nchar = lengths
-    )
-  return(res)
-},
-text_lines_get = function(lines, nl = FALSE){
-  res <- character(length(lines))
-  lines <- self$text_lines()[lines, ]
-  from <- lines$from
-  to <- lines$to
-  for( i in seq_along(from) ){
-    res[i] <- self$text_get(from = from[i], to = to[i] - ifelse(!nl & from[i] < to[i], 1, 0))
-  }
-  return(res)
-},
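
For reference, the removed text_lines() method derives per-line character offsets from cumulative line lengths, where every line counts its trailing "\n" except the last. A standalone sketch of just that arithmetic, using a made-up three-line string:

# sketch of the offset arithmetic in the removed text_lines(); `txt` is made up
txt     <- "ab\ncdef\ng"
lengths <- nchar(strsplit(txt, "\n")[[1]]) + 1            # line length incl. "\n"
lengths[length(lengths)] <- lengths[length(lengths)] - 1  # last line has no "\n"
data.frame(
  line_i = seq_along(lengths),
  from   = c(0, cumsum(lengths)[seq_len(length(lengths) - 1)]) + 1,
  to     = cumsum(lengths),
  nchar  = lengths
)
#>   line_i from to nchar
#> 1      1    1  3     3
#> 2      2    4  8     5
#> 3      3    9  9     1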
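Similarly, the core of the removed tokenize_data() step is aggregating character-level annotations up to token level with stats::aggregate(). A toy sketch under stand-in assumptions: char_data and token are made up, findInterval() stands in for which_token(), and a simple most-frequent-value helper stands in for the modus() used in the diff:

# toy sketch of the char-to-token aggregation; all inputs below are invented
char_data <- data.frame(i = c(1, 2, 3, 5, 6), tag = c("a", "a", "b", "b", "b"))
token     <- data.frame(from = c(1, 4), to = c(3, 6))  # token 1: chars 1-3, token 2: chars 4-6
token_i   <- findInterval(char_data$i, token$from)     # token index per annotated char
most_frequent <- function(x) names(sort(table(x), decreasing = TRUE))[1]
stats::aggregate(char_data[, -1, drop = FALSE], by = list(token_i = token_i), FUN = most_frequent)
#>   token_i tag
#> 1       1   a
#> 2       2   b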