-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext-mining-nlp.html
More file actions
794 lines (755 loc) · 93.7 KB
/
text-mining-nlp.html
File metadata and controls
794 lines (755 loc) · 93.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<title>R User Notebook</title>
<meta name="description" content="A notebook of useful R code and concepts">
<meta name="generator" content="bookdown 0.7 and GitBook 2.6.7">
<meta property="og:title" content="R User Notebook" />
<meta property="og:type" content="book" />
<meta property="og:description" content="A notebook of useful R code and concepts" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="R User Notebook" />
<meta name="twitter:description" content="A notebook of useful R code and concepts" />
<meta name="author" content="Kieran Driscoll">
<meta name="date" content="2019-02-11">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<link rel="prev" href="machine-learning-predictive-analytics.html">
<link rel="next" href="shiny.html">
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<script src="libs/htmlwidgets-1.0/htmlwidgets.js"></script>
<link href="libs/datatables-css-0.0.0/datatables-crosstalk.css" rel="stylesheet" />
<script src="libs/datatables-binding-0.4/datatables.js"></script>
<link href="libs/dt-core-1.10.16/css/jquery.dataTables.min.css" rel="stylesheet" />
<link href="libs/dt-core-1.10.16/css/jquery.dataTables.extra.css" rel="stylesheet" />
<script src="libs/dt-core-1.10.16/js/jquery.dataTables.min.js"></script>
<link href="libs/crosstalk-1.0.0/css/crosstalk.css" rel="stylesheet" />
<script src="libs/crosstalk-1.0.0/js/crosstalk.min.js"></script>
<link href="libs/nouislider-7.0.10/jquery.nouislider.min.css" rel="stylesheet" />
<script src="libs/nouislider-7.0.10/jquery.nouislider.min.js"></script>
<link href="libs/selectize-0.12.0/selectize.bootstrap3.css" rel="stylesheet" />
<script src="libs/selectize-0.12.0/selectize.min.js"></script>
<link href="libs/dygraphs-1.1.1/dygraph.css" rel="stylesheet" />
<script src="libs/dygraphs-1.1.1/dygraph-combined.js"></script>
<script src="libs/moment-2.8.4/moment.js"></script>
<script src="libs/moment-timezone-0.2.5/moment-timezone-with-data.js"></script>
<script src="libs/moment-fquarter-1.0.0/moment-fquarter.min.js"></script>
<script src="libs/dygraphs-binding-1.1.1.4/dygraphs.js"></script>
<link href="libs/leaflet-1.3.1/leaflet.css" rel="stylesheet" />
<script src="libs/leaflet-1.3.1/leaflet.js"></script>
<link href="libs/leafletfix-1.0.0/leafletfix.css" rel="stylesheet" />
<script src="libs/Proj4Leaflet-1.0.1/proj4-compressed.js"></script>
<script src="libs/Proj4Leaflet-1.0.1/proj4leaflet.js"></script>
<link href="libs/rstudio_leaflet-1.3.1/rstudio_leaflet.css" rel="stylesheet" />
<script src="libs/leaflet-binding-2.0.2/leaflet.js"></script>
<script src="libs/d3-4.3.0/d3.min.js"></script>
<script src="libs/d3-lasso-0.0.5/d3-lasso.min.js"></script>
<link href="libs/ggiraph-0.3.0/styles.css" rel="stylesheet" />
<script src="libs/ggiraph-0.3.0/ggiraph_utils.js"></script>
<script src="libs/ggiraph-0.3.0/ggiraph_over_effect.js"></script>
<script src="libs/ggiraph-0.3.0/ggiraph_zoom.js"></script>
<script src="libs/ggiraph-0.3.0/ggiraph_tooltip.js"></script>
<script src="libs/ggiraph-0.3.0/ggiraph_selector.js"></script>
<script src="libs/ggiraph-binding-0.4.4/ggiraph.js"></script>
<script src="libs/svg_1-0.0.1/scripts_svg_1.js"></script>
<style type="text/css">
  /* Styles for highlighted code listings (.sourceCode / a.sourceLine markup). */

  /* Every source line is an inline-block anchor that ignores pointer events
     and inherits colour and decoration from the surrounding code. */
  a.sourceLine {
    display: inline-block;
    line-height: 1.25;
    pointer-events: none;
    color: inherit;
    text-decoration: inherit;
  }

  /* Empty source lines get an explicit height. */
  a.sourceLine:empty {
    height: 1.2em;
  }

  .sourceCode {
    overflow: visible;
  }

  code.sourceCode {
    white-space: pre;
    position: relative;
  }

  div.sourceCode {
    margin: 1em 0;
  }

  pre.sourceCode {
    margin: 0;
  }

  /* On screen the code container scrolls instead of overflowing. */
  @media screen {
    div.sourceCode {
      overflow: auto;
    }
  }

  /* In print, long lines wrap with a hanging indent. */
  @media print {
    code.sourceCode {
      white-space: pre-wrap;
    }

    a.sourceLine {
      text-indent: -1em;
      padding-left: 1em;
    }
  }

  /* Numbered listings: lines are shifted left and the number is rendered
     from the data-line-number attribute in a ::before pseudo-element. */
  pre.numberSource a.sourceLine {
    position: relative;
    left: -4em;
  }

  pre.numberSource a.sourceLine::before {
    content: attr(data-line-number);
    position: relative;
    left: -1em;
    text-align: right;
    vertical-align: baseline;
    border: none;
    pointer-events: all;
    display: inline-block;
    /* The line numbers are excluded from text selection. */
    -webkit-touch-callout: none;
    -webkit-user-select: none;
    -khtml-user-select: none;
    -moz-user-select: none;
    -ms-user-select: none;
    user-select: none;
    padding: 0 4px;
    width: 4em;
    color: #aaaaaa;
  }

  pre.numberSource {
    margin-left: 3em;
    border-left: 1px solid #aaaaaa;
    padding-left: 4px;
  }

  /* Empty rule kept as in the generated stylesheet. */
  div.sourceCode { }

  @media screen {
    a.sourceLine::before {
      text-decoration: underline;
    }
  }

  /* Token colours: one rule per highlighting class. */
  /* Alert */          code span.al { color: #ff0000; font-weight: bold; }
  /* Annotation */     code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
  /* Attribute */      code span.at { color: #7d9029; }
  /* BaseN */          code span.bn { color: #40a070; }
  /* BuiltIn */        code span.bu { }
  /* ControlFlow */    code span.cf { color: #007020; font-weight: bold; }
  /* Char */           code span.ch { color: #4070a0; }
  /* Constant */       code span.cn { color: #880000; }
  /* Comment */        code span.co { color: #60a0b0; font-style: italic; }
  /* CommentVar */     code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
  /* Documentation */  code span.do { color: #ba2121; font-style: italic; }
  /* DataType */       code span.dt { color: #902000; }
  /* DecVal */         code span.dv { color: #40a070; }
  /* Error */          code span.er { color: #ff0000; font-weight: bold; }
  /* Extension */      code span.ex { }
  /* Float */          code span.fl { color: #40a070; }
  /* Function */       code span.fu { color: #06287e; }
  /* Import */         code span.im { }
  /* Information */    code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
  /* Keyword */        code span.kw { color: #007020; font-weight: bold; }
  /* Operator */       code span.op { color: #666666; }
  /* Other */          code span.ot { color: #007020; }
  /* Preprocessor */   code span.pp { color: #bc7a00; }
  /* SpecialChar */    code span.sc { color: #4070a0; }
  /* SpecialString */  code span.ss { color: #bb6688; }
  /* String */         code span.st { color: #4070a0; }
  /* Variable */       code span.va { color: #19177c; }
  /* VerbatimString */ code span.vs { color: #4070a0; }
  /* Warning */        code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
</style>
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> R User Notebook</a></li>
<li class="chapter" data-level="2" data-path="basic-concepts.html"><a href="basic-concepts.html"><i class="fa fa-check"></i><b>2</b> Basic Concepts</a><ul>
<li class="chapter" data-level="2.1" data-path="basic-concepts.html"><a href="basic-concepts.html#vectors"><i class="fa fa-check"></i><b>2.1</b> Vectors</a></li>
<li class="chapter" data-level="2.2" data-path="basic-concepts.html"><a href="basic-concepts.html#matrices"><i class="fa fa-check"></i><b>2.2</b> Matrices</a></li>
<li class="chapter" data-level="2.3" data-path="basic-concepts.html"><a href="basic-concepts.html#dataframes"><i class="fa fa-check"></i><b>2.3</b> Dataframes</a></li>
<li class="chapter" data-level="2.4" data-path="basic-concepts.html"><a href="basic-concepts.html#tibbles"><i class="fa fa-check"></i><b>2.4</b> Tibbles</a></li>
<li class="chapter" data-level="2.5" data-path="basic-concepts.html"><a href="basic-concepts.html#extracting-data-from-dataframestibbles"><i class="fa fa-check"></i><b>2.5</b> Extracting data from Dataframes/Tibbles</a></li>
<li class="chapter" data-level="2.6" data-path="basic-concepts.html"><a href="basic-concepts.html#lists"><i class="fa fa-check"></i><b>2.6</b> Lists</a></li>
<li class="chapter" data-level="2.7" data-path="basic-concepts.html"><a href="basic-concepts.html#factors"><i class="fa fa-check"></i><b>2.7</b> Factors</a></li>
<li class="chapter" data-level="2.8" data-path="basic-concepts.html"><a href="basic-concepts.html#dates"><i class="fa fa-check"></i><b>2.8</b> Dates</a></li>
<li class="chapter" data-level="2.9" data-path="basic-concepts.html"><a href="basic-concepts.html#style-guide"><i class="fa fa-check"></i><b>2.9</b> Style Guide</a></li>
</ul></li>
<li class="chapter" data-level="3" data-path="importing-exporting.html"><a href="importing-exporting.html"><i class="fa fa-check"></i><b>3</b> Importing & Exporting</a><ul>
<li class="chapter" data-level="3.1" data-path="importing-exporting.html"><a href="importing-exporting.html#importing-with-base-r"><i class="fa fa-check"></i><b>3.1</b> Importing with Base R</a></li>
<li class="chapter" data-level="3.2" data-path="importing-exporting.html"><a href="importing-exporting.html#importing-with-other-packages"><i class="fa fa-check"></i><b>3.2</b> Importing with other packages</a><ul>
<li class="chapter" data-level="3.2.1" data-path="importing-exporting.html"><a href="importing-exporting.html#importing-a-csv"><i class="fa fa-check"></i><b>3.2.1</b> Importing a CSV</a></li>
<li class="chapter" data-level="3.2.2" data-path="importing-exporting.html"><a href="importing-exporting.html#importing-a-tsv"><i class="fa fa-check"></i><b>3.2.2</b> Importing a TSV</a></li>
<li class="chapter" data-level="3.2.3" data-path="importing-exporting.html"><a href="importing-exporting.html#importing-a-r-data-file"><i class="fa fa-check"></i><b>3.2.3</b> Importing an R data file</a></li>
<li class="chapter" data-level="3.2.4" data-path="importing-exporting.html"><a href="importing-exporting.html#importing-a-excel-file"><i class="fa fa-check"></i><b>3.2.4</b> Importing an Excel file</a></li>
<li class="chapter" data-level="3.2.5" data-path="importing-exporting.html"><a href="importing-exporting.html#importing-a-sas-file"><i class="fa fa-check"></i><b>3.2.5</b> Importing a SAS file</a></li>
<li class="chapter" data-level="3.2.6" data-path="importing-exporting.html"><a href="importing-exporting.html#importing-json-newline-delimited-json"><i class="fa fa-check"></i><b>3.2.6</b> Importing JSON (Newline Delimited JSON)</a></li>
</ul></li>
<li class="chapter" data-level="3.3" data-path="importing-exporting.html"><a href="importing-exporting.html#exporting"><i class="fa fa-check"></i><b>3.3</b> Exporting</a><ul>
<li class="chapter" data-level="3.3.1" data-path="importing-exporting.html"><a href="importing-exporting.html#write-to-an-r-dataset"><i class="fa fa-check"></i><b>3.3.1</b> Write to an R dataset</a></li>
<li class="chapter" data-level="3.3.2" data-path="importing-exporting.html"><a href="importing-exporting.html#write-to-an-csv-file"><i class="fa fa-check"></i><b>3.3.2</b> Write to a CSV file</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="4" data-path="manipulating-data.html"><a href="manipulating-data.html"><i class="fa fa-check"></i><b>4</b> Manipulating data</a><ul>
<li class="chapter" data-level="4.1" data-path="manipulating-data.html"><a href="manipulating-data.html#basic-data-wrangling"><i class="fa fa-check"></i><b>4.1</b> Basic data wrangling</a></li>
<li class="chapter" data-level="4.2" data-path="manipulating-data.html"><a href="manipulating-data.html#summary-statistics"><i class="fa fa-check"></i><b>4.2</b> Summary statistics</a></li>
<li class="chapter" data-level="4.3" data-path="manipulating-data.html"><a href="manipulating-data.html#conditional-statements"><i class="fa fa-check"></i><b>4.3</b> Conditional statements</a></li>
<li class="chapter" data-level="4.4" data-path="manipulating-data.html"><a href="manipulating-data.html#recode"><i class="fa fa-check"></i><b>4.4</b> Recode</a></li>
<li class="chapter" data-level="4.5" data-path="manipulating-data.html"><a href="manipulating-data.html#appending-columns-and-rows"><i class="fa fa-check"></i><b>4.5</b> Appending columns and rows</a></li>
<li class="chapter" data-level="4.6" data-path="manipulating-data.html"><a href="manipulating-data.html#joins"><i class="fa fa-check"></i><b>4.6</b> Joins</a></li>
<li class="chapter" data-level="4.7" data-path="manipulating-data.html"><a href="manipulating-data.html#tidy-data"><i class="fa fa-check"></i><b>4.7</b> Tidy Data</a></li>
<li class="chapter" data-level="4.8" data-path="manipulating-data.html"><a href="manipulating-data.html#tibbles-1"><i class="fa fa-check"></i><b>4.8</b> tibbles</a></li>
</ul></li>
<li class="chapter" data-level="5" data-path="functions.html"><a href="functions.html"><i class="fa fa-check"></i><b>5</b> Functions</a><ul>
<li class="chapter" data-level="5.1" data-path="functions.html"><a href="functions.html#function-basics---an-example-using-str"><i class="fa fa-check"></i><b>5.1</b> Function Basics - an example using str()</a></li>
<li class="chapter" data-level="5.2" data-path="functions.html"><a href="functions.html#common-base-functions"><i class="fa fa-check"></i><b>5.2</b> Common (base) functions</a><ul>
<li class="chapter" data-level="5.2.1" data-path="functions.html"><a href="functions.html#strings"><i class="fa fa-check"></i><b>5.2.1</b> Strings</a></li>
<li class="chapter" data-level="5.2.2" data-path="functions.html"><a href="functions.html#mathematical"><i class="fa fa-check"></i><b>5.2.2</b> Mathematical</a></li>
<li class="chapter" data-level="5.2.3" data-path="functions.html"><a href="functions.html#properties-lookups"><i class="fa fa-check"></i><b>5.2.3</b> Properties & Lookups</a></li>
<li class="chapter" data-level="5.2.4" data-path="functions.html"><a href="functions.html#transformation"><i class="fa fa-check"></i><b>5.2.4</b> Transformation</a></li>
<li class="chapter" data-level="5.2.5" data-path="functions.html"><a href="functions.html#other-useful-functions"><i class="fa fa-check"></i><b>5.2.5</b> Other useful functions</a></li>
</ul></li>
<li class="chapter" data-level="5.3" data-path="functions.html"><a href="functions.html#stringr---more-string-functions"><i class="fa fa-check"></i><b>5.3</b> Stringr - more string functions</a></li>
<li class="chapter" data-level="5.4" data-path="functions.html"><a href="functions.html#lubridate---more-date-functions"><i class="fa fa-check"></i><b>5.4</b> Lubridate - more date functions</a></li>
<li class="chapter" data-level="5.5" data-path="functions.html"><a href="functions.html#converting-data-to-other-formats"><i class="fa fa-check"></i><b>5.5</b> Converting data to other formats</a><ul>
<li class="chapter" data-level="5.5.1" data-path="functions.html"><a href="functions.html#json"><i class="fa fa-check"></i><b>5.5.1</b> JSON</a></li>
</ul></li>
<li class="chapter" data-level="5.6" data-path="functions.html"><a href="functions.html#user-defined-functions"><i class="fa fa-check"></i><b>5.6</b> User Defined Functions</a></li>
<li class="chapter" data-level="5.7" data-path="functions.html"><a href="functions.html#chainingpiping"><i class="fa fa-check"></i><b>5.7</b> Chaining/Piping</a></li>
</ul></li>
<li class="chapter" data-level="6" data-path="loops.html"><a href="loops.html"><i class="fa fa-check"></i><b>6</b> Loops</a><ul>
<li class="chapter" data-level="6.1" data-path="loops.html"><a href="loops.html#basic-for-loop-structure"><i class="fa fa-check"></i><b>6.1</b> Basic for Loop structure</a></li>
<li class="chapter" data-level="6.2" data-path="loops.html"><a href="loops.html#looping-through-a-vector"><i class="fa fa-check"></i><b>6.2</b> Looping through a vector</a></li>
<li class="chapter" data-level="6.3" data-path="loops.html"><a href="loops.html#creating-a-list-in-a-loop"><i class="fa fa-check"></i><b>6.3</b> Creating a List in a Loop</a></li>
<li class="chapter" data-level="6.4" data-path="loops.html"><a href="loops.html#looping-with-apply-functions"><i class="fa fa-check"></i><b>6.4</b> Looping with apply functions</a></li>
</ul></li>
<li class="chapter" data-level="7" data-path="tables.html"><a href="tables.html"><i class="fa fa-check"></i><b>7</b> Tables</a><ul>
<li class="chapter" data-level="7.1" data-path="tables.html"><a href="tables.html#displaying-tables---htmltable"><i class="fa fa-check"></i><b>7.1</b> Displaying tables - htmlTable</a></li>
<li class="chapter" data-level="7.2" data-path="tables.html"><a href="tables.html#displaying-interactive-tables---dt"><i class="fa fa-check"></i><b>7.2</b> Displaying Interactive tables - DT</a></li>
<li class="chapter" data-level="7.3" data-path="tables.html"><a href="tables.html#creating-tabulations"><i class="fa fa-check"></i><b>7.3</b> Creating Tabulations</a></li>
</ul></li>
<li class="chapter" data-level="8" data-path="charts.html"><a href="charts.html"><i class="fa fa-check"></i><b>8</b> Charts</a><ul>
<li class="chapter" data-level="8.0.1" data-path="charts.html"><a href="charts.html#static-charts-with-ggplot2"><i class="fa fa-check"></i><b>8.0.1</b> Static charts with ggplot2</a></li>
<li class="chapter" data-level="8.0.2" data-path="charts.html"><a href="charts.html#changing-axes"><i class="fa fa-check"></i><b>8.0.2</b> Changing Axes</a></li>
<li class="chapter" data-level="8.0.3" data-path="charts.html"><a href="charts.html#chart-themes-styles"><i class="fa fa-check"></i><b>8.0.3</b> Chart Themes & Styles</a></li>
<li class="chapter" data-level="8.0.4" data-path="charts.html"><a href="charts.html#line-charts-geom_line"><i class="fa fa-check"></i><b>8.0.4</b> Line Charts (geom_line)</a></li>
<li class="chapter" data-level="8.0.5" data-path="charts.html"><a href="charts.html#bar-charts-geom_bar-geom_histogram"><i class="fa fa-check"></i><b>8.0.5</b> Bar Charts (geom_bar & geom_histogram)</a></li>
<li class="chapter" data-level="8.0.6" data-path="charts.html"><a href="charts.html#adding-features"><i class="fa fa-check"></i><b>8.0.6</b> Adding features</a></li>
<li class="chapter" data-level="8.0.7" data-path="charts.html"><a href="charts.html#heatmaps"><i class="fa fa-check"></i><b>8.0.7</b> Heatmaps</a></li>
<li class="chapter" data-level="8.0.8" data-path="charts.html"><a href="charts.html#boxplots"><i class="fa fa-check"></i><b>8.0.8</b> Boxplots</a></li>
<li class="chapter" data-level="8.0.9" data-path="charts.html"><a href="charts.html#combination-charts"><i class="fa fa-check"></i><b>8.0.9</b> Combination Charts</a></li>
<li class="chapter" data-level="8.0.10" data-path="charts.html"><a href="charts.html#scales"><i class="fa fa-check"></i><b>8.0.10</b> Scales</a></li>
<li class="chapter" data-level="8.0.11" data-path="charts.html"><a href="charts.html#ggplot-wizard"><i class="fa fa-check"></i><b>8.0.11</b> ggplot wizard</a></li>
<li class="chapter" data-level="8.0.12" data-path="charts.html"><a href="charts.html#interactive-charts-with-ggiraph"><i class="fa fa-check"></i><b>8.0.12</b> Interactive charts with ggiraph</a></li>
<li class="chapter" data-level="8.0.13" data-path="charts.html"><a href="charts.html#interactive-charts-with-dygraph"><i class="fa fa-check"></i><b>8.0.13</b> Interactive charts with dygraph</a></li>
</ul></li>
<li class="chapter" data-level="9" data-path="maps.html"><a href="maps.html"><i class="fa fa-check"></i><b>9</b> Maps</a><ul>
<li class="chapter" data-level="9.1" data-path="maps.html"><a href="maps.html#leaflet"><i class="fa fa-check"></i><b>9.1</b> Leaflet</a><ul>
<li class="chapter" data-level="9.1.1" data-path="maps.html"><a href="maps.html#display-a-basic-map"><i class="fa fa-check"></i><b>9.1.1</b> Display a basic map</a></li>
<li class="chapter" data-level="9.1.2" data-path="maps.html"><a href="maps.html#add-markers-shapes-and-popups-to-a-map"><i class="fa fa-check"></i><b>9.1.2</b> Add markers, shapes and popups to a map</a></li>
<li class="chapter" data-level="9.1.3" data-path="maps.html"><a href="maps.html#add-boundaries"><i class="fa fa-check"></i><b>9.1.3</b> Add Boundaries</a></li>
<li class="chapter" data-level="9.1.4" data-path="maps.html"><a href="maps.html#add-interactivity"><i class="fa fa-check"></i><b>9.1.4</b> Add Interactivity</a></li>
<li class="chapter" data-level="9.1.5" data-path="maps.html"><a href="maps.html#attach-extra-information-to-your-boundary-dataset"><i class="fa fa-check"></i><b>9.1.5</b> Attach extra information to your boundary dataset</a></li>
<li class="chapter" data-level="9.1.6" data-path="maps.html"><a href="maps.html#add-colour-based-on-area-data"><i class="fa fa-check"></i><b>9.1.6</b> Add colour based on area data</a></li>
</ul></li>
<li class="chapter" data-level="9.2" data-path="maps.html"><a href="maps.html#inserting-html-javascript-directly-into-r"><i class="fa fa-check"></i><b>9.2</b> Inserting HTML & Javascript directly into R</a></li>
</ul></li>
<li class="chapter" data-level="10" data-path="rmarkdown-knitr.html"><a href="rmarkdown-knitr.html"><i class="fa fa-check"></i><b>10</b> RMarkdown & Knitr</a><ul>
<li class="chapter" data-level="10.1" data-path="rmarkdown-knitr.html"><a href="rmarkdown-knitr.html#knitr"><i class="fa fa-check"></i><b>10.1</b> Knitr</a></li>
<li class="chapter" data-level="10.2" data-path="rmarkdown-knitr.html"><a href="rmarkdown-knitr.html#yaml"><i class="fa fa-check"></i><b>10.2</b> YAML</a></li>
<li class="chapter" data-level="10.3" data-path="rmarkdown-knitr.html"><a href="rmarkdown-knitr.html#modular-coding"><i class="fa fa-check"></i><b>10.3</b> Modular coding</a></li>
</ul></li>
<li class="chapter" data-level="11" data-path="machine-learning-predictive-analytics.html"><a href="machine-learning-predictive-analytics.html"><i class="fa fa-check"></i><b>11</b> Machine Learning & Predictive Analytics</a><ul>
<li class="chapter" data-level="11.0.1" data-path="machine-learning-predictive-analytics.html"><a href="machine-learning-predictive-analytics.html#decision-trees"><i class="fa fa-check"></i><b>11.0.1</b> Decision Trees</a></li>
</ul></li>
<li class="chapter" data-level="12" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html"><i class="fa fa-check"></i><b>12</b> Text Mining & NLP</a><ul>
<li class="chapter" data-level="12.0.1" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#text-data"><i class="fa fa-check"></i><b>12.0.1</b> Text data</a></li>
<li class="chapter" data-level="12.0.2" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#tidy-text"><i class="fa fa-check"></i><b>12.0.2</b> Tidy text</a></li>
<li class="chapter" data-level="12.0.3" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#stemming"><i class="fa fa-check"></i><b>12.0.3</b> Stemming</a></li>
<li class="chapter" data-level="12.0.4" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#basic-text-stats"><i class="fa fa-check"></i><b>12.0.4</b> Basic text stats</a></li>
<li class="chapter" data-level="12.0.5" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#n-grams-analysis"><i class="fa fa-check"></i><b>12.0.5</b> N-Grams analysis</a></li>
<li class="chapter" data-level="12.0.6" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#text-preparation-with-tm"><i class="fa fa-check"></i><b>12.0.6</b> Text Preparation with <strong>tm</strong></a></li>
<li class="chapter" data-level="12.1" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#create-a-term-document-matrix"><i class="fa fa-check"></i><b>12.1</b> Create a Term Document Matrix</a><ul>
<li class="chapter" data-level="12.1.1" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#from-a-corpus-by-default-words-are-converted-to-lowercase-less-than-3-characters-are-excluded"><i class="fa fa-check"></i><b>12.1.1</b> From a Corpus (by default words are converted to lowercase and words of fewer than 3 characters are excluded)</a></li>
<li class="chapter" data-level="12.1.2" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#matrix-managment"><i class="fa fa-check"></i><b>12.1.2</b> Matrix Management</a></li>
</ul></li>
<li class="chapter" data-level="12.2" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#topic-modelling-latent-dirichlet-allocation-lda"><i class="fa fa-check"></i><b>12.2</b> Topic Modelling & Latent Dirichlet Allocation (LDA)</a></li>
<li class="chapter" data-level="12.3" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#clustering---similarity-between-topics"><i class="fa fa-check"></i><b>12.3</b> Clustering - Similarity between topics</a></li>
<li class="chapter" data-level="12.4" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#calculating-distance-between-objects"><i class="fa fa-check"></i><b>12.4</b> Calculating distance between objects</a></li>
<li class="chapter" data-level="12.5" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#non-euclidian-distances---jensen-shannon"><i class="fa fa-check"></i><b>12.5</b> Non-Euclidean Distances - Jensen-Shannon</a></li>
<li class="chapter" data-level="12.6" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#scaling---principal-components"><i class="fa fa-check"></i><b>12.6</b> Scaling - Principal Components</a></li>
<li class="chapter" data-level="12.7" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#eigen-vectors"><i class="fa fa-check"></i><b>12.7</b> Eigen vectors</a></li>
<li class="chapter" data-level="12.8" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#k-means"><i class="fa fa-check"></i><b>12.8</b> K-Means</a></li>
<li class="chapter" data-level="12.9" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#naive-bayes-classifiers"><i class="fa fa-check"></i><b>12.9</b> Naive Bayes Classifiers</a></li>
<li class="chapter" data-level="12.10" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#tf-idf-classifiers-supervised"><i class="fa fa-check"></i><b>12.10</b> TF-IDF Classifiers (Supervised)</a></li>
<li class="chapter" data-level="12.11" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#sentiment-analysis"><i class="fa fa-check"></i><b>12.11</b> Sentiment Analysis</a></li>
<li class="chapter" data-level="12.12" data-path="text-mining-nlp.html"><a href="text-mining-nlp.html#word-bubble"><i class="fa fa-check"></i><b>12.12</b> Word Bubble</a></li>
</ul></li>
<li class="chapter" data-level="13" data-path="shiny.html"><a href="shiny.html"><i class="fa fa-check"></i><b>13</b> Shiny</a><ul>
<li class="chapter" data-level="13.1" data-path="shiny.html"><a href="shiny.html#basic-shiny-structure"><i class="fa fa-check"></i><b>13.1</b> Basic Shiny Structure</a></li>
<li class="chapter" data-level="13.2" data-path="shiny.html"><a href="shiny.html#user-interface"><i class="fa fa-check"></i><b>13.2</b> User Interface</a><ul>
<li class="chapter" data-level="13.2.1" data-path="shiny.html"><a href="shiny.html#layout"><i class="fa fa-check"></i><b>13.2.1</b> Layout</a></li>
<li class="chapter" data-level="13.2.2" data-path="shiny.html"><a href="shiny.html#inputs"><i class="fa fa-check"></i><b>13.2.2</b> Inputs</a></li>
<li class="chapter" data-level="13.2.3" data-path="shiny.html"><a href="shiny.html#outputs"><i class="fa fa-check"></i><b>13.2.3</b> Outputs</a></li>
</ul></li>
<li class="chapter" data-level="13.3" data-path="shiny.html"><a href="shiny.html#server"><i class="fa fa-check"></i><b>13.3</b> Server</a><ul>
<li class="chapter" data-level="13.3.1" data-path="shiny.html"><a href="shiny.html#linking-to-ui-outputs"><i class="fa fa-check"></i><b>13.3.1</b> Linking to UI outputs</a></li>
<li class="chapter" data-level="13.3.2" data-path="shiny.html"><a href="shiny.html#linking-to-ui-inputs"><i class="fa fa-check"></i><b>13.3.2</b> Linking to UI inputs</a></li>
</ul></li>
<li class="chapter" data-level="13.4" data-path="shiny.html"><a href="shiny.html#shiny-dashboard"><i class="fa fa-check"></i><b>13.4</b> Shiny Dashboard</a></li>
</ul></li>
<li class="chapter" data-level="14" data-path="bookdown.html"><a href="bookdown.html"><i class="fa fa-check"></i><b>14</b> Bookdown</a><ul>
<li class="chapter" data-level="14.1" data-path="bookdown.html"><a href="bookdown.html#set-up"><i class="fa fa-check"></i><b>14.1</b> Set-up</a></li>
<li class="chapter" data-level="14.2" data-path="bookdown.html"><a href="bookdown.html#create-the-book"><i class="fa fa-check"></i><b>14.2</b> Create the book</a></li>
</ul></li>
<li class="chapter" data-level="15" data-path="creating-packages.html"><a href="creating-packages.html"><i class="fa fa-check"></i><b>15</b> Creating Packages</a><ul>
<li class="chapter" data-level="15.1" data-path="creating-packages.html"><a href="creating-packages.html#setup-folders"><i class="fa fa-check"></i><b>15.1</b> Setup Folders</a></li>
<li class="chapter" data-level="15.2" data-path="creating-packages.html"><a href="creating-packages.html#r-folder"><i class="fa fa-check"></i><b>15.2</b> R folder</a></li>
<li class="chapter" data-level="15.3" data-path="creating-packages.html"><a href="creating-packages.html#man-folder"><i class="fa fa-check"></i><b>15.3</b> man folder</a></li>
<li class="chapter" data-level="15.4" data-path="creating-packages.html"><a href="creating-packages.html#package-details"><i class="fa fa-check"></i><b>15.4</b> Package details</a></li>
<li class="chapter" data-level="15.5" data-path="creating-packages.html"><a href="creating-packages.html#package-dependencies"><i class="fa fa-check"></i><b>15.5</b> Package Dependencies</a></li>
<li class="chapter" data-level="15.6" data-path="creating-packages.html"><a href="creating-packages.html#build-the-package"><i class="fa fa-check"></i><b>15.6</b> Build the package</a></li>
<li class="chapter" data-level="15.7" data-path="creating-packages.html"><a href="creating-packages.html#making-changes"><i class="fa fa-check"></i><b>15.7</b> Making changes</a></li>
</ul></li>
<li class="chapter" data-level="16" data-path="git-version-control.html"><a href="git-version-control.html"><i class="fa fa-check"></i><b>16</b> Git - Version Control</a><ul>
<li class="chapter" data-level="16.1" data-path="git-version-control.html"><a href="git-version-control.html#set-up-1"><i class="fa fa-check"></i><b>16.1</b> Set-up</a></li>
<li class="chapter" data-level="16.2" data-path="git-version-control.html"><a href="git-version-control.html#link-git-to-a-github-account"><i class="fa fa-check"></i><b>16.2</b> Link Git to a GitHub account</a></li>
<li class="chapter" data-level="16.3" data-path="git-version-control.html"><a href="git-version-control.html#starting-a-project-with-github"><i class="fa fa-check"></i><b>16.3</b> Starting a project with Github</a></li>
<li class="chapter" data-level="16.4" data-path="git-version-control.html"><a href="git-version-control.html#hosting-a-site-on-github"><i class="fa fa-check"></i><b>16.4</b> Hosting a site on Github</a></li>
</ul></li>
<li class="chapter" data-level="17" data-path="other-languages.html"><a href="other-languages.html"><i class="fa fa-check"></i><b>17</b> Other Languages</a><ul>
<li class="chapter" data-level="17.1" data-path="other-languages.html"><a href="other-languages.html#sql"><i class="fa fa-check"></i><b>17.1</b> SQL</a></li>
<li class="chapter" data-level="17.2" data-path="other-languages.html"><a href="other-languages.html#python"><i class="fa fa-check"></i><b>17.2</b> Python</a></li>
</ul></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">R User Notebook</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="text-mining-nlp" class="section level1">
<h1><span class="header-section-number">12</span> Text Mining & NLP</h1>
<p>There are various types of analysis you can do with text data, such as n-grams, sentiment analysis, and topic modelling. Various packages are available; the main ones are <strong>tm</strong> and <strong>NLP</strong>.</p>
<div id="text-data" class="section level3">
<h3><span class="header-section-number">12.0.1</span> Text data</h3>
<p>The source data for your text may come in various formats, for example a single string or a dataframe. Ideally you will put these into a tidy format</p>
<div class="sourceCode" id="cb171"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb171-1" data-line-number="1"><span class="co"># Text data as a list of strings </span></a>
<a class="sourceLine" id="cb171-2" data-line-number="2">textdoc <-<span class="st"> </span><span class="kw">c</span>(<span class="st">"Once upon a time"</span>, <span class="st">"in a galaxy far far away"</span>, <span class="st">"On a dark and stormy night"</span>)</a>
<a class="sourceLine" id="cb171-3" data-line-number="3"></a>
<a class="sourceLine" id="cb171-4" data-line-number="4"><span class="co"># Text data as a tibble/dataframe </span></a>
<a class="sourceLine" id="cb171-5" data-line-number="5">textdoc <-<span class="st"> </span><span class="kw">tibble</span>(<span class="st">'line'</span>=<span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">2</span>,<span class="dv">3</span>),</a>
<a class="sourceLine" id="cb171-6" data-line-number="6"> <span class="st">'text'</span>=<span class="kw">c</span>(<span class="st">"Once upon a time"</span>, <span class="st">"in a galaxy far far away."</span>, <span class="st">"on a dark and stormy night"</span>))</a></code></pre></div>
</div>
<div id="tidy-text" class="section level3">
<h3><span class="header-section-number">12.0.2</span> Tidy text</h3>
<p>Ideally you will convert your text data into a tidy format using the <strong>tidytext</strong> package.</p>
<div class="sourceCode" id="cb172"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb172-1" data-line-number="1"><span class="kw">library</span>(tidytext)</a></code></pre></div>
<pre><code>## Warning: package 'tidytext' was built under R version 3.4.4</code></pre>
<div class="sourceCode" id="cb174"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb174-1" data-line-number="1">textdoc <span class="op">%>%</span><span class="st"> </span><span class="kw">unnest_tokens</span>(<span class="dt">input=</span>text, <span class="dt">output=</span>word, <span class="dt">token=</span><span class="st">"words"</span>, <span class="dt">to_lower =</span> <span class="ot">TRUE</span>) -><span class="st"> </span>tidytext</a>
<a class="sourceLine" id="cb174-2" data-line-number="2"><span class="co"># This splits up your text into 'tokens'.</span></a>
<a class="sourceLine" id="cb174-3" data-line-number="3"><span class="co"># By default a token is a word, but other options include "characters", "sentences","ngrams".</span></a>
<a class="sourceLine" id="cb174-4" data-line-number="4"><span class="co"># By default all text will be converted to lowercase, and punctuation (eg .,!?£$&) will be removed.</span></a>
<a class="sourceLine" id="cb174-5" data-line-number="5"><span class="co"># Numbers are not removed.</span></a></code></pre></div>
<p>tidytext includes a stop_words tibble, which contains stopwords sourced from SMART, snowball, and onix.
You can remove these stopwords from your tidy data by doing an anti join, or create your own custom stopword tibble.</p>
<div class="sourceCode" id="cb175"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb175-1" data-line-number="1"><span class="kw">library</span>(dplyr)</a>
<a class="sourceLine" id="cb175-2" data-line-number="2">stop_words</a></code></pre></div>
<pre><code>## # A tibble: 1,149 x 2
## word lexicon
## <chr> <chr>
## 1 a SMART
## 2 a's SMART
## 3 able SMART
## 4 about SMART
## 5 above SMART
## 6 according SMART
## 7 accordingly SMART
## 8 across SMART
## 9 actually SMART
## 10 after SMART
## # ... with 1,139 more rows</code></pre>
<div class="sourceCode" id="cb177"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb177-1" data-line-number="1"><span class="co"># Remove all common stopwords from your data</span></a>
<a class="sourceLine" id="cb177-2" data-line-number="2">tidytext <span class="op">%>%</span><span class="st"> </span><span class="kw">anti_join</span>(stop_words) -><span class="st"> </span>tidytext2</a></code></pre></div>
<pre><code>## Joining, by = "word"</code></pre>
<div class="sourceCode" id="cb179"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb179-1" data-line-number="1"><span class="co"># Remove stopwords from a particular source (eg. snowball) from your data</span></a>
<a class="sourceLine" id="cb179-2" data-line-number="2">tidytext <span class="op">%>%</span><span class="st"> </span><span class="kw">anti_join</span>(<span class="kw">filter</span>(stop_words, lexicon<span class="op">==</span><span class="st">"snowball"</span>)) -><span class="st"> </span>tidytext2</a></code></pre></div>
<pre><code>## Joining, by = "word"</code></pre>
<div class="sourceCode" id="cb181"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb181-1" data-line-number="1"><span class="co"># Custom stopwords</span></a>
<a class="sourceLine" id="cb181-2" data-line-number="2">custom_sw <-<span class="st"> </span><span class="kw">tibble</span>(<span class="dt">word=</span><span class="kw">c</span>(<span class="st">"a"</span>,<span class="st">"in"</span>,<span class="st">"on"</span>,<span class="st">"and"</span>))</a>
<a class="sourceLine" id="cb181-3" data-line-number="3">tidytext <span class="op">%>%</span><span class="st"> </span><span class="kw">anti_join</span>(custom_sw) -><span class="st"> </span>tidytext2</a></code></pre></div>
<pre><code>## Joining, by = "word"</code></pre>
<div class="sourceCode" id="cb183"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb183-1" data-line-number="1"><span class="co"># Nb. the tm package also includes a stopword function with the same list of words </span></a></code></pre></div>
</div>
<div id="stemming" class="section level3">
<h3><span class="header-section-number">12.0.3</span> Stemming</h3>
<div class="sourceCode" id="cb184"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb184-1" data-line-number="1"><span class="kw">library</span>(SnowballC)</a>
<a class="sourceLine" id="cb184-2" data-line-number="2">tidytext2 <-<span class="st"> </span>tidytext2 <span class="op">%>%</span><span class="st"> </span><span class="kw">mutate</span>(<span class="dt">word_stem =</span> <span class="kw">wordStem</span>(word, <span class="dt">language=</span><span class="st">"english"</span>))</a></code></pre></div>
</div>
<div id="basic-text-stats" class="section level3">
<h3><span class="header-section-number">12.0.4</span> Basic text stats</h3>
<p>If the text data is in a tidy format you can easily use dplyr to manipulate it, for example to produce word frequencies:</p>
<div class="sourceCode" id="cb185"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb185-1" data-line-number="1">tidytext <span class="op">%>%</span><span class="st"> </span><span class="kw">count</span>(word, <span class="dt">sort =</span> <span class="ot">TRUE</span>) <span class="co"># Produces token frequency using dplyr count() function</span></a></code></pre></div>
<pre><code>## # A tibble: 13 x 2
## word n
## <chr> <int>
## 1 a 3
## 2 far 2
## 3 and 1
## 4 away 1
## 5 dark 1
## 6 galaxy 1
## 7 in 1
## 8 night 1
## 9 on 1
## 10 once 1
## 11 stormy 1
## 12 time 1
## 13 upon 1</code></pre>
</div>
<div id="n-grams-analysis" class="section level3">
<h3><span class="header-section-number">12.0.5</span> N-Grams analysis</h3>
<p>To look at neighbouring words you need to use the unnest_tokens function again:</p>
<div class="sourceCode" id="cb187"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb187-1" data-line-number="1">textdoc <span class="op">%>%</span><span class="st"> </span><span class="kw">unnest_tokens</span>(<span class="dt">input=</span>text, <span class="dt">output=</span>ngram, <span class="dt">token=</span><span class="st">"ngrams"</span>, <span class="dt">n=</span><span class="dv">2</span>) -><span class="st"> </span>ngramdoc</a>
<a class="sourceLine" id="cb187-2" data-line-number="2"><span class="co"># All text will be converted to lowercase, and punctuation (eg .,!?£$&) will be removed, but numbers are not removed.</span></a>
<a class="sourceLine" id="cb187-3" data-line-number="3"></a>
<a class="sourceLine" id="cb187-4" data-line-number="4"><span class="co">#N-gram frequency</span></a>
<a class="sourceLine" id="cb187-5" data-line-number="5">ngramdoc <span class="op">%>%</span><span class="st"> </span><span class="kw">count</span>(ngram, <span class="dt">sort =</span> <span class="ot">TRUE</span>) </a></code></pre></div>
<pre><code>## # A tibble: 13 x 2
## ngram n
## <chr> <int>
## 1 a dark 1
## 2 a galaxy 1
## 3 a time 1
## 4 and stormy 1
## 5 dark and 1
## 6 far away 1
## 7 far far 1
## 8 galaxy far 1
## 9 in a 1
## 10 on a 1
## 11 once upon 1
## 12 stormy night 1
## 13 upon a 1</code></pre>
</div>
<div id="text-preparation-with-tm" class="section level3">
<h3><span class="header-section-number">12.0.6</span> Text Preparation with <strong>tm</strong></h3>
<p>An alternative way of working with text data is to use the <strong>tm</strong> package. This includes text cleaning functions.<br />
This package uses a data structure called a Corpus.</p>
<div class="sourceCode" id="cb189"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb189-1" data-line-number="1"><span class="kw">library</span>(tm) <span class="co"># Also loads NLP</span></a></code></pre></div>
<pre><code>## Warning: package 'tm' was built under R version 3.4.4</code></pre>
<pre><code>## Loading required package: NLP</code></pre>
<pre><code>##
## Attaching package: 'NLP'</code></pre>
<pre><code>## The following object is masked from 'package:ggplot2':
##
## annotate</code></pre>
<div class="sourceCode" id="cb194"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb194-1" data-line-number="1"><span class="co">#Convert your text data into a Corpus</span></a>
<a class="sourceLine" id="cb194-2" data-line-number="2">myCorpus <-<span class="st"> </span><span class="kw">VCorpus</span>(<span class="kw">VectorSource</span>(textdoc))</a>
<a class="sourceLine" id="cb194-3" data-line-number="3"></a>
<a class="sourceLine" id="cb194-4" data-line-number="4"><span class="co"># You can clean your corpus using the tm_map() function. This has various options:</span></a>
<a class="sourceLine" id="cb194-5" data-line-number="5"><span class="kw">tm_map</span>(myCorpus, <span class="kw">content_transformer</span>(tolower)) <span class="co"># Changes text to lowercase</span></a></code></pre></div>
<pre><code>## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2</code></pre>
<div class="sourceCode" id="cb196"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb196-1" data-line-number="1"><span class="kw">tm_map</span>(myCorpus, removePunctuation) <span class="co"># Removes all punctuations [.,':;] from your Corpus</span></a></code></pre></div>
<pre><code>## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2</code></pre>
<div class="sourceCode" id="cb198"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb198-1" data-line-number="1"><span class="kw">tm_map</span>(myCorpus, removeNumbers) <span class="co"># Removes any numbers from your Corpus</span></a></code></pre></div>
<pre><code>## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2</code></pre>
<div class="sourceCode" id="cb200"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb200-1" data-line-number="1"><span class="kw">tm_map</span>(myCorpus, stripWhitespace) <span class="co"># Removes multiple whitespace</span></a></code></pre></div>
<pre><code>## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2</code></pre>
<div class="sourceCode" id="cb202"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb202-1" data-line-number="1"><span class="kw">tm_map</span>(myCorpus, removeWords, <span class="kw">c</span>(<span class="st">"i"</span>, <span class="st">"a"</span>, <span class="st">"is"</span>, <span class="st">"the"</span>, <span class="st">"and"</span>, <span class="st">"but"</span>) ) <span class="co"># Removes custom stopwords</span></a></code></pre></div>
<pre><code>## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2</code></pre>
<div class="sourceCode" id="cb204"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb204-1" data-line-number="1"><span class="co"># A stopwords() function is available, and can be added to the list above, for example</span></a>
<a class="sourceLine" id="cb204-2" data-line-number="2"><span class="kw">tm_map</span>(myCorpus, removeWords, <span class="kw">c</span>(<span class="kw">stopwords</span>(<span class="st">"en"</span>)) ) <span class="co"># Removes stopwords in the snowball list</span></a></code></pre></div>
<pre><code>## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2</code></pre>
</div>
<div id="create-a-term-document-matrix" class="section level2">
<h2><span class="header-section-number">12.1</span> Create a Term Document Matrix</h2>
<p>Transforming text data into a matrix allows you to do further modelling such as LDA, Naive Bayes, regression.</p>
<h3>From tidy data</h3>
<div class="sourceCode" id="cb206"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb206-1" data-line-number="1"><span class="co"># First, the tidy data needs to be summarised so that it contains the count of each token per document.</span></a>
<a class="sourceLine" id="cb206-2" data-line-number="2">tidytext <span class="op">%>%</span><span class="st"> </span><span class="kw">count</span>(line, word, <span class="dt">sort =</span> <span class="ot">TRUE</span>) -><span class="st"> </span>tidytext_count</a></code></pre></div>
<div class="sourceCode" id="cb207"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb207-1" data-line-number="1"><span class="co"># Now use the cast_dtm function to convert this into a Document Term Matrix.</span></a>
<a class="sourceLine" id="cb207-2" data-line-number="2">tidytext_count <span class="op">%>%</span><span class="st"> </span><span class="kw">cast_dtm</span>(line, word, n) -><span class="st"> </span>myDTM</a></code></pre></div>
<pre><code>## Warning: Trying to compute distinct() for variables not found in the data:
## - `row_col`, `column_col`
## This is an error, but only a warning is raised for compatibility reasons.
## The operation will return the input unchanged.</code></pre>
<div class="sourceCode" id="cb209"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb209-1" data-line-number="1"><span class="kw">inspect</span>(myDTM)</a></code></pre></div>
<pre><code>## <<DocumentTermMatrix (documents: 3, terms: 13)>>
## Non-/sparse entries: 15/24
## Sparsity : 62%
## Maximal term length: 6
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs a and away dark far galaxy in once time upon
## 1 1 0 0 0 0 0 0 1 1 1
## 2 1 0 1 0 2 1 1 0 0 0
## 3 1 1 0 1 0 0 0 0 0 0</code></pre>
<div id="from-a-corpus-by-default-words-are-converted-to-lowercase-less-than-3-characters-are-excluded" class="section level3">
<h3><span class="header-section-number">12.1.1</span> From a Corpus (by default words are converted to lowercase and words with fewer than 3 characters are excluded)</h3>
<div class="sourceCode" id="cb211"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb211-1" data-line-number="1">myDTM <-<span class="st"> </span>tm<span class="op">::</span><span class="kw">DocumentTermMatrix</span>(myCorpus)</a>
<a class="sourceLine" id="cb211-2" data-line-number="2"></a>
<a class="sourceLine" id="cb211-3" data-line-number="3"><span class="co">## By default words will be converted to lowercase, and words with <3 characters are removed. To alter defaults, you can specify a control list.</span></a>
<a class="sourceLine" id="cb211-4" data-line-number="4"></a>
<a class="sourceLine" id="cb211-5" data-line-number="5">myDTM <-<span class="st"> </span>tm<span class="op">::</span><span class="kw">DocumentTermMatrix</span>(myCorpus, <span class="dt">control =</span> <span class="kw">list</span>(<span class="dt">tolower=</span><span class="ot">FALSE</span>,</a>
<a class="sourceLine" id="cb211-6" data-line-number="6"> <span class="dt">wordLengths=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="ot">Inf</span>),</a>
<a class="sourceLine" id="cb211-7" data-line-number="7"> <span class="dt">removePunctuation=</span><span class="ot">FALSE</span></a>
<a class="sourceLine" id="cb211-8" data-line-number="8"> ))</a></code></pre></div>
</div>
<div id="matrix-managment" class="section level3">
<h3><span class="header-section-number">12.1.2</span> Matrix Management</h3>
<p>To perform calculations on very large matrices you may need to use the <em>slam</em> package.</p>
<div class="sourceCode" id="cb212"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb212-1" data-line-number="1"><span class="co">## Sum rows of a matrix</span></a>
<a class="sourceLine" id="cb212-2" data-line-number="2">slam<span class="op">::</span><span class="kw">row_sums</span>(myDTM)</a></code></pre></div>
<pre><code>## 1 2
## 3 16</code></pre>
<div class="sourceCode" id="cb214"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb214-1" data-line-number="1"><span class="co">## Sum columns of a matrix</span></a>
<a class="sourceLine" id="cb214-2" data-line-number="2">slam<span class="op">::</span><span class="kw">col_sums</span>(myDTM)</a></code></pre></div>
<pre><code>## 1 2 3 a and away. dark far galaxy in
## 1 1 1 3 1 1 1 2 1 1
## night on Once stormy time upon
## 1 1 1 1 1 1</code></pre>
</div>
</div>
<div id="topic-modelling-latent-dirichlet-allocation-lda" class="section level2">
<h2><span class="header-section-number">12.2</span> Topic Modelling & Latent Dirichlet Allocation (LDA)</h2>
<p>Topic Modelling is a common method for discovering topics from text, such as comments.
Most topic modelling techniques, such as LDA (Latent Dirichlet Allocation), require you to choose the number of topics, and will then use an algorithm to create the topics.</p>
<ul>
<li>You must interpret what these topics mean</li>
<li>Some words are equally likely to appear across topics; so a word like “account” could appear in both topic lists.</li>
</ul>
<p>You can build an unsupervised LDA model using a DocumentTermMatrix</p>
<div class="sourceCode" id="cb216"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb216-1" data-line-number="1"><span class="kw">library</span>(topicmodels) <span class="co"># Not available #</span></a>
<a class="sourceLine" id="cb216-2" data-line-number="2"></a>
<a class="sourceLine" id="cb216-3" data-line-number="3">my_lda <-<span class="st"> </span><span class="kw">LDA</span>(myDTM, <span class="dt">k=</span><span class="dv">4</span>, <span class="dt">control =</span> <span class="kw">list</span>(<span class="dt">seed=</span><span class="dv">87533</span>)) <span class="co"># k is the number of topics you want. </span></a>
<a class="sourceLine" id="cb216-4" data-line-number="4"> </a>
<a class="sourceLine" id="cb216-5" data-line-number="5"><span class="co"># Option 1 : using tidytext to examine topic probabilities</span></a>
<a class="sourceLine" id="cb216-6" data-line-number="6">topics_beta <-<span class="st"> </span><span class="kw">tidy</span>(my_lda, <span class="dt">matrix=</span><span class="st">"beta"</span>) <span class="co"># beta represents word/topic probabilities</span></a>
<a class="sourceLine" id="cb216-7" data-line-number="7">topicstats <-<span class="st"> </span>topics_beta <span class="op">%>%</span><span class="st"> </span><span class="kw">group_by</span>(topic) <span class="op">%>%</span><span class="st"> </span><span class="kw">top_n</span>(<span class="dv">10</span>,beta) <span class="op">%>%</span><span class="st"> </span><span class="kw">ungroup</span>() <span class="op">%>%</span><span class="st"> </span><span class="kw">arrange</span>(topic, <span class="op">-</span>beta) <span class="co">#Top terms for each topic</span></a>
<a class="sourceLine" id="cb216-8" data-line-number="8"></a>
<a class="sourceLine" id="cb216-9" data-line-number="9"></a>
<a class="sourceLine" id="cb216-10" data-line-number="10">topics_gamma <-<span class="st"> </span><span class="kw">tidy</span>(my_lda, <span class="dt">matrix=</span><span class="st">"gamma"</span>) <span class="co"># gamma represents document/topic probabilities</span></a>
<a class="sourceLine" id="cb216-11" data-line-number="11">classification <-<span class="st"> </span>topics_gamma <span class="op">%>%</span><span class="st"> </span><span class="kw">group_by</span>(document) <span class="op">%>%</span><span class="st"> </span><span class="kw">top_n</span>(<span class="dv">1</span>,gamma) <span class="op">%>%</span><span class="st"> </span><span class="kw">ungroup</span>() <span class="co"># Most likely topic for each document</span></a>
<a class="sourceLine" id="cb216-12" data-line-number="12"></a>
<a class="sourceLine" id="cb216-13" data-line-number="13"></a>
<a class="sourceLine" id="cb216-14" data-line-number="14"><span class="co"># Merge back to original document</span></a>
<a class="sourceLine" id="cb216-15" data-line-number="15">classification <-<span class="st"> </span><span class="kw">mutate</span>(classification, <span class="dt">id=</span><span class="kw">as.numeric</span>(document))</a>
<a class="sourceLine" id="cb216-16" data-line-number="16">final <-<span class="st"> </span><span class="kw">left_join</span>(textdoc, classification, <span class="dt">by=</span><span class="st">"id"</span>)</a></code></pre></div>
<p>Initially, each word from each document is randomly assigned to a topic. <em>Gibbs sampling</em> is used to re-assign topics, which involves taking each document in turn, and calculating the % of words that are currently assigned to each topic (eg. 12%/24%/36%/28% from topic A/B/C/D). It also looks at each word in the document, and calculates how often that word appears in each topic (eg. 2.5%/2.1%/1.7%/0.5% of topic A/B/C/D). These two sets of percentages are multiplied together, and are used as weights to randomly re-assign a word to a new topic. This process is repeated for every word, at least 2000 times.
[NB. During this process the word being assessed is temporarily removed from all calculations.]</p>
<p><em>LDA Implications</em><br />
A word is more likely to be re-assigned to another topic if lots of neighbouring words already belong to it, or if another topic has a higher concentration of that word.
A word is more likely to keep its existing topic if it is part of the majority topic within its document, or because the word is spread evenly across topics.
Words that only appear once in the corpus shouldn’t have a significant effect on the creation of topics. Relatively uncommon words (that appear 2-3 times in the corpus) should be assigned the same topic quite quickly.</p>
<p><em>LDA terms</em><br />
<em>phi</em> : the likelihood that a word appears in a topic (ie. the frequency of ‘w’ in a topic, divided by frequency of ‘w’ across the corpus).<br />
<em>theta</em> : the proportion of words in a document that were assigned to each topic (nb. alpha has been added, so 0% is not possible).<br />
<em>alpha</em> : a small smoothing constant added to the topic counts when calculating theta, so that no topic has a probability of exactly 0%.</p>
<p>The LDA function stores results in the following attributes :<br />
<span class="citation">@n</span> : The total number of words in the corpus<br />
<span class="citation">@terms</span> : A simple list of all the distinct words in the corpus<br />
<span class="citation">@beta</span> : A table (words x topics) containing the <em>log</em> of phi<br />
<span class="citation">@gamma</span> : A table (documents x topics) containing theta</p>
<p>More complex analysis of LDA including graph</p>
<div class="sourceCode" id="cb217"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb217-1" data-line-number="1">topicstats <span class="op">%>%</span><span class="st"> </span><span class="kw">mutate</span>(<span class="dt">term =</span> <span class="kw">reorder</span>(term, beta)) <span class="op">%>%</span><span class="st"> </span></a>
<a class="sourceLine" id="cb217-2" data-line-number="2"><span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(term, beta, <span class="dt">fill =</span> <span class="kw">factor</span>(topic))) <span class="op">+</span><span class="st"> </span><span class="co"># plot beta by theme</span></a>
<a class="sourceLine" id="cb217-3" data-line-number="3"><span class="st"> </span><span class="kw">geom_col</span>(<span class="dt">show.legend =</span> <span class="ot">FALSE</span>) <span class="op">+</span><span class="st"> </span><span class="co"># as a bar plot</span></a>
<a class="sourceLine" id="cb217-4" data-line-number="4"><span class="st">  </span><span class="kw">facet_wrap</span>(<span class="op">~</span><span class="st"> </span>topic, <span class="dt">scales =</span> <span class="st">"free"</span>) <span class="op">+</span><span class="st"> </span><span class="co"># with each topic in a separate plot</span></a>
<a class="sourceLine" id="cb217-5" data-line-number="5"><span class="st"> </span><span class="kw">labs</span>(<span class="dt">x =</span> <span class="ot">NULL</span>, <span class="dt">y =</span> <span class="st">"Beta"</span>) <span class="op">+</span><span class="st"> </span><span class="co"># no x label, change y label </span></a>
<a class="sourceLine" id="cb217-6" data-line-number="6"><span class="st"> </span><span class="kw">coord_flip</span>() <span class="co"># turn bars sideways</span></a></code></pre></div>
<p>Since topic models are usually unsupervised, it can be difficult to assess how effective/reliable they are.
The topics generated by LDA may not make much sense if most comments:</p>
<ul>
<li>contain few words</li>
<li>contain too many words covering multiple themes</li>
<li>are too general/generic/non-specific</li>
</ul>
<p>LDA can be sensitive, so could produce very different results depending on the number of topics or when re-running with additional data.</p>
</div>
<div id="clustering---similarity-between-topics" class="section level2">
<h2><span class="header-section-number">12.3</span> Clustering - Similarity between topics</h2>
<p>Objects with multiple features/dimensions (such as comments) can be grouped together based on how similar they are.
First we need to measure the <em>distances</em> between all the objects.</p>
</div>
<div id="calculating-distance-between-objects" class="section level2">
<h2><span class="header-section-number">12.4</span> Calculating distance between objects</h2>
<p>The example below shows 3 simple objects (A,B,C) with x and y coordinates.</p>
<table>
<thead>
<tr class="header">
<th>Obs</th>
<th>x</th>
<th>y</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>A</td>
<td>1</td>
<td>1</td>
</tr>
<tr class="even">
<td>B</td>
<td>4</td>
<td>1</td>
</tr>
<tr class="odd">
<td>C</td>
<td>4</td>
<td>5</td>
</tr>
</tbody>
</table>
<div class="sourceCode" id="cb218"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb218-1" data-line-number="1">twoDimensions <-<span class="st"> </span>(<span class="kw">data.frame</span>(<span class="dt">x=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">4</span>,<span class="dv">4</span>),<span class="dt">y=</span><span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">1</span>,<span class="dv">5</span>)))</a>
<a class="sourceLine" id="cb218-2" data-line-number="2"><span class="kw">plot</span>(twoDimensions)</a></code></pre></div>
<p><img src="_main_files/figure-html4/unnamed-chunk-113-1.png" alt="Scatter plot of the three points A (1,1), B (4,1) and C (4,5)" width="480" /></p>
<p>For simple 2D objects you can calculate the standard (Euclidean) distance using trigonometry. This can be done with the <strong>dist()</strong> function.</p>
<div class="sourceCode" id="cb219"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb219-1" data-line-number="1">proxy<span class="op">::</span><span class="kw">dist</span>(<span class="dt">x =</span> twoDimensions) <span class="co"># Euclidean distance method by default</span></a></code></pre></div>
<pre><code>## 1 2
## 2 3
## 3 5 4</code></pre>
<p>This produces a matrix showing the distance between each object:</p>
<ul>
<li>Between 1(A) and 2(B) the distance is 3<br />
</li>
<li>Between 1(A) and 3(C) the distance is 5<br />
</li>
<li>Between 2(B) and 3(C) the distance is 4</li>
</ul>
</div>
<div id="non-euclidian-distances---jensen-shannon" class="section level2">
<h2><span class="header-section-number">12.5</span> Non-Euclidean Distances - Jensen Shannon</h2>
<p>The topics generated by a topic model consist of thousands of words (dimensions) as shown in the <strong>phi</strong> matrix. We can also use <strong>dist()</strong> to calculate the distance between <em>multi-dimensional</em> objects, but you may want to use a different method to calculate distance.</p>
<p>Since <strong>phi</strong> contains probability distributions, a divergence measure such as Kullback-Leibler or Jensen-Shannon can be used (see <a href="https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence">https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence</a>). These compare the relative values of <strong>phi</strong> for each element of a topic (e.g. the relative likelihood of 'tax' appearing in Topic 1 versus Topic 2), and compute an overall difference.</p>
<div class="sourceCode" id="cb221"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb221-1" data-line-number="1">distanceMatrix <-<span class="st"> </span>proxy<span class="op">::</span><span class="kw">dist</span>(<span class="dt">x =</span> phi, <span class="dt">method=</span><span class="st">"Kullback"</span>)</a>
<a class="sourceLine" id="cb221-2" data-line-number="2">distanceMatrix</a></code></pre></div>
<p>This produces a matrix showing the distance between each Topic:</p>
<ul>
<li>Between Topic 1 and 2 the distance is 2.038<br />
</li>
<li>Between Topic 1 and 3 the distance is 2.297<br />
</li>
<li>Between Topic 2 and 3 the distance is 1.817</li>
</ul>
<p>N.B. The units of distance may not be meaningful.</p>
<p><strong>This means that Topics that share a similar word (phi) distribution will be closer to one another.</strong></p>
<p>The Jensen-Shannon divergence is similar to Kullback-Leibler; however, it compares each distribution to the average of the two rather than directly to the other (i.e. P vs (P+Q)/2 rather than P vs Q). This is meant to mitigate the effects of noise in the data.</p>
<pre class="sourceCode r"><code class="sourceCode r">jsPCA &lt;- function(phi) {
  # first, we compute a pairwise distance between topic distributions
  # using a symmetric version of KL-divergence
  # <a href="http://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence">http://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence</a>
  jensenShannon &lt;- function(x, y) {
    m &lt;- 0.5 * (x + y)
    lhs &lt;- ifelse(x == 0, 0, x * (log(x) - log(m)))
    rhs &lt;- ifelse(y == 0, 0, y * (log(y) - log(m)))
    0.5 * sum(lhs) + 0.5 * sum(rhs)
  }
  # ... (the rest of jsPCA is not shown in this excerpt)
}</code></pre>
</div>
<div id="scaling---principal-components" class="section level2">
<h2><span class="header-section-number">12.6</span> Scaling - Principal Components</h2>
<p>Once you have calculated the distances between the objects, the result is still multi-dimensional (a K by K distance matrix for K objects). You can use Principal Components Analysis (multi-dimensional scaling) to reduce this, e.g. to 2 or 3 dimensions. This will make the data more manageable and suitable for visualisation.</p>
<div class="sourceCode" id="cb222"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb222-1" data-line-number="1"><span class="co"># Multidimensional Scaling - reduces the K by K proximity matrix down to K by 2 components</span></a>
<a class="sourceLine" id="cb222-2" data-line-number="2">pca <-<span class="st"> </span>stats<span class="op">::</span><span class="kw">cmdscale</span>(distanceMatrix, <span class="dt">k =</span> <span class="dv">2</span>, <span class="dt">eig=</span><span class="ot">TRUE</span>)</a>
<a class="sourceLine" id="cb222-3" data-line-number="3">pca<span class="op">$</span>points</a>
<a class="sourceLine" id="cb222-4" data-line-number="4"><span class="kw">plot</span>(pca<span class="op">$</span>points)</a></code></pre></div>
<p>The coordinates produced by the scaling are not very meaningful, however the distance between objects (as calculated previously) will be preserved as much as possible.</p>
</div>
<div id="eigen-vectors" class="section level2">
<h2><span class="header-section-number">12.7</span> Eigen vectors</h2>
</div>
<div id="k-means" class="section level2">
<h2><span class="header-section-number">12.8</span> K-Means</h2>
<p>K-Means Clustering requires a DTM.</p>
<div class="sourceCode" id="cb223"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb223-1" data-line-number="1"><span class="kw">kmeans</span>(DTM, <span class="co"># The Document Term Matrix</span></a>
<a class="sourceLine" id="cb223-2" data-line-number="2"> <span class="dv">10</span>, <span class="co"># The number of clusters</span></a>
<a class="sourceLine" id="cb223-3" data-line-number="3"> <span class="dt">iter.max =</span> <span class="dv">10</span>, <span class="co"># Maximum number of iterations allowed</span></a>
<a class="sourceLine" id="cb223-4" data-line-number="4"> <span class="dt">nstart =</span> <span class="dv">3</span>, <span class="co"># Number of random starting sets of centres to try</span></a>
<a class="sourceLine" id="cb223-5" data-line-number="5"> <span class="dt">trace =</span> <span class="ot">TRUE</span>) <span class="co"># Print tracing information on the algorithm's progress</span></a></code></pre></div>
<p>The kmeans function stores results in the following attributes :<br />
$cluster : The cluster assigned to each document<br />
$centers : The position of the cluster centre</p>
</div>
<div id="naive-bayes-classifiers" class="section level2">
<h2><span class="header-section-number">12.9</span> Naive Bayes Classifiers</h2>
<p><code>library(e1071)</code></p>
</div>
<div id="tf-idf-classifiers-supervised" class="section level2">
<h2><span class="header-section-number">12.10</span> TF-IDF Classifiers (Supervised)</h2>
</div>
<div id="sentiment-analysis" class="section level2">
<h2><span class="header-section-number">12.11</span> Sentiment Analysis</h2>
</div>
<div id="word-bubble" class="section level2">
<h2><span class="header-section-number">12.12</span> Word Bubble</h2>
<p>To create a word bubble visualisation we can use <strong>ggplot2</strong> and the <strong>packcircles</strong> package, which decides the size and position of bubbles</p>
<div class="sourceCode" id="cb224"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb224-1" data-line-number="1">mytext <-<span class="st"> </span><span class="kw">tibble</span>(<span class="st">'Comment'</span>=<span class="kw">c</span>(<span class="st">'very good, easy to use once setup, couldnt do what i wanted, very quick way of cahnging details. it took only a few minutes to do what i needed. not all my information was correct. lots of confusing information, really good. quick and easy'</span>) )</a>
<a class="sourceLine" id="cb224-2" data-line-number="2"></a>
<a class="sourceLine" id="cb224-3" data-line-number="3"><span class="co"># Use tidytext to extract all the words and remove stopwords</span></a>
<a class="sourceLine" id="cb224-4" data-line-number="4"><span class="kw">library</span>(tidytext)</a>
<a class="sourceLine" id="cb224-5" data-line-number="5">mytext <span class="op">%>%</span><span class="st"> </span><span class="kw">unnest_tokens</span>(<span class="dt">input=</span>Comment, <span class="dt">output=</span>word, <span class="dt">token=</span><span class="st">"words"</span>, <span class="dt">to_lower =</span> <span class="ot">TRUE</span>) -><span class="st"> </span>tidytext</a>
<a class="sourceLine" id="cb224-6" data-line-number="6">tidytext <span class="op">%>%</span><span class="st"> </span><span class="kw">anti_join</span>(<span class="kw">filter</span>(stop_words, lexicon<span class="op">==</span><span class="st">"snowball"</span>)) -><span class="st"> </span>tidytext2</a></code></pre></div>
<pre><code>## Joining, by = "word"</code></pre>
<div class="sourceCode" id="cb226"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb226-1" data-line-number="1"><span class="co"># Calculate word frequency</span></a>
<a class="sourceLine" id="cb226-2" data-line-number="2">tidytext3 <-<span class="st"> </span><span class="kw">group_by</span>(tidytext2, word) <span class="op">%>%</span><span class="st"> </span><span class="kw">summarise</span>(<span class="dt">freq=</span><span class="kw">n</span>())</a>
<a class="sourceLine" id="cb226-3" data-line-number="3"></a>
<a class="sourceLine" id="cb226-4" data-line-number="4"><span class="co"># The packcircles package decides how to arrange a group of circles, automatically calculating their size and coordinates</span></a>
<a class="sourceLine" id="cb226-5" data-line-number="5"><span class="kw">library</span>(packcircles)</a></code></pre></div>
<pre><code>## Warning: package 'packcircles' was built under R version 3.4.4</code></pre>
<div class="sourceCode" id="cb228"><pre class="sourceCode r"><code class="sourceCode r"><a class="sourceLine" id="cb228-1" data-line-number="1">circles <-<span class="st"> </span><span class="kw">circleProgressiveLayout</span>(tidytext3<span class="op">$</span>freq, <span class="dt">sizetype=</span><span class="st">'area'</span>) <span class="co"># Circle size is proportional to frequency</span></a>
<a class="sourceLine" id="cb228-2" data-line-number="2"></a>
<a class="sourceLine" id="cb228-3" data-line-number="3"><span class="co"># Add coordinates back to list of words </span></a>
<a class="sourceLine" id="cb228-4" data-line-number="4">tidytext3 =<span class="st"> </span><span class="kw">cbind</span>(tidytext3, circles)</a>
<a class="sourceLine" id="cb228-5" data-line-number="5"></a>
<a class="sourceLine" id="cb228-6" data-line-number="6"><span class="co"># Produce vertices so that the circles can be constructed</span></a>
<a class="sourceLine" id="cb228-7" data-line-number="7">circles2 <-<span class="st"> </span><span class="kw">circleLayoutVertices</span>(circles, <span class="dt">npoints=</span><span class="dv">40</span>) <span class="co"># Option to choose how many vertices - more means better drawn circle</span></a>
<a class="sourceLine" id="cb228-8" data-line-number="8"></a>
<a class="sourceLine" id="cb228-9" data-line-number="9"></a>
<a class="sourceLine" id="cb228-10" data-line-number="10"><span class="co"># Plot circles using ggplot2 & ggiraph</span></a>
<a class="sourceLine" id="cb228-11" data-line-number="11"><span class="kw">library</span>(ggiraph)</a>
<a class="sourceLine" id="cb228-12" data-line-number="12"><span class="kw">library</span>(ggplot2)</a>
<a class="sourceLine" id="cb228-13" data-line-number="13">mybc <-<span class="st"> </span><span class="kw">ggplot</span>() <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb228-14" data-line-number="14"><span class="st"> </span><span class="kw">geom_polygon_interactive</span>(<span class="dt">data =</span> circles2, <span class="kw">aes</span>(x, y, <span class="dt">group =</span> id, <span class="dt">data_id =</span> id), <span class="dt">alpha =</span> <span class="fl">0.6</span>) <span class="op">+</span></a>
<a class="sourceLine" id="cb228-15" data-line-number="15"><span class="st"> </span><span class="kw">scale_fill_manual</span>(<span class="dt">values=</span><span class="st">"steelblue"</span>) <span class="op">+</span></a>
<a class="sourceLine" id="cb228-16" data-line-number="16"><span class="st"> </span><span class="kw">geom_text</span>(<span class="dt">data =</span> tidytext3, <span class="kw">aes</span>(x, y, <span class="dt">size =</span> freq, <span class="dt">label =</span> word)) <span class="op">+</span></a>
<a class="sourceLine" id="cb228-17" data-line-number="17"><span class="st"> </span><span class="kw">scale_size_continuous</span>(<span class="dt">range =</span> <span class="kw">c</span>(<span class="dv">1</span>,<span class="dv">13</span>)) <span class="op">+</span></a>
<a class="sourceLine" id="cb228-18" data-line-number="18"><span class="st"> </span><span class="kw">theme_void</span>() <span class="op">+</span><span class="st"> </span></a>
<a class="sourceLine" id="cb228-19" data-line-number="19"><span class="st"> </span><span class="kw">theme</span>(<span class="dt">legend.position=</span><span class="st">"none"</span>) <span class="op">+</span></a>
<a class="sourceLine" id="cb228-20" data-line-number="20"><span class="st"> </span><span class="kw">coord_equal</span>()</a>
<a class="sourceLine" id="cb228-21" data-line-number="21"></a>
<a class="sourceLine" id="cb228-22" data-line-number="22"><span class="kw">ggiraph</span>(<span class="dt">ggobj =</span> mybc, <span class="dt">width_svg =</span> <span class="dv">12</span>, <span class="dt">height_svg =</span> <span class="dv">12</span>)</a></code></pre></div>
<pre><code>## Warning: package 'gdtools' was built under R version 3.4.4</code></pre>
<div id="htmlwidget-8e8a0e070a2bb9754baa" style="width:672px;height:480px;" class="ggiraph html-widget"></div>
<script type="application/json" data-for="htmlwidget-8e8a0e070a2bb9754baa">{"x":{"html":"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" id=\"svg_1\" viewBox=\"0 0 864.00 864.00\">\n <g>\n <defs>\n <clipPath id=\"cl1_0\">\n <rect x=\"0.00\" y=\"864.00\" width=\"0.00\" height=\"0.00\"/>\n <\/clipPath>\n <\/defs>\n <rect x=\"0.00\" y=\"0.00\" width=\"864.00\" height=\"864.00\" id=\"1\" clip-path=\"url(#cl1_0)\" fill=\"#FFFFFF\" fill-opacity=\"1\" stroke-width=\"0.75\" stroke=\"#FFFFFF\" stroke-opacity=\"1\" stroke-linejoin=\"round\" stroke-linecap=\"round\"/>\n <defs>\n <clipPath id=\"cl1_1\">\n <rect x=\"0.00\" y=\"0.00\" width=\"864.00\" height=\"864.00\"/>\n <\/clipPath>\n <\/defs>\n <defs>\n <clipPath id=\"cl1_2\">\n <rect x=\"0.00\" y=\"17.46\" width=\"864.00\" height=\"829.09\"/>\n <\/clipPath>\n <\/defs>\n <polygon points=\"442.66,444.94 441.83,434.42 439.36,424.15 435.32,414.40 429.81,405.40 422.95,397.37 414.92,390.51 405.92,384.99 396.16,380.95 385.90,378.49 375.37,377.66 364.84,378.49 354.58,380.95 344.82,384.99 335.82,390.51 327.79,397.37 320.94,405.40 315.42,414.40 311.38,424.15 308.91,434.42 308.08,444.94 308.91,455.47 311.38,465.74 315.42,475.49 320.94,484.49 327.79,492.52 335.82,499.38 344.82,504.90 354.58,508.94 364.84,511.40 375.37,512.23 385.90,511.40 396.16,508.94 405.92,504.90 414.92,499.38 422.95,492.52 429.81,484.49 435.32,475.49 439.36,465.74 441.83,455.47 442.66,444.94\" id=\"2\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"577.23,444.94 576.40,434.42 573.94,424.15 569.89,414.40 564.38,405.40 557.52,397.37 549.49,390.51 540.49,384.99 530.74,380.95 520.47,378.49 509.94,377.66 499.42,378.49 489.15,380.95 479.40,384.99 470.39,390.51 462.36,397.37 455.51,405.40 449.99,414.40 445.95,424.15 443.48,434.42 442.66,444.94 443.48,455.47 445.95,465.74 449.99,475.49 455.51,484.49 462.36,492.52 470.39,499.38 
479.40,504.90 489.15,508.94 499.42,511.40 509.94,512.23 520.47,511.40 530.74,508.94 540.49,504.90 549.49,499.38 557.52,492.52 564.38,484.49 569.89,475.49 573.94,465.74 576.40,455.47 577.23,444.94\" id=\"3\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"509.94,561.49 509.11,550.96 506.65,540.70 502.61,530.94 497.09,521.94 490.23,513.91 482.21,507.05 473.20,501.54 463.45,497.49 453.18,495.03 442.66,494.20 432.13,495.03 421.86,497.49 412.11,501.54 403.11,507.05 395.08,513.91 388.22,521.94 382.70,530.94 378.66,540.70 376.20,550.96 375.37,561.49 376.20,572.01 378.66,582.28 382.70,592.03 388.22,601.04 395.08,609.07 403.11,615.92 412.11,621.44 421.86,625.48 432.13,627.95 442.66,628.77 453.18,627.95 463.45,625.48 473.20,621.44 482.21,615.92 490.23,609.07 497.09,601.04 502.61,592.03 506.65,582.28 509.11,572.01 509.94,561.49\" id=\"4\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"375.37,561.49 374.54,550.96 372.08,540.70 368.04,530.94 362.52,521.94 355.66,513.91 347.63,507.05 338.63,501.54 328.88,497.49 318.61,495.03 308.08,494.20 297.56,495.03 287.29,497.49 277.54,501.54 268.53,507.05 260.51,513.91 253.65,521.94 248.13,530.94 244.09,540.70 241.63,550.96 240.80,561.49 241.63,572.01 244.09,582.28 248.13,592.03 253.65,601.04 260.51,609.07 268.53,615.92 277.54,621.44 287.29,625.48 297.56,627.95 308.08,628.77 318.61,627.95 328.88,625.48 338.63,621.44 347.63,615.92 355.66,609.07 362.52,601.04 368.04,592.03 372.08,582.28 374.54,572.01 375.37,561.49\" id=\"5\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"308.08,444.94 307.26,434.42 304.79,424.15 300.75,414.40 295.23,405.40 288.38,397.37 280.35,390.51 271.35,384.99 261.59,380.95 251.32,378.49 240.80,377.66 230.27,378.49 220.01,380.95 210.25,384.99 201.25,390.51 193.22,397.37 186.36,405.40 180.85,414.40 176.81,424.15 174.34,434.42 173.51,444.94 174.34,455.47 
176.81,465.74 180.85,475.49 186.36,484.49 193.22,492.52 201.25,499.38 210.25,504.90 220.01,508.94 230.27,511.40 240.80,512.23 251.32,511.40 261.59,508.94 271.35,504.90 280.35,499.38 288.38,492.52 295.23,484.49 300.75,475.49 304.79,465.74 307.26,455.47 308.08,444.94\" id=\"6\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"403.24,297.09 402.07,282.21 398.58,267.69 392.87,253.89 385.07,241.16 375.37,229.81 364.02,220.11 351.28,212.31 337.49,206.59 322.97,203.11 308.08,201.94 293.20,203.11 278.68,206.59 264.88,212.31 252.15,220.11 240.80,229.81 231.10,241.16 223.30,253.89 217.59,267.69 214.10,282.21 212.93,297.09 214.10,311.98 217.59,326.50 223.30,340.29 231.10,353.02 240.80,364.38 252.15,374.08 264.88,381.88 278.68,387.59 293.20,391.08 308.08,392.25 322.97,391.08 337.49,387.59 351.28,381.88 364.02,374.08 375.37,364.38 385.07,353.02 392.87,340.29 398.58,326.50 402.07,311.98 403.24,297.09\" id=\"7\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"593.03,282.95 591.86,268.07 588.37,253.55 582.66,239.75 574.86,227.02 565.16,215.67 553.80,205.97 541.07,198.17 527.28,192.45 512.76,188.97 497.87,187.79 482.99,188.97 468.47,192.45 454.67,198.17 441.94,205.97 430.59,215.67 420.89,227.02 413.09,239.75 407.37,253.55 403.89,268.07 402.72,282.95 403.89,297.84 407.37,312.36 413.09,326.15 420.89,338.88 430.59,350.24 441.94,359.93 454.67,367.74 468.47,373.45 482.99,376.94 497.87,378.11 512.76,376.94 527.28,373.45 541.07,367.74 553.80,359.93 565.16,350.24 574.86,338.88 582.66,326.15 588.37,312.36 591.86,297.84 593.03,282.95\" id=\"8\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"755.13,382.66 753.96,367.78 750.47,353.26 744.76,339.46 736.96,326.73 727.26,315.38 715.90,305.68 703.17,297.88 689.38,292.17 674.86,288.68 659.97,287.51 645.09,288.68 630.57,292.17 616.77,297.88 604.04,305.68 592.69,315.38 582.99,326.73 
575.19,339.46 569.47,353.26 565.99,367.78 564.82,382.66 565.99,397.55 569.47,412.07 575.19,425.87 582.99,438.60 592.69,449.95 604.04,459.65 616.77,467.45 630.57,473.16 645.09,476.65 659.97,477.82 674.86,476.65 689.38,473.16 703.17,467.45 715.90,459.65 727.26,449.95 736.96,438.60 744.76,425.87 750.47,412.07 753.96,397.55 755.13,382.66\" id=\"9\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"675.67,536.70 674.84,526.17 672.38,515.91 668.34,506.15 662.82,497.15 655.96,489.12 647.93,482.26 638.93,476.75 629.18,472.71 618.91,470.24 608.38,469.41 597.86,470.24 587.59,472.71 577.84,476.75 568.84,482.26 560.81,489.12 553.95,497.15 548.43,506.15 544.39,515.91 541.93,526.17 541.10,536.70 541.93,547.22 544.39,557.49 548.43,567.25 553.95,576.25 560.81,584.28 568.84,591.13 577.84,596.65 587.59,600.69 597.86,603.16 608.38,603.98 618.91,603.16 629.18,600.69 638.93,596.65 647.93,591.13 655.96,584.28 662.82,576.25 668.34,567.25 672.38,557.49 674.84,547.22 675.67,536.70\" id=\"10\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"608.38,653.24 607.56,642.72 605.09,632.45 601.05,622.69 595.53,613.69 588.68,605.66 580.65,598.81 571.65,593.29 561.89,589.25 551.62,586.78 541.10,585.96 530.57,586.78 520.31,589.25 510.55,593.29 501.55,598.81 493.52,605.66 486.66,613.69 481.15,622.69 477.11,632.45 474.64,642.72 473.81,653.24 474.64,663.77 477.11,674.03 481.15,683.79 486.66,692.79 493.52,700.82 501.55,707.68 510.55,713.19 520.31,717.23 530.57,719.70 541.10,720.53 551.62,719.70 561.89,717.23 571.65,713.19 580.65,707.68 588.68,700.82 595.53,692.79 601.05,683.79 605.09,674.03 607.56,663.77 608.38,653.24\" id=\"11\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"479.70,692.62 478.87,682.09 476.41,671.83 472.37,662.07 466.85,653.07 459.99,645.04 451.97,638.18 442.96,632.67 433.21,628.63 422.94,626.16 412.42,625.33 401.89,626.16 
391.62,628.63 381.87,632.67 372.87,638.18 364.84,645.04 357.98,653.07 352.46,662.07 348.42,671.83 345.96,682.09 345.13,692.62 345.96,703.14 348.42,713.41 352.46,723.17 357.98,732.17 364.84,740.20 372.87,747.05 381.87,752.57 391.62,756.61 401.89,759.08 412.42,759.90 422.94,759.08 433.21,756.61 442.96,752.57 451.97,747.05 459.99,740.20 466.85,732.17 472.37,723.17 476.41,713.41 478.87,703.14 479.70,692.62\" id=\"12\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"346.50,713.70 345.33,698.81 341.85,684.29 336.13,670.50 328.33,657.77 318.63,646.41 307.28,636.72 294.55,628.91 280.75,623.20 266.23,619.71 251.35,618.54 236.46,619.71 221.94,623.20 208.15,628.91 195.42,636.72 184.06,646.41 174.36,657.77 166.56,670.50 160.85,684.29 157.36,698.81 156.19,713.70 157.36,728.59 160.85,743.11 166.56,756.90 174.36,769.63 184.06,780.99 195.42,790.68 208.15,798.49 221.94,804.20 236.46,807.69 251.35,808.86 266.23,807.69 280.75,804.20 294.55,798.49 307.28,790.68 318.63,780.99 328.33,769.63 336.13,756.90 341.85,743.11 345.33,728.59 346.50,713.70\" id=\"13\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"460.48,158.73 459.65,148.21 457.19,137.94 453.15,128.18 447.63,119.18 440.77,111.15 432.75,104.30 423.74,98.78 413.99,94.74 403.72,92.27 393.20,91.45 382.67,92.27 372.40,94.74 362.65,98.78 353.65,104.30 345.62,111.15 338.76,119.18 333.24,128.18 329.20,137.94 326.74,148.21 325.91,158.73 326.74,169.26 329.20,179.52 333.24,189.28 338.76,198.28 345.62,206.31 353.65,213.17 362.65,218.68 372.40,222.72 382.67,225.19 393.20,226.02 403.72,225.19 413.99,222.72 423.74,218.68 432.75,213.17 440.77,206.31 447.63,198.28 453.15,189.28 457.19,179.52 459.65,169.26 460.48,158.73\" id=\"14\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"590.06,122.43 589.24,111.90 586.77,101.64 582.73,91.88 577.21,82.88 570.36,74.85 562.33,67.99 553.33,62.48 
543.57,58.44 533.30,55.97 522.78,55.14 512.25,55.97 501.99,58.44 492.23,62.48 483.23,67.99 475.20,74.85 468.34,82.88 462.83,91.88 458.79,101.64 456.32,111.90 455.49,122.43 456.32,132.96 458.79,143.22 462.83,152.98 468.34,161.98 475.20,170.01 483.23,176.86 492.23,182.38 501.99,186.42 512.25,188.89 522.78,189.72 533.30,188.89 543.57,186.42 553.33,182.38 562.33,176.86 570.36,170.01 577.21,161.98 582.73,152.98 586.77,143.22 589.24,132.96 590.06,122.43\" id=\"15\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"702.55,196.29 701.73,185.77 699.26,175.50 695.22,165.74 689.70,156.74 682.85,148.71 674.82,141.86 665.82,136.34 656.06,132.30 645.79,129.83 635.27,129.01 624.74,129.83 614.48,132.30 604.72,136.34 595.72,141.86 587.69,148.71 580.83,156.74 575.32,165.74 571.28,175.50 568.81,185.77 567.98,196.29 568.81,206.82 571.28,217.08 575.32,226.84 580.83,235.84 587.69,243.87 595.72,250.73 604.72,256.24 614.48,260.28 624.74,262.75 635.27,263.58 645.79,262.75 656.06,260.28 665.82,256.24 674.82,250.73 682.85,243.87 689.70,235.84 695.22,226.84 699.26,217.08 701.73,206.82 702.55,196.29\" id=\"16\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"824.73,252.71 823.90,242.19 821.43,231.92 817.39,222.17 811.88,213.16 805.02,205.13 796.99,198.28 787.99,192.76 778.23,188.72 767.97,186.26 757.44,185.43 746.92,186.26 736.65,188.72 726.89,192.76 717.89,198.28 709.86,205.13 703.01,213.16 697.49,222.17 693.45,231.92 690.98,242.19 690.16,252.71 690.98,263.24 693.45,273.51 697.49,283.26 703.01,292.26 709.86,300.29 717.89,307.15 726.89,312.67 736.65,316.71 746.92,319.17 757.44,320.00 767.97,319.17 778.23,316.71 787.99,312.67 796.99,307.15 805.02,300.29 811.88,292.26 817.39,283.26 821.43,273.51 823.90,263.24 824.73,252.71\" id=\"17\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"241.13,570.94 240.30,560.41 237.84,550.15 233.80,540.39 
228.28,531.39 221.42,523.36 213.39,516.50 204.39,510.99 194.64,506.95 184.37,504.48 173.84,503.65 163.32,504.48 153.05,506.95 143.30,510.99 134.30,516.50 126.27,523.36 119.41,531.39 113.89,540.39 109.85,550.15 107.39,560.41 106.56,570.94 107.39,581.46 109.85,591.73 113.89,601.49 119.41,610.49 126.27,618.52 134.30,625.37 143.30,630.89 153.05,634.93 163.32,637.40 173.84,638.22 184.37,637.40 194.64,634.93 204.39,630.89 213.39,625.37 221.42,618.52 228.28,610.49 233.80,601.49 237.84,591.73 240.30,581.46 241.13,570.94\" id=\"18\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <polygon points=\"173.84,454.40 173.02,443.87 170.55,433.60 166.51,423.85 160.99,414.85 154.14,406.82 146.11,399.96 137.11,394.44 127.35,390.40 117.08,387.94 106.56,387.11 96.03,387.94 85.77,390.40 76.01,394.44 67.01,399.96 58.98,406.82 52.12,414.85 46.61,423.85 42.57,433.60 40.10,443.87 39.27,454.40 40.10,464.92 42.57,475.19 46.61,484.94 52.12,493.95 58.98,501.97 67.01,508.83 76.01,514.35 85.77,518.39 96.03,520.85 106.56,521.68 117.08,520.85 127.35,518.39 137.11,514.35 146.11,508.83 154.14,501.97 160.99,493.95 166.51,484.94 170.55,475.19 173.02,464.92 173.84,454.40\" id=\"19\" clip-path=\"url(#cl1_2)\" fill=\"#333333\" fill-opacity=\"0.6\" stroke=\"none\"/>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"369.28\" y=\"446.02\" id=\"20\" font-size=\"2.13pt\" font-family=\"Arial\">cahnging<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"503.52\" y=\"446.02\" id=\"21\" font-size=\"2.13pt\" font-family=\"Arial\">confusing<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"438.07\" y=\"562.56\" id=\"22\" font-size=\"2.13pt\" font-family=\"Arial\">correct<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"303.25\" y=\"562.56\" id=\"23\" font-size=\"2.13pt\" font-family=\"Arial\">couldnt<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"236.46\" y=\"446.02\" id=\"24\" font-size=\"2.13pt\" font-family=\"Arial\">details<\/text>\n 
<\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"269.01\" y=\"310.33\" id=\"25\" font-size=\"27.74pt\" font-family=\"Arial\">easy<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"456.72\" y=\"296.19\" id=\"26\" font-size=\"27.74pt\" font-family=\"Arial\">good<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"568.46\" y=\"395.91\" id=\"27\" font-size=\"27.74pt\" font-family=\"Arial\">information<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"606.05\" y=\"537.77\" id=\"28\" font-size=\"2.13pt\" font-family=\"Arial\">lots<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"535.85\" y=\"654.31\" id=\"29\" font-size=\"2.13pt\" font-family=\"Arial\">minutes<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"407.41\" y=\"693.69\" id=\"30\" font-size=\"2.13pt\" font-family=\"Arial\">needed<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"208.16\" y=\"726.94\" id=\"31\" font-size=\"27.74pt\" font-family=\"Arial\">quick<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"389.61\" y=\"159.80\" id=\"32\" font-size=\"2.13pt\" font-family=\"Arial\">really<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"519.11\" y=\"123.50\" id=\"33\" font-size=\"2.13pt\" font-family=\"Arial\">setup<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"632.43\" y=\"197.36\" id=\"34\" font-size=\"2.13pt\" font-family=\"Arial\">took<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"755.02\" y=\"253.78\" id=\"35\" font-size=\"2.13pt\" font-family=\"Arial\">use<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"169.01\" y=\"572.01\" id=\"36\" font-size=\"2.13pt\" font-family=\"Arial\">wanted<\/text>\n <\/g>\n <g clip-path=\"url(#cl1_2)\">\n <text x=\"103.89\" y=\"455.47\" id=\"37\" font-size=\"2.13pt\" font-family=\"Arial\">way<\/text>\n <\/g>\n <defs>\n <clipPath id=\"cl1_3\">\n <rect x=\"0.00\" y=\"0.00\" width=\"864.00\" height=\"864.00\"/>\n <\/clipPath>\n <\/defs>\n 
<\/g>\n<\/svg>\n","css":".tooltip_svg_1 {position:absolute;pointer-events:none;z-index:999;padding:5px;background:black;color:white;border-radius:2px 2px 2px 2px;}\n.hover_svg_1{fill:orange;stroke:gray;}\n.clicked_svg_1{fill:orange;stroke:gray;}","ui_html":"<div class='ggiraph-toolbar'><div class='ggiraph-toolbar-block shinyonly'><a class='ggiraph-toolbar-icon neutral' title='lasso selection' href='javascript:lasso_on(\"svg_1\", true, \"array_selected_svg_1\", \"clicked_svg_1\");'><svg width='15pt' height='15pt' viewBox='0 0 230 230'><g><ellipse ry='65.5' rx='86.5' cy='94' cx='115.5' stroke-width='20' fill='transparent'/><ellipse ry='11.500001' rx='10.5' cy='153' cx='91.5' stroke-width='20' fill='transparent'/><line y2='210.5' x2='105' y1='164.5' x1='96' stroke-width='20'/><\/g><\/svg><\/a><a class='ggiraph-toolbar-icon drop' title='lasso anti-selection' href='javascript:lasso_on(\"svg_1\", false, \"array_selected_svg_1\", \"clicked_svg_1\");'><svg width='15pt' height='15pt' viewBox='0 0 230 230'><g><ellipse ry='65.5' rx='86.5' cy='94' cx='115.5' stroke-width='20' fill='transparent'/><ellipse ry='11.500001' rx='10.5' cy='153' cx='91.5' stroke-width='20' fill='transparent'/><line y2='210.5' x2='105' y1='164.5' x1='96' stroke-width='20'/><\/g><\/svg><\/a><\/div><\/div>","uid":"svg_1","width":"75%","funname":"init_prop_svg_1","sel_array_name":"array_selected_svg_1","selected_class":"clicked_svg_1","tooltip_opacity":0.9,"tooltip_offx":10,"tooltip_offy":0,"zoom_max":1,"selection_type":"multiple"},"evals":[],"jsHooks":[]}</script>
</div>
</div>
</section>
</div>
</div>
</div>
<a href="machine-learning-predictive-analytics.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
<a href="shiny.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
</div>
</div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script>
// Start the GitBook front end once its core "gitbook" module has been loaded.
gitbook.require(["gitbook"], function(gitbook) {
  gitbook.start({
    // Share-button settings: one boolean flag per service, plus the list of
    // service names under "all".
    "sharing": {
      "github": false,
      "facebook": true,
      "twitter": true,
      "google": false,
      "linkedin": false,
      "weibo": false,
      // This key was previously misspelled "instapper", which did not match
      // the "instapaper" service name listed in "all" below.
      "instapaper": false,
      "vk": false,
      "all": ["facebook", "google", "twitter", "linkedin", "weibo", "instapaper"]
    },
    // Initial reader font settings (theme, font family, size step).
    "fontsettings": {
      "theme": "white",
      "family": "sans",
      "size": 2
    },
    // No edit link is configured for this book.
    "edit": {
      "link": null,
      "text": null
    },
    // No download options are configured.
    "download": null,
    // Table-of-contents collapse setting.
    "toc": {
      "collapse": "subsection"
    }
  });
});
</script>
</body>
</html>