-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcgu3.R
More file actions
131 lines (92 loc) · 3.21 KB
/
cgu3.R
File metadata and controls
131 lines (92 loc) · 3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#############################################
# Loren Collingwood #
# UC Riverside #
# DATE: 5/22/2019 #
# Webscraping with rvest #
# Topic 2: Prison Company News Releases #
#############################################
#######################
# Clear the Workspace #
#######################
# Removes the objects listed by ls() from the current environment so the
# script starts from a clean workspace. This does not clear console output
# and does not detach packages that were attached earlier in the session.
rm(list=ls())
############
# Packages #
############
library(rvest)
library(readxl)
library(lubridate)
library(svMisc)
library(data.table)
library(stringr) # str_extract(), str_squish(), str_split(), str_trim(), fixed()
#################
# Set Directory #
#################
# NOTE: hard-coded, machine-specific path. Point this at the folder that
# holds cca_links.csv before running on another machine.
setwd("~/Dropbox/collingwood_research/cgu_workshop/text_analysis")
list.files()
######################
# Read in Links Data #
######################
# cca_links.csv must provide a `link` column; each value is later handed to
# read_html() by the scraping loop.
links_df <- read.csv("cca_links.csv", header = TRUE, stringsAsFactors = FALSE)
links <- links_df$link
# Fail early with a clear message instead of erroring inside the loop.
if (is.null(links) || length(links) == 0) {
  stop("cca_links.csv has no usable `link` column", call. = FALSE)
}
# Initiate list holder/container #
# Preallocated: one slot per link, filled in by the scraping loop.
n <- length(links)
cca_hold <- vector("list", n)
#################
# Initiate Loop #
#################
# For every URL in `links`: download the page, pull out the release date,
# the headline and the body text, and store them as a one-row data frame in
# cca_hold[[i]]. Pages that cannot be downloaded are skipped with a warning
# and leave their slot empty.

# Matches dates such as "May 22, 2019"; whitespace after the comma may vary.
date_pattern <- "[a-zA-Z]+ [0-9]+[,]+\\s+[0-9]{4}"
halfway <- round(n * .5, 0)

for (i in seq_len(n)) {
  if (i == 1) message("Start")
  if (i == halfway) message("50% Done")
  #############################################
  # Read in Link -- this is the main iterator #
  #############################################
  # A single dead or malformed link should not abort the whole scrape.
  cca <- tryCatch(
    read_html(links[i]),
    error = function(e) {
      warning("Skipping link ", i, " (", links[i], "): ",
              conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (is.null(cca)) next
  ################
  # Extract Date #
  ################
  div_text <- html_text(html_nodes(cca, "div div"))
  # First date-looking string on the page, exactly as it appears in the HTML.
  # NA when no <div> contains a date.
  raw_date <- str_extract(div_text, date_pattern)
  raw_date <- raw_date[!is.na(raw_date)][1]
  # Normalised copy (single spaces) that is stored and later parsed.
  dates <- str_squish(raw_date)
  ####################
  # Extract Headline #
  ####################
  page_text <- paste(div_text, collapse = " ")
  if (is.na(raw_date)) {
    # Without a date there is nothing to anchor the headline on.
    headline <- NA_character_
  } else {
    # Split on the date as it literally occurs in the page. The squished
    # `dates` may not match the raw text, and fixed() avoids treating the
    # date as a regular expression.
    before_date <- str_split(page_text, fixed(raw_date))[[1]][1]
    chunks <- str_split(before_date, "\n\n\n")[[1]] # Assumes \n\n\n is in all releases
    headline <- str_trim(chunks[length(chunks)])    # assumes the headline is last
  }
  ######################################
  # Get Paragraph Text of News Release #
  ######################################
  body_text <- paste(html_text(html_nodes(cca, "div p")), collapse = " ")
  body_text <- gsub("[\r\n]", "", body_text) # drop carriage returns and newlines
  # Place data frame into List Item #
  cca_hold[[i]] <- data.frame(dates = dates, headline = headline,
                              text = body_text, stringsAsFactors = FALSE)
}
message("Done!")
###########################################
# Convert List of dataframes to dataframe #
###########################################
# rbindlist() stacks the per-page data frames into one data.table.
cca_df <- rbindlist(cca_hold)
########################
# Look at Column Names #
########################
names(cca_df)
#################
# Look at Dates #
#################
cca_df$dates <- lubridate::mdy(cca_df$dates) # Parse "Month day, year" strings into Dates #
cca_df$year <- lubridate::year(cca_df$dates) # Extract the calendar year #
cca_df$headline
############################################
# Take a quick look at Yearly Distribution #
############################################
hist(cca_df$year)
#######################
# Write out .csv File #
#######################
# Only the date, year and headline are exported; the body text is left out.
write.csv(
  cca_df[, c("dates", "year", "headline")],
  "cnews_release.csv",
  row.names = FALSE
)