text_analysis/cgu3.R at master · lorenc5/text_analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#############################################
# Loren Collingwood                         #
# UC Riverside                              #
# DATE: 5/22/2019                           #
# Webscraping with rvest                    #
# Topic 2: Prison Company News Releases     #
#############################################

#######################
# Clear the Workspace #
#######################

# Removes every non-hidden object from the current environment so the script
# starts from a clean slate; the console output itself is not touched #
rm(list = ls())

############
# Packages #
############

library(rvest)
library(readxl)
library(lubridate)
library(svMisc)
library(data.table)

#################
# Set Directory #
#################

# Point the session at the workshop folder ... #
setwd("~/Dropbox/collingwood_research/cgu_workshop/text_analysis")

# ... and show what it contains #
list.files()

######################
# Read in Links Data #
######################

# cca_links.csv is read from the working directory; its `link` column holds
# the URLs that the scraping loop visits #
links <- read.csv("cca_links.csv", header = TRUE, stringsAsFactors = FALSE)

links <- links$link

# Fail early with a clear message instead of partway through the scrape #
if (length(links) == 0) {
  stop("cca_links.csv has no `link` column or contains no rows", call. = FALSE)
}

n <- length(links)

# Initiate list holder/container, preallocated with one slot per link #
cca_hold <- vector("list", n)

#################
# Initiate Loop #
#################

# For every URL in `links`: download the page, pull out the first
# "Month DD, YYYY" date, the headline that precedes it, and the paragraph
# text, then store them as a one-row data frame in cca_hold[[i]].
#
# String handling uses base R only: the stringr helpers (str_extract,
# str_squish, str_split, str_trim) cannot be relied on here because no
# library(stringr) call appears in this script.

# Date as it appears in a release, e.g. "May 22, 2019" #
date_pattern <- "[a-zA-Z]+ [0-9]+[,]+\\s+[0-9]{4}"

# seq_len() gives an empty loop when n is 0, whereas 1:n would run over 1, 0 #
for (i in seq_len(n)) {

  if (i == 1) message("Start")
  if (i == round(n * .5, 0)) message("50% Done")

  #############################################
  # Read in Link -- this is the main iterator #
  #############################################

  # A page that fails to download is reported and skipped, so one bad link
  # does not throw away everything scraped so far #
  cca <- tryCatch(
    read_html(links[i]),
    error = function(e) {
      warning("Skipping ", links[i], ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (is.null(cca)) next

  ################
  # Extract Date #
  ################

  node_text <- html_text(html_nodes(cca, "div div"))

  # regmatches() returns only the elements that matched, so the first entry
  # is the first date on the page (NA when nothing matched) #
  date_hits <- regmatches(
    node_text,
    regexpr(date_pattern, node_text, perl = TRUE)
  )
  raw_date <- date_hits[1]

  # Collapse whitespace runs for storage; raw_date keeps the exact characters
  # found on the page so it can still be located in the text below #
  dates <- gsub("\\s+", " ", raw_date, perl = TRUE)

  ####################
  # Extract Headline #
  ####################

  # Collapse Text into Vector #
  page_text <- paste(node_text, collapse = " ")

  if (is.na(raw_date)) {
    # Without a date there is no anchor to locate the headline #
    headline <- NA_character_
  } else {
    # Keep everything that comes before the first occurrence of the date #
    date_start <- regexpr(raw_date, page_text, fixed = TRUE)
    before_date <- substr(page_text, 1, date_start - 1)

    # Assumes \n\n\n is in all releases and that the headline is the piece
    # after the last such break; with no break the whole prefix is used #
    breaks <- gregexpr("\n\n\n", before_date, fixed = TRUE)[[1]]
    last_break <- max(breaks) # -1 when there is no break
    headline_start <- if (last_break > 0) last_break + 3 else 1
    headline <- trimws(
      substr(before_date, headline_start, nchar(before_date))
    )
  }

  ######################################
  # Get Paragraph Text of News Release #
  ######################################

  text <- html_text(html_nodes(cca, "div p"))
  text <- paste(text, collapse = " ")
  text <- gsub("[\r\n]", "", text) # strip carriage returns and newlines

  # Place data frame into List Item #
  cca_hold[[i]] <- data.frame(dates, headline, text, stringsAsFactors = FALSE)

}

# Reported once the last page has actually been processed #
if (n > 0) message("Done!")

###########################################
# Convert List of dataframes to dataframe #
###########################################

# Stack the per-release one-row data frames into a single table #
cca_df <- data.table::rbindlist(l = cca_hold)

########################
# Look at Column Names #
########################

names(cca_df)

#################
# Look at Dates #
#################

# Parse the month-day-year strings once, then reuse the parsed values #
release_dates <- lubridate::mdy(cca_df$dates)
cca_df$dates <- release_dates # Store parsed dates #
cca_df$year <- lubridate::year(release_dates) # Extract year of each release #
cca_df$headline

############################################
# Take a quick look at Yearly Distribution #
############################################

hist(cca_df$year)

#######################
# Write out .csv File #
#######################

# Only the date, year and headline columns are written; row names are left out #
write.csv(
  x = cca_df[, c("dates", "year", "headline")],
  file = "cnews_release.csv",
  row.names = FALSE
)