# R

Strictly speaking, R is not an OOP language, so a list can be used as a general-purpose container.
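
A quick illustration: a list can mix types, which is what makes it a generic container.

```r
x <- list(name = "Ann", scores = c(90, 85), passed = TRUE)  # mixed types in one list
x$scores   # access by name
x[[2]]     # access by position
```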

reg <- lm() runs a regression extremely simply; typing reg$ brings up hints for all the fitted components, so you know which one to use.
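
A minimal sketch using the built-in mtcars data (the model formula is just for illustration):

```r
reg <- lm(mpg ~ wt, data = mtcars)  # fit a linear regression
reg$coefficients                    # reg$ tab-completion hints at all the fitted components
summary(reg)                        # coefficients, R-squared, p-values
```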

A data frame is also a list.
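
For example:

```r
df <- data.frame(a = 1:3, b = letters[1:3])
is.list(df)  # TRUE: a data frame is a list of equal-length columns
df[["a"]]    # list-style access works on columns
```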

```r
# matrix multiplication with %*%
A <- matrix(1:4, nrow = 2)
B <- matrix(5:8, nrow = 2)
A %*% B

# grep / grepl and sub / gsub
haystack <- c("red", "blue", "green", "blue", "green forest")


grep("green", haystack)

grep("r", haystack) # returns position
grep("r", haystack, value = TRUE) # returns value
grepl("r", haystack) # returns boolean

sub("e", "+", haystack) # replaces pattern with replacement (once)
gsub("e", "+", haystack) # replaces pattern with replacement (global)
```

### Introduction to the Tidyverse (Notes)

```r
# arrange(xx) sorts ascending; arrange(desc(xx)) sorts descending

# ggplot: scatter plot with color and size mapped to variables
ggplot(gapminder_1952, aes(x = pop, y = lifeExp, color = continent, size = gdpPercap)) +
  geom_point() +
  scale_x_log10() +
  expand_limits(y = 0)
  
# ggplot + facet_wrap: draw different panels side by side, one per group
ggplot(gapminder_1952, aes(x = pop, y = lifeExp)) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~ continent)

# bar plot
by_continent <- gapminder %>%
  filter(year == 1952) %>%
  group_by(continent) %>%
  summarize(medianGdpPercap = median(gdpPercap))
ggplot(by_continent, aes(x = continent, y = medianGdpPercap)) +
  geom_col()

# group_by and summarize: a bit like SQL's aggregation functions with GROUP BY
gapminder %>%
  group_by(year) %>%
  summarize(medianLifeExp = median(lifeExp), maxGdpPercap = max(gdpPercap))

# histogram
ggplot(gapminder_1952, aes(x = pop_by_mil)) +
  geom_histogram(bins = 50)

# boxplot
ggplot(gapminder_1952, aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  scale_y_log10() +
  ggtitle("Comparing GDP per capita across continents")
```

### Import data in R 1 (Notes)

```r
# read.table's default for header is FALSE
path <- file.path("data", "hotdogs.txt")

# Import the hotdogs.txt file: hotdogs
hotdogs <- read.table(path, 
                      sep = '', 
                      col.names = c("type", "calories", "sodium"),
                      colClasses = c("factor", "NULL", "numeric"))

# readr treats the first row as a header by default; delim must be specified
# the result's class is "tbl_df" "tbl" "data.frame"
read_delim("potatoes.txt", delim = '\t', col_names = properties)

# Import swimming_pools.csv; the default header is TRUE
pools <- read.csv( "swimming_pools.csv", stringsAsFactors=FALSE)
# read.csv uses "," as the separator; read.csv2 uses ";"

# readr reads tibbles: no stringsAsFactors needed, and an extra line describing the columns is printed
read_csv("swimming_pools.csv")


# default header is TRUE; if you skip some lines, the header gets skipped too, so set header = FALSE
hotdogs <- read.delim("hotdogs.txt", header=FALSE)

# readr reads a tibble; no need for stringsAsFactors
potatoes <- read_tsv("potatoes.txt", col_names=properties)


# fread is faster and can also select specific rows/columns; the class is "data.table" "data.frame"
library(data.table)
potatoes <- fread("potatoes.csv", select = c(1, 5))


# reading Excel: readxl
library(readxl)
excel_sheets(" .xlsx")           # gives the names of the sheets
read_excel(" .xlsx")             # reads the first sheet of the workbook into a tibble
read_excel(" .xlsx", sheet = 2)  # specify the second sheet, or use sheet = 'name'
cols <- c("country", paste0("year_", 1960:1966))
pop_b <- read_excel("urbanpop_nonames.xlsx", col_names=cols, sheet=1)


# lapply can read all the sheets at once and store them in a list
pop_list <- lapply(excel_sheets("urbanpop.xlsx"), read_excel, path="urbanpop.xlsx")


# gdata uses Perl to convert the Excel file to CSV, then relies on read.csv(), a wrapper around read.table()
library(gdata)
urban_pop<-read.xls("urbanpop.xls", col.names= c('country', "1967", "1968", "1969", "1970", "1971", "1972", "1973", "1974"), sheet=2)



# XLConnect: manipulate Excel workbooks from inside R
install.packages("XLConnect")  # also installs rJava
library(XLConnect)
my_book <- loadWorkbook("urbanpop.xlsx")
getSheets(my_book) # same effect as excel_sheets() above
readWorksheet(my_book, sheet = 2, startCol=1, endCol=1)
createSheet(my_book, name="xx")
writeWorksheet(my_book, xx, sheet ="xx")
saveWorkbook(my_book, file="xx")
renameSheet(my_book, "original name", "newname")
saveWorkbook(my_book, file="xx")
removeSheet(my_book, sheet="")

my_book <- loadWorkbook("urbanpop.xlsx")
sheets <- getSheets(my_book)
all <- lapply(sheets, readWorksheet, object = my_book)
str(all)


# which.min/which.max pick the row with the smallest/largest value in a column
lily <- hotdogs[which.min(hotdogs$calories), ]

# print a single row of a tibble; note the comma
print(urbanpop_sel[1,])

# drop the first column of a tibble with [-1]
urban <- cbind(urban_sheet1, urban_sheet2[-1], urban_sheet3[-1])

# drop the rows that contain NA
urban_clean <- na.omit(urban)

```

### Import data in R 2 (Notes)

```r
# Load the DBI package
library(DBI)

# Edit the dbConnect() call; the result is a MySQL connection object
con <- dbConnect(RMySQL::MySQL(), 
                 dbname = "tweater", 
                 host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com", 
                 port = 3306,
                 user = "student",
                 password = "datacamp")
                 
dbListTables(con) # lists all the relational tables here

dbReadTable(con, "") # returns the actual data

dbDisconnect(con) # disconnect when finished

dbGetQuery(con, "SELECT name FROM employees WHERE started_at > 'xx'") # note: SQL uses a single = after WHERE
# this is equivalent to
subset(employees, subset = started_at > "xx", select = name)
# and also equivalent to
res <- dbSendQuery(con, "")
dbFetch(res)  # advantage: the number of rows can be specified, e.g. dbFetch(res, n = 2)


# clear the result when done
dbClearResult(res)
# in SQL, CHAR_LENGTH() can be used after WHERE; it returns the number of characters in a string

# read.csv can also load a CSV directly from an http or https URL
# but reading xlsx the same way with read_excel doesn't work yet; download the file first, then load it
dest_path <- file.path("")
download.file(url, dest_path)

# RData files can also be downloaded; afterwards just load("name")

#httr & GET
library(httr)
resp <- GET(url)
# Get the raw content of resp: raw_content
raw_content<- content(resp, as='raw') 
# other formats for content(): as = "text" gives the content as text; the default gives an R list

# jsonlite: JSON can be a JSON file or a JSON array
install.packages("jsonlite")
library(jsonlite)
fromJSON("somejsonurl") # the argument can be a JSON URL or a string of key-value pairs
toJSON()
pretty_json=toJSON(mtcars, pretty=TRUE)

prettify()
minify()


# haven: SAS, Stata, SPSS
library(haven)
read_sas("xx")
read_stata(), read_dta()
# note: Stata files come out with labelled values (which carry an ordering); convert them to R factors when processing,
# e.g. as.character(as_factor(xx$xx)); sugar$Date <- as.Date(as_factor(sugar$Date))
read_spss()  read_por() read_sav()

# foreign: SAS, Stata, SPSS, Systat, Weka
# for read.dta: convert.factors is TRUE by default, convert.dates is TRUE by default,
# and missing.type is FALSE by default; if FALSE, every missing value becomes NA,
# if TRUE, the missing-value attributes (27 kinds) are shown
read.dta()

# use.value.labels = TRUE by default, to.data.frame = FALSE by default
read.spss()


nrow(subset(df, condition)) # returns the number of rows that satisfy the condition
```
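
A quick jsonlite round trip (the JSON string here is made up for illustration):

```r
library(jsonlite)

json <- '{"name": "Ann", "scores": [90, 85]}'
x <- fromJSON(json)       # parses the JSON into an R list
toJSON(x, pretty = TRUE)  # back to pretty-printed JSON
minify(json)              # strips all whitespace
prettify(minify(json))    # re-indents it again
```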

### Cleaning Data in R (Notes)

Variable types in R:

character, numeric (NaN and Inf are both numeric), integer (only literals ending in L are integers), factor, logical (NA counts as logical)
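
A quick check of these classes:

```r
class("a")   # "character"
class(NaN)   # "numeric"
class(Inf)   # "numeric"
class(1L)    # "integer"
class(1)     # "numeric" (no L suffix)
class(NA)    # "logical"
class(factor("a"))  # "factor"
```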

```r
# view the dimensions of a data frame: rows first, then columns
dim(lunch)

#look at column names
names(lunch)

# summary(): median, mean, quantiles, etc.
summary(lunch)

# from the dplyr package
glimpse(lunch)

# view the top, can specify n
head(lunch)
tail(lunch) # bottom

# draw a histogram
hist(lunch$per_free_red, breaks = 20)
# scatter plot
plot(x, y)
# boxplot
boxplot(x, horizontal = TRUE)

# The following are from the tidyr package

# gather() collects columns into key-value pairs
gather(df, key, value, -namethiscolumn) # all columns except namethiscolumn

# spread() spreads the key-value pairs into columns
spread(df, key, value)

# separate one column into multiple columns
separate(df, columnname, c('this', 'that'), sep= '')

# unite multiple columns into one
unite(df, unitedname, columname1, columname2, sep="-")


# the lubridate package handles datetime variables
ymd("2015-08-25") ymd("2015 August 25") mdy("August 25, 2015")
hms("13:33:09") ymd_hms("2015/08/25 13.33.09")


# stringr handles strings
str_trim("")  # trims leading and trailing whitespace
str_pad("", width = number, side = "left", pad = "0")  # pads a string out to a given width
str_detect(vectorname, "string")  # returns a logical vector the same length as the input, marking where the string occurs
str_replace(vectorname, "oldstringname", "newname")
tolower("string")
toupper("string")

# handling NA
is.na(df)                 # returns a logical data frame with the same dimensions as df
any(is.na(df))            # returns a single logical
sum(is.na(df))            # returns the total count
which(is.na(df$columnname))
complete.cases(df)        # scans df row by row; FALSE if the row contains any NA
df[complete.cases(df), ]  # keep only the rows without NA
na.omit(df)               # also removes rows with missing values
# table()/summary() can also reveal NAs indirectly

# mutate several columns at once
mutate_all()
mutate_at(weather5, vars(CloudCover:WindDirDegrees), funs(as.numeric))
mutate_if() 

transmute_all()
transmute_at()
transmute_if()
```
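
A minimal worked example of the tidyr verbs above (the toy data frame is made up for illustration):

```r
library(tidyr)

df <- data.frame(name = c("a", "b"), y2015 = 1:2, y2016 = 3:4)

long  <- gather(df, key, value, -name)                      # wide to long: key-value pairs
wide  <- spread(long, key, value)                           # long back to wide
parts <- separate(long, key, c("prefix", "year"), sep = 1)  # split "y2015" into "y" and "2015"
back  <- unite(parts, key, prefix, year, sep = "")          # glue the pieces back together
```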

### Network Science in R (Notes)

```r
library(readr)
library(igraph)

#build a network from data frame
g <- graph_from_data_frame(d=ties, directed = FALSE, vertices = nodes)

#add name attribute to the network
g$name <- "Name"

# explore the set of nodes, and print the number of nodes
V(g) #v for vertices
vcount(g)

# explore the set of ties and print the number of ties
E(g) #e for edges
ecount(g)

# add node attribute id and print the node id attribute
V(g)$id <- 1:vcount(g)

# add cluster information 
V(g)$cluster <- nodes$cluster

# print the tie 'weight' attribute
E(g)$weight

library(ggraph)
library(dplyr)
library(ggplot2)

ggraph(g, layout = 'with_kk') +  # the Kamada-Kawai layout places tied nodes at equal distances, so all ties have roughly the same length
    geom_edge_link(aes(alpha = weight)) +   # draws edges as straight lines between nodes
    geom_node_point(aes(size = degree)) +   # draws each node as a point
    geom_node_text(aes(label = id), repel = TRUE)  # draws a text label on each node; the label aesthetic maps to the id field,
                                                   # and repel = TRUE prevents the labels from overlapping
ggraph:::igraphlayouts  # lists the available layouts

# Plot with Kamada-Kawai layout
ggraph(filtered_network, layout="with_kk") + 
  # Add an edge link geom, mapping transparency to similarity
  geom_edge_link(aes(alpha=similarity))

geom_edge_link(aes(alpha = betweenness, filter = betweenness > median_betweenness))

'in_circle'  # places nodes on a circle
"on_grid"    # places nodes on a grid

# degree: how many edges a node has (igraph function)
degree(g)

# strength: the sum of the weights of all edges attached to a node
strength(g)

nodes_with_centrality <- nodes %>%
  # Add a column containing the degree of each node
  mutate(degree = degree(g), strength = strength(g)) %>%
  # Arrange rows by descending degree
  arrange(desc(degree))

#how often a node lies on the shortest path between other nodes.
betweenness()
# how close a node is to all other nodes in the network in terms of shortest path distance.
closeness()

# betweenness of ties: the number of shortest paths going through a tie
# 1. take the inverse of weight, call it dist_weight, then apply edge_betweenness()
dist_weight = 1/E(g)$weight
edge_betweenness(g, weights = dist_weight)

# define a weak tie as a tie with a weight equal to 1
# find the number and percentage of weak ties
ties %>%
  group_by(weight) %>%
  summarise(n = n(), p = n / nrow(ties)) %>%  # n() counts the rows in each group
  arrange(-n)
  
tie_counts_by_weight <- ties %>% 
  # Count the number of rows with each weight
  count(weight) %>%
  # Add a column of the percentage of rows with each weight
  mutate(percentage = 100 * n / nrow(ties)) 
  
# connection patterns: using the adjacency matrix
# (symmetric for an undirected network)
A = as_adjacency_matrix(g, attr="weight")

A[1,] #first row
A[,1] #first column

# Pearson similarity: values in [-1, 1]
ties_swapped <- ties %>%
  mutate(temp = to, to = from, from = temp) %>% 
  select(-temp)
ties_bound <- bind_rows(ties, ties_swapped)

# Using ties_bound, plot to vs. from, filled by weight
ggplot(ties_bound, aes(x = from, y = to, fill = factor(weight))) +
  # Add a raster geom
  geom_raster() + #draw a point in the plot if there is a tie 
  # Label the color scale as "weight"
  labs(fill = "weight")
  
# visualize correlation
ggplot(data=nodes, mapping=aes(x=degree, y=strength))+
  geom_point()+
  geom_smooth(method="lm", se=FALSE)
  
# graph to matrix
A <- as_adjacency_matrix(g)
# matrix to graph
g<-graph_from_adjacency_matrix(A, mode="undirected", weighted=TRUE)
# graph to dataframe
df = as_data_frame(g, what='both')
sim_df <- igraph::as_data_frame(h, what = "edges")
# dataframe to graph
g<-graph_from_data_frame(df$ties, vertices=df$nodes)
# matrix to dataframe
df = as_data_frame(graph_from_adjacency_matrix(A), what="both")
# dataframe to matrix
A<- as_adjacency_matrix(graph_from_data_frame(df$ties, vertices=df$nodes))
# Convert df to a tibble
sim_tib <- as_tibble(sim_df)

# hierarchical clustering in R
# distance matrix from similarity matrix 
D<- 1-S #distance is defined as the complement to 1 of similarity
# distance object from distance matrix
d<- as.dist(D)
# average-linkage clustering method
cc<- hclust(d, method="average")
# cut dendrogram at 4 clusters
cutree(cc,k=4)
#Plot the dendrogram of cc with plot().
plot(cc)
#the first pair that was merged 
cc$merge[1,]

# dplyr operations in R that resemble SQL:
nodes %>%
  # Filter rows for cluster 1
  filter(cluster==1) %>% 
  # Select the name column
  select(name)
# Calculate properties of each cluster
nodes %>%
  # Group by cluster
  group_by(cluster) %>%
  # Calculate summary statistics
  summarize(
    # Number of nodes
    size = n(), 
    # Mean degree
    avg_degree = mean(degree),
    # Mean strength
    avg_strength = mean(strength)
  ) %>% 
  # Arrange rows by decreasing size
  arrange(desc(size))
  
  
# like ggplot's facet_wrap, plots can be split into separate panels, one per cluster
  facet_nodes(~cluster, scales="free")
  
#visNetwork, interactive plot
data <- toVisNetworkData(g)
head(data$nodes)
head(data$edges)
visNetwork(nodes = NULL, edges = NULL, dot = NULL, gephi = NULL,
  width = NULL, height = NULL, main = NULL, submain = NULL,
  footer = NULL, background = "rgba(0, 0, 0, 0)", ...)
# different layouts can also be applied
ls("package:igraph", pattern = "^layout_.")
# change the layout, chained onto visNetwork() with %>%
  visIgraphLayout(layout = "layout_with_kk")
# change colors when a node is selected, chained with %>%
  visOptions(highlightNearest = TRUE)
# add a selection bar for picking a node by id
  visOptions(nodesIdSelection = TRUE)
# color nodes by cluster: use the cluster attribute to drive the colors
V(g)$color <- V(g)$cluster
  visOptions(selectedBy = "group")
```

### Predictive Analytics using Networked Data in R (notes)

Number of edges in a fully connected network: $$\binom{\text{nodes}}{2}=\frac{\text{nodes}(\text{nodes}-1)}{2}$$

Network connectance: $$p=\frac{2 \cdot \text{edges}}{\text{nodes}(\text{nodes}-1)}$$ (the ratio of the actual number of edges to that of a fully connected network)

Expected number of same-label edges: $$\binom{n_g}{2} \cdot p=\frac{n_g(n_g-1)}{2} \cdot p$$

Dyadicity: $$D=\frac{\text{number of same label edges}}{\text{expected number of same label edges}}$$ $$\begin{array}{l}{D>1 \Rightarrow \text{Dyadic}} \\ {D \simeq 1 \Rightarrow \text{Random}} \\ {D<1 \Rightarrow \text{Anti-Dyadic}}\end{array}$$

Expected number of cross-label edges: $$n_w n_g p$$

Heterophilicity: $$H=\frac{\text{number of cross label edges}}{\text{expected number of cross label edges}}$$

$$\begin{array}{l}{H>1 \Rightarrow \text{Heterophilic}} \\ {H \simeq 1 \Rightarrow \text{Random}} \\ {H<1 \Rightarrow \text{Heterophobic}}\end{array}$$
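
A worked sketch of these formulas on a made-up toy network (the graph, labels, and variable names below are assumptions for illustration):

```r
library(igraph)

# toy network: 4 nodes, 5 ties; labels 1 = "green", 0 = "white"
g <- graph_from_literal(A-B, B-C, C-D, D-A, A-C)
V(g)$label01 <- c(1, 1, 0, 0)

n <- vcount(g)
p <- 2 * ecount(g) / (n * (n - 1))  # connectance

el  <- as_data_frame(g, what = "edges")
lab <- setNames(V(g)$label01, V(g)$name)
same  <- sum(lab[el$from] == 1 & lab[el$to] == 1)  # same-label (green-green) edges
cross <- sum(lab[el$from] != lab[el$to])           # cross-label edges

n_g <- sum(lab == 1)
n_w <- n - n_g

D <- same / (n_g * (n_g - 1) / 2 * p)  # dyadicity
H <- cross / (n_w * n_g * p)           # heterophilicity
```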

```r
library(igraph)
plot.igraph(g, edge.label=, edge.color=, layout=, vertex.label=, vertex.color=)

#The relational Neighbor Classifier
RelationalNeighbor <- Avector/(Avector+Bvector)
# Compute the churn probabilities
churnProb <- ChurnNeighbors / (ChurnNeighbors + NonChurnNeighbors)
# Find who is most likely to churn
mostLikelyChurners <- which(churnProb == max(churnProb))
# Extract the IDs of the most likely churners
customers$id[mostLikelyChurners]


# Create a subgraph with only churners
churnerNetwork <- induced_subgraph(network, 
                    v = V(network)[which(V(network)$churn == 1)])
                    
# Plot the churner network 
plot(churnerNetwork, vertex.label = NA, vertex.size = 2)


# First challenge: randomly sampling nodes (below) doesn't give a valid train/test split, so either featurize the network or train on one network and test on another
set.seed(1001)
sampleVertices<-sample(1:10, 6, replace=FALSE)
plot(induced_subgraph(g, V(g)[sampleVertices]))
plot(induced_subgraph(g, V(g)[-sampleVertices]))

# Second Challenge: data not iid
# eg. probabilistic relational neighbor classifier 
churnProb_updated <- as.vector((AdjacencyMatrix %*% churnProb) / neighbors)


# Third Challenge: collective inference, eg. Gibbs sampling, iterative classification, relaxation labeling 
#Collective inferencing is a procedure to simultaneously label nodes in interconnected data to reduce classification error.

# in R, AUC is in library(pROC)
library(pROC)
auc(customers$churn, as.vector(churnProb))   #response, predictor

# Homophily: birds of a feather flock together

# match(x, y) returns a vector with the location of x in y
edgeList$FromLabel <- customers[match(edgeList$from, customers$id), 2] # yields the matched vector of 0s and 1s

# Dyadicity: measures the connectedness between nodes with the same label compared to what is expected 
# in a random configuration of the network.

# Heterophilicity: connectedness between nodes with different labels compared to what is expected for a random configuration
 
```

