# Linear / Logistic Regression

``````# install/update the R packages this guide depends on
remotes::update_packages(c('magrittr', 'tidyverse', 'viridis', 'plot3D'), upgrade = TRUE)``````

# 1 getting started

``````# load the pipe operators and the plotting library used throughout
library(magrittr)
library(ggplot2)

# global ggplot theme: classic look, base font size 20, centered plot titles
theme_set(theme_classic(20) + theme(plot.title = element_text(hjust = 0.5)))
# fix the RNG seed so simulated data and plots are reproducible
set.seed(0)``````

This guide mainly shows the code and plots; for more explanation, please refer to the accompanying slides (PPT).

# 2 linear regression

``modelr::sim1  # the simulated (x, y) dataset from the modelr package``
``ggplot(modelr::sim1, aes(x, y)) + geom_point() + labs(title = '一元线性回归原始数据')  # scatter plot of the raw data``

``````# overlay the candidate line y = 1.5*x + 7 on the data and draw the
# vertical residual of each point; their squared mean is the MSE
modelr::sim1 %>%
tibble::add_column(dodge = rep(c(-1, 0, 1) / 20, 10)) %>%  # small horizontal offsets so the replicates at each x stay visible
dplyr::mutate(x = x + dodge, pred = 7 + x*1.5) %>%
ggplot(aes(x, y)) +
geom_point(colour = "grey40") +
geom_abline(intercept = 7, slope = 1.5, colour = "grey40") +
geom_linerange(aes(ymin = y, ymax = pred), colour = "#3366FF") +  # residual segments
labs(title = '均方误差：衡量拟合效果')``````

``````# sim1_dist(): mean squared error of the line y = k*x + b on modelr::sim1.
# The original routed the x column through an unnamed list and a magrittr
# tee pipe (`%T>% {names(.) <- NULL}`) into pmap_dbl; the same quantity is
# a direct vectorized expression.
sim1_dist <- function(k, b) {
  predicted <- k * modelr::sim1$x + b
  mean((modelr::sim1$y - predicted)^2)
}

# 25 x 25 grid of candidate (intercept b, slope k) pairs for a grid search
len <- 25
sim1_grid <- expand.grid(b = seq(-5, 20, length = len), k = seq(1, 3, length = len))``````
``````# evaluate the MSE at every (k, b) grid point
sim1_grid %<>% dplyr::mutate(dist = purrr::map2_dbl(k, b, sim1_dist))

# scatter of the parameter grid, colored by MSE (darker = larger error)
sim1_grid %>% ggplot(aes(k, b)) +
geom_point(aes(color = dist)) +
scale_colour_gradient(low = "#56B1F7", high = "#132B43", name = '均方误差') +
labs(title = '穷举法优化均方误差')``````

``````# same grid, with the 10 (k, b) pairs of smallest MSE highlighted in red
sim1_grid %>% ggplot(aes(k, b)) +
geom_point(data = dplyr::filter(sim1_grid, rank(dist) <= 10), size = 4, colour = "red") +
geom_point(aes(color = dist)) +
scale_colour_gradient(low = "#56B1F7", high = "#132B43", name = '均方误差') +
labs(title = '穷举法寻找最小的10个均方误差')``````

``````# shared aesthetic mapping: each (b, k) row is drawn as one straight line
linear_aes <- aes(intercept = b, slope = k)

# grey: 250 random lines; blue: the 10 grid-search lines with lowest MSE
ggplot(modelr::sim1, aes(x, y)) +
geom_point(color = 'red') +
geom_abline(linear_aes, alpha = 0.1,
data = tibble::tibble(b = runif(len*10, -20, 40), k = runif(len*10, -5, 5))) +
geom_abline(linear_aes, color = 'blue', dplyr::filter(sim1_grid, rank(dist) <= 10)) +
labs(title = '随机模型 vs 均方误差最小模型')``````

``````# fit ordinary least squares y ~ x; coefficients are (intercept, slope).
# (`\$` was a markdown-escape artifact; the R extraction operator is `$`.)
sim1_mod <- lm(y ~ x, modelr::sim1)
sim1_coef <- sim1_mod$coefficients``````
``````# the data with the fitted regression line; the equation is the title
ggplot(modelr::sim1, aes(x, y)) +
geom_point() + ylim(c(0, NA)) +
geom_abline(slope = sim1_coef[2], intercept = sim1_coef[1],
color = 'blue', size = 0.8) +
labs(title = sprintf('y = %.3f * x + %.3f', sim1_coef[2], sim1_coef[1]))``````

# 3 general linear regression

``````set.seed(0)

# simulate data whose log is linear in x: y = 1.25^(2x + 5 + noise),
# i.e. ln(y) = ln(1.25) * (2x + 5 + noise)
sim0 <- runif(20, 0, 10) %>% {tibble::tibble(x = ., y = 2*x + 5 + runif(20, 0, 4))} %>%
dplyr::mutate(y = 1.25 ^ y)
sim0``````
``````# raw data: an exponential relationship between x and y
ggplot(sim0, aes(x, y)) + geom_point() + ylim(c(0, NA)) +
labs(title = '广义线性回归原始数据')``````

``````# log-transforming the response makes the relationship linear in x
ggplot(sim0, aes(x, log(y))) + geom_point() + ylim(c(0, NA)) +
labs(y = 'ln(y)', title = '对因变量做对数变换')``````

``````# fit a linear model on the log-transformed response.
# (`\$` was a markdown-escape artifact; the R extraction operator is `$`.)
sim0_mod <- lm(log(y) ~ x, sim0)
sim0_coef <- sim0_mod$coefficients``````
``````# fitted line on the log scale, with the equation as the title
ggplot(sim0, aes(x, log(y))) +
geom_point() + ylim(c(1, NA)) + labs(y = 'ln(y)') +
geom_abline(slope = sim0_coef[2], intercept = sim0_coef[1],
color = 'blue', size = 0.8) +
labs(title = sprintf('ln(y) = %.3f * x + %.3f', sim0_coef[2], sim0_coef[1])) +
theme(plot.title = element_text(hjust = 0.5))``````

``````# back-transform: the same fit drawn on the original scale as exp(kx + b)
ggplot(sim0, aes(x, y)) +
geom_point() + ylim(c(0, NA)) +
stat_function(fun = . %>% {exp(sim0_coef[2] * . + sim0_coef[1])},
color = 'blue', size = 0.8) +
labs(title = sprintf('y = exp(%.3f * x + %.3f)', sim0_coef[2], sim0_coef[1])) +
theme(plot.title = element_text(hjust = 0.5))``````

# 4 logistic regression

``````# binary classification dataset: drop setosa, keep Sepal.Width (col 2)
# and Petal.Width (col 4) as predictors, label versicolor as 1
iris_df <- iris %>% dplyr::as_tibble() %>%
dplyr::filter(Species != 'setosa') %>%
dplyr::mutate(versicolor = as.integer(Species == 'versicolor')) %>%  # 1 = versicolor, 0 = virginica
dplyr::select(2, 4, versicolor)
iris_df``````
``````# scatter plot of the two predictors; point size encodes the class label
ggplot(iris_df) +
geom_point(aes(Sepal.Width, Petal.Width, size = factor(versicolor))) +
scale_size_manual(name = 'versicolor', values = c('0' = 0.5, '1' = 2), labels = c('no', 'yes')) +
labs(title = '对数几率回归原始数据')``````

``````# logistic regression without an intercept term ("+ 0").
# (`\$` was a markdown-escape artifact; the R extraction operator is `$`.)
iris_model <- glm(versicolor ~ Sepal.Width + Petal.Width + 0, data = iris_df, family = 'binomial')
iris_coef <- iris_model$coefficients``````
``````# base plot reused below: points colored by the model's predicted
# probability and sized by the true class label
logistic_plot_base <- iris_df %>%
{tibble::add_column(., probability = predict(iris_model, .[ , 1:2], 'response'))} %>%
ggplot(aes(Sepal.Width, Petal.Width)) +
geom_point(aes(color = probability, size = factor(versicolor))) +
viridis::scale_color_viridis(name = 'versicolor') +
scale_size_manual(values = c('0' = 1, '1' = 2), labels = c('no', 'yes'))

logistic_plot_base + labs(title = '使用 R 进行对数几率回归')``````

``````# decision boundary (probability 0.5): the line where the linear predictor
# w1*Sepal.Width + w2*Petal.Width equals log(1) = 0
logistic_plot_base +
geom_abline(slope = -iris_coef[1]/iris_coef[2], intercept = log(1)/iris_coef[2]) +
labs(title = '对数几率回归中的“线性”部分')``````

``````# compare the boundaries implied by two hand-picked coefficient pairs
logistic_plot_base + xlim(c(2, 4.5)) +
geom_abline(slope = -6/-10, intercept = log(1)/-20) +
geom_abline(slope = -3/-12, intercept = log(1)/-12) +
labs(title = '两种对数几率模型中“线性”部分的比较')``````

# 5 appendix

I tried using a 3D plot to show the minimum MSE:

The following code is preserved to show how to plot a 3D surface from your own data:

``head(sim1_grid)  # first rows of the grid-search results (k, b, dist)``
``````# reshape the long (k, b, dist) grid into a k-by-b matrix of MSE values,
# then draw it as a 3D surface (x = k, y = b, z = MSE).
# pivot_wider() replaces the superseded spread(); TRUE replaces the
# reassignable shorthand T.
sim1_grid %>% tidyr::pivot_wider(names_from = 'b', values_from = 'dist') %>%
tibble::column_to_rownames('k') %>% as.matrix() %>%
{plot3D::surf3D(
matrix(rep(rownames(.), len) %>% as.numeric(), ncol = len),               # k coordinates
matrix(rep(colnames(.), len) %>% as.numeric(), ncol = len, byrow = TRUE), # b coordinates
.                                                                         # z: the MSE matrix
)}``````