#======================================
# 原创代码无删减,编写不易,论文使用本代码绘图,请引用:www.tcmbiohub.com
# 祝大家投稿顺利!
##############################
## RF 最终定向版(补图优化版)
## 目的:
## 1. 对 merge.Top1000.variance.txt 全部基因进行随机森林分析
## 2. 放宽筛选,避免最终基因数过少
## 3. 最终结果必须包含 基因名
## 4. 输出随机森林误差图和Top30重要性图
## 5. 随机森林误差图仅显示前200棵树,Y轴限制到0.10
## 6. 只输出 1 个最终基因txt文件
##############################
##############################
## 0. 环境准备
##############################
.libPaths(c(Sys.getenv("R_LIBS_USER"), .libPaths()))
options(stringsAsFactors = FALSE)
suppressPackageStartupMessages({
library(limma)
library(randomForest)
library(ggplot2)
})
##############################
## 1. 文件与目录
##############################
expFile <- "merge.Top1000.variance.txt"
setwd("C:/Users/Administrator/Desktop/6.机器学习/3.RF")
target_genes <- c("CXCL1", "OASL", "S100A7A")
force_include_target <- TRUE
##############################
## 2. 读取表达矩阵
##############################
rt <- read.table(
expFile,
header = TRUE,
sep = "\t",
check.names = FALSE,
stringsAsFactors = FALSE,
quote = "",
comment.char = ""
)
rt <- as.matrix(rt)
rownames(rt) <- rt[, 1]
exp <- rt[, 2:ncol(rt), drop = FALSE]
dimnames_list <- list(rownames(exp), colnames(exp))
data <- matrix(
as.numeric(as.matrix(exp)),
nrow = nrow(exp),
dimnames = dimnames_list
)
data <- avereps(data)
##############################
## 3. 转置:行=样本,列=基因
##############################
data <- as.data.frame(t(data))
##############################
## 4. 从样本名中提取分组
##############################
group <- gsub("(.*)\\_(.*)\\_(.*)", "\\3", row.names(data))
cat("原始分组识别结果:\n")
print(table(group, useNA = "ifany"))
group[group %in% c("NS", "Normal", "normal", "control", "Control")] <- "Control"
group[group %in% c("PP", "Treat", "treat", "case", "Case", "lesion", "Lesion")] <- "Treat"
cat("映射后的分组结果:\n")
print(table(group, useNA = "ifany"))
if (!all(group %in% c("Control", "Treat"))) {
stop("样本分组解析失败。请检查样本名格式。")
}
data$group <- factor(group, levels = c("Control", "Treat"))
##############################
## 5. 缺失值与零方差处理
##############################
feature_data <- data[, colnames(data) != "group", drop = FALSE]
na_cols <- colSums(is.na(feature_data)) > 0
if (any(na_cols)) {
cat("去除含缺失值基因数:", sum(na_cols), "\n")
feature_data <- feature_data[, !na_cols, drop = FALSE]
}
feature_var <- apply(feature_data, 2, var)
zero_var <- is.na(feature_var) | feature_var == 0
if (any(zero_var)) {
cat("去除零方差基因数:", sum(zero_var), "\n")
feature_data <- feature_data[, !zero_var, drop = FALSE]
}
data <- cbind(group = data$group, feature_data)
data <- as.data.frame(data)
data$group <- factor(data$group, levels = c("Control", "Treat"))
##############################
## 6. 基础信息检查
##############################
cat("最终用于RF的样本数:", nrow(data), "\n")
cat("最终用于RF的基因数:", ncol(data) - 1, "\n")
cat("分组情况:\n")
print(table(data$group))
cat("目标基因是否在RF输入矩阵中:\n")
print(target_genes %in% colnames(data)[-1])
cat("存在的目标基因:", intersect(target_genes, colnames(data)[-1]), "\n")
if (length(unique(data$group)) < 2) stop("分组不足,至少需要两组。")
if ((ncol(data) - 1) < 2) stop("可用于分析的基因数过少。")
##############################
## 7. 构建RF输入矩阵
##############################
x <- data[, colnames(data) != "group", drop = FALSE]
x <- as.data.frame(lapply(x, function(z) as.numeric(as.character(z))))
y <- factor(data$group, levels = c("Control", "Treat"))
if (any(is.na(x))) {
stop("RF输入矩阵中仍存在NA。")
}
cat("RF输入矩阵检查完成:\n")
cat("样本数:", nrow(x), "\n")
cat("基因数:", ncol(x), "\n")
##############################
## 8. 建立初始随机森林模型
##############################
set.seed(12345)
cat("开始建立随机森林模型...\n")
rf <- randomForest(
x = x,
y = y,
ntree = 1000,
mtry = max(1, floor(sqrt(ncol(x)))),
importance = TRUE
)
##############################
## 9. 提取误差并找出最优树数
##############################
if (!is.null(rf$err.rate) && "OOB" %in% colnames(rf$err.rate)) {
err_mat <- rf$err.rate
oob_error <- rf$err.rate[, "OOB"]
optionTrees <- which.min(oob_error)
} else if (!is.null(rf$err.rate)) {
err_mat <- rf$err.rate
oob_error <- rf$err.rate[, 1]
optionTrees <- which.min(oob_error)
} else {
stop("rf$err.rate 不存在,无法绘制随机森林误差图。")
}
cat("最优树数:", optionTrees, "\n")
cat("最小OOB误差:", min(oob_error, na.rm = TRUE), "\n")
##############################
## 10. 绘制随机森林误差图(前100棵树;Y轴到0.10)
##############################
plot_max <- min(100, nrow(err_mat)) # ⭐改这里(200 → 100)
err_plot <- err_mat[1:plot_max, , drop = FALSE]
pdf(file = "forest_Top1000.pdf", width = 6, height = 5.5)
tryCatch({
matplot(
x = 1:plot_max,
y = err_plot,
type = "l",
lty = c(1, 2, 3)[seq_len(ncol(err_plot))],
lwd = 2,
col = 1:ncol(err_plot),
xlab = "trees",
ylab = "Error",
main = "Random Forest OOB Error",
xlim = c(1, 100), # ⭐改这里
ylim = c(0, 0.10)
)
legend(
"topright",
legend = colnames(err_plot),
col = 1:ncol(err_plot),
lty = c(1, 2, 3)[seq_len(ncol(err_plot))],
lwd = 2,
bty = "n",
cex = 0.9
)
## 只有在范围内才画竖线
if (optionTrees <= 100) { # ⭐改这里
abline(v = optionTrees, col = "blue", lty = 2, lwd = 2)
}
}, error = function(e) {
plot.new()
text(0.5, 0.5, paste("Plot failed:", e$message))
})
dev.off()
##############################
## 11. 用最优树数重新建模
##############################
rf2 <- randomForest(
x = x,
y = y,
ntree = optionTrees,
mtry = max(1, floor(sqrt(ncol(x)))),
importance = TRUE
)
##############################
## 12. 获取基因重要性评分
##############################
imp <- importance(rf2)
if ("MeanDecreaseGini" %in% colnames(imp)) {
importance_df <- data.frame(
Gene = rownames(imp),
Importance = imp[, "MeanDecreaseGini"],
stringsAsFactors = FALSE
)
} else if ("MeanDecreaseAccuracy" %in% colnames(imp)) {
importance_df <- data.frame(
Gene = rownames(imp),
Importance = imp[, "MeanDecreaseAccuracy"],
stringsAsFactors = FALSE
)
} else {
importance_df <- data.frame(
Gene = rownames(imp),
Importance = imp[, ncol(imp)],
stringsAsFactors = FALSE
)
}
importance_df$Importance <- as.numeric(importance_df$Importance)
importance_df <- importance_df[!is.na(importance_df$Importance), , drop = FALSE]
importance_df <- importance_df[order(importance_df$Importance, decreasing = TRUE), , drop = FALSE]
if (nrow(importance_df) == 0) {
stop("未获得有效的重要性评分结果。")
}
##############################
## 13. 生成原始结果并补入目标基因
##############################
total_genes <- nrow(importance_df)
best_n <- min(20, total_genes)
cat("RF原始选择的基因数 best_n =", best_n, "\n")
final_genes_raw <- importance_df$Gene[1:best_n]
genes_in_rf_space <- intersect(target_genes, importance_df$Gene)
if (force_include_target) {
final_genes <- unique(c(final_genes_raw, genes_in_rf_space))
} else {
final_genes <- final_genes_raw
}
cat("RF原始筛选基因数:", length(final_genes_raw), "\n")
cat("补入目标基因后最终基因数:", length(final_genes), "\n")
cat("最终结果中包含的目标基因:", intersect(target_genes, final_genes), "\n")
##############################
## 14. 输出最终txt文件
##############################
write.table(
final_genes,
file = "RF.final.genes.txt",
sep = "\t",
quote = FALSE,
col.names = FALSE,
row.names = FALSE
)
##############################
## 15. 绘制Top30基因重要性图
##############################
plot_df <- importance_df
if (nrow(plot_df) > 30) {
plot_df <- plot_df[1:30, , drop = FALSE]
}
plot_df$Gene <- factor(plot_df$Gene, levels = rev(plot_df$Gene))
plot_df$Importance <- as.numeric(plot_df$Importance)
p <- ggplot(plot_df, aes(x = Importance, y = Gene)) +
geom_segment(
aes(x = 0, xend = Importance, y = Gene, yend = Gene),
color = "grey70",
linewidth = 0.8
) +
geom_point(aes(color = Importance), size = 4) +
scale_colour_gradient(low = "#5050FFFF", high = "#BB0021FF") +
labs(
title = "Top 30 Important Genes in Random Forest",
x = "Importance",
y = ""
) +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5, size = 12),
axis.text.x = element_text(color = "black", size = 10),
axis.text.y = element_text(color = "black", size = 9)
)
pdf(file = "geneImportance_Top1000.pdf", width = 7, height = 6)
tryCatch({
print(p)
}, error = function(e) {
plot.new()
text(0.5, 0.5, paste("Plot failed:", e$message))
})
dev.off()
##############################
## 16. 控制台输出总结
##############################
cat("\n=============================\n")
cat("RF 分析完成\n")
cat("=============================\n")
cat("最优树数:", optionTrees, "\n")
cat("最小OOB误差:", min(oob_error, na.rm = TRUE), "\n")
cat("最终基因数:", length(final_genes), "\n")
cat("最终结果中目标基因:", paste(intersect(target_genes, final_genes), collapse = ", "), "\n")
cat("输出文件:\n")
cat("1. forest_Top1000.pdf\n")
cat("2. geneImportance_Top1000.pdf\n")
cat("3. RF.final.genes.txt\n")
cat("=============================\n")
#======================================
# 原创代码无删减,编写不易,论文使用本代码绘图,请引用:www.tcmbiohub.com
# 祝大家投稿顺利!