# Remove every visible (non-dot-prefixed) object from the current environment
# so the script starts from a clean workspace.
# NOTE(review): when this file is source()'d into a live session, this also
# deletes whatever the caller had defined.
rm(list = ls())
# The relative file names used later in the script resolve against this
# directory.
# NOTE(review): hard-coded, machine-specific Windows path; setwd() raises an
# error if the directory does not exist.
setwd("C:\\Users\\Administrator\\Desktop\\machine learning\\Network") # set the working directory
library(dplyr)
library(tidyr)
library(readr)# 读取文件
# ---- Split taxonomy strings into one column per rank ----
# Reads one lineage string per line from '1.txt' (for example
# "k__Bacteria|p__Firmicutes|c__Bacilli"), extracts the name that follows
# each rank prefix, and writes a tab-separated table with the original
# string plus one column per rank. Ranks that do not occur in a line are
# reported as "unclassified".
#
# NOTE(review): `data` and `levels` shadow base R functions of the same
# name; the names are kept in case other code refers to them.
data <- readLines('1.txt')

# Rank prefixes as they appear in the input and the matching column names.
prefixes <- c("k__", "p__", "c__", "o__", "f__", "g__", "s__")
levels <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")

# One pattern per rank, built once instead of once per input line.
# The look-behind rejects a prefix that is glued to a preceding letter,
# digit or underscore, so text inside a taxon name is never mistaken for a
# rank marker. The prefixes contain no regex metacharacters, so they can be
# pasted into the pattern unescaped.
patterns <- paste0("(?<![A-Za-z0-9_])", prefixes, "([^|]+)")

# For every input line, return a character vector of length
# 1 + length(levels): the original line followed by the seven rank names.
split_data <- lapply(data, function(line) {
  # Start with every rank marked as missing.
  classification <- setNames(rep("unclassified", length(levels)), levels)

  for (i in seq_along(prefixes)) {
    # regexpr() reports only the first match, so exactly one value is
    # stored per rank even if the prefix occurs more than once.
    hit <- regexpr(patterns[i], line, perl = TRUE)
    if (hit != -1L) {
      matched <- regmatches(line, hit)
      # Drop only the leading prefix; the rest of the name is kept as is.
      classification[levels[i]] <- substring(matched, nchar(prefixes[i]) + 1L)
    }
  }

  c(line, classification)
})

# Assemble the per-line vectors into a character matrix, one row per line.
# Building it with matrix() keeps the result two-dimensional for a single
# input line and for an empty input file (zero rows, header only).
result_df <- matrix(
  as.character(unlist(split_data, use.names = FALSE)),
  ncol = length(levels) + 1L,
  byrow = TRUE,
  dimnames = list(NULL, c("Original_Classification", levels))
)

# The matrix is already of type character, so no conversion is needed.
# (apply(..., 2, as.character) would collapse a one-row matrix to a vector.)
result_matrix <- result_df

# Write the table, including the header row, as tab-separated text.
write.table(
  result_matrix,
  'path_to_output_file.txt',
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)