package com.ruoyi.project.service.impl; import java.io.IOException; import java.io.InputStream; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.util.*; import java.util.concurrent.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.alibaba.fastjson.JSONObject; import com.github.pagehelper.Page; import com.github.pagehelper.PageHelper; import com.hankcs.hanlp.HanLP; import com.hankcs.hanlp.seg.common.Term; import com.ruoyi.common.utils.SensitiveInfoDetector; import com.ruoyi.common.utils.http.HttpUtils; import com.ruoyi.common.utils.security.Md5Utils; import com.ruoyi.project.domain.*; import com.ruoyi.project.mapper.*; import com.ruoyi.project.utils.AdvancedSensitiveFilter; import com.ruoyi.project.utils.HtmlUtils; import com.ruoyi.project.utils.JiaoduiUtils; import okhttp3.OkHttpClient; import okhttp3.Request; import okhttp3.Response; import org.ahocorasick.trie.Emit; import org.ahocorasick.trie.Trie; import org.elasticsearch.action.bulk.BulkRequest; import org.elasticsearch.action.bulk.BulkResponse; import org.elasticsearch.action.index.IndexRequest; import org.elasticsearch.action.search.SearchRequest; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.client.RequestOptions; import org.elasticsearch.client.RestHighLevelClient; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.search.builder.SearchSourceBuilder; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import com.ruoyi.project.service.ITrsSiteconfigService; import com.ruoyi.common.core.text.Convert; import javax.annotation.PreDestroy; /** * 配置管理Service业务层处理 * * @author ruoyi * @date 2025-03-19 */ @Service public class TrsSiteconfigServiceImpl implements ITrsSiteconfigService { @Autowired private TrsSiteconfigMapper trsSiteconfigMapper; @Autowired private TrsArticleInfoMapper trsArticleInfoMapper; @Autowired private TrsSensitiveWordsMapper trsSensitiveWordsMapper; @Autowired private TrsSensitiveResultsMapper trsSensitiveResultsMapper; // @Autowired // private RestHighLevelClientUtils restHighLevelClientUtils; @Autowired private RestHighLevelClient client; @Autowired private TrsTechnicalResultsMapper trsTechnicalResultsMapper; @Autowired private TrsPolicyResultsMapper trsPolicyResultsMapper; @Autowired private TrsNameResultsMapper trsNameResultsMapper; @Autowired private TrsDomainResultsMapper trsDomainResultsMapper; @Autowired private TrsSensitiveWordsTenantMapper trsSensitiveWordsTenantMapper; @Autowired private TrsPolicyWordsTenantMapper trsPolicyWordsTenantMapper; @Autowired private TrsTechnicalWordsMapper trsTechnicalWordsMapper; @Autowired private TrsPolicyWordsMapper trsPolicyWordsMapper; @Autowired private TrsNameWordsMapper trsNameWordsMapper; @Autowired private TrsDomainWordsMapper trsDomainWordsMapper; @Autowired private JiaoduiUtils jiaoduiUtils; @Autowired private OkHttpClient okHttpClient; @Autowired private TrsUrlResultMapper trsUrlResultMapper; @Autowired private TrsErrorUrlMapper trsErrorUrlMapper; @Autowired private TrsPersonResultMapper trsPersonResultMapper; /** * 查询配置管理 * * @param id 配置管理主键 * @return 配置管理 */ @Override public TrsSiteconfig selectTrsSiteconfigById(Long id) { return trsSiteconfigMapper.selectTrsSiteconfigById(id); } /** * 查询配置管理列表 * * @param trsSiteconfig 配置管理 * @return 配置管理 */ @Override public List selectTrsSiteconfigList(TrsSiteconfig trsSiteconfig) { return trsSiteconfigMapper.selectTrsSiteconfigList(trsSiteconfig); } /** * 新增配置管理 * * @param trsSiteconfig 配置管理 * @return 结果 */ @Override public int insertTrsSiteconfig(TrsSiteconfig trsSiteconfig) { return trsSiteconfigMapper.insertTrsSiteconfig(trsSiteconfig); } /** * 修改配置管理 * * @param trsSiteconfig 配置管理 * @return 结果 */ @Override public int updateTrsSiteconfig(TrsSiteconfig trsSiteconfig) { return trsSiteconfigMapper.updateTrsSiteconfig(trsSiteconfig); } /** * 批量删除配置管理 * * @param ids 需要删除的配置管理主键 * @return 结果 */ @Override public int deleteTrsSiteconfigByIds(String ids) { return trsSiteconfigMapper.deleteTrsSiteconfigByIds(Convert.toStrArray(ids)); } /** * 删除配置管理信息 * * @param id 配置管理主键 * @return 结果 */ @Override public int deleteTrsSiteconfigById(Long id) { return trsSiteconfigMapper.deleteTrsSiteconfigById(id); } @Override public Integer dosysn(Long id) { int currentPage = 1; int BATCH_SIZE = 1000; while (true) { // 使用 PageHelper 分页 Page page = PageHelper.startPage(currentPage, BATCH_SIZE); // 执行查询 TrsArticleInfo trsArticleInfo = new TrsArticleInfo(); trsArticleInfo.setRecord(id); List articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo); if (articles.isEmpty()) { break; // 数据已全部处理完毕 } // 处理数据(例如插入 Elasticsearch) processArticles(articles); // 更新页码 // 重要:检查是否已经是最后一页 if (articles.size() < BATCH_SIZE) { break; // 数据已全部处理完毕 } else { currentPage++; } System.out.println("已处理 " + (currentPage - 1) * BATCH_SIZE + " 条数据"); } return 1; } private void processArticles(List articles) { BulkRequest bulkRequest = new BulkRequest(); for (TrsArticleInfo article : articles) { IndexRequest indexRequest = new IndexRequest("articles") .id(String.valueOf(article.getId())) .source( "content", article.getContent(), "page", article.getUrl(), "record", article.getRecord() ); //.source("content", article.getContent(), XContentType.JSON); bulkRequest.add(indexRequest); } try { BulkResponse bulkResponse = client.bulk(bulkRequest, RequestOptions.DEFAULT); if (bulkResponse.hasFailures()) { System.err.println("批量插入失败:" + bulkResponse.buildFailureMessage()); } } catch (IOException e) { e.printStackTrace(); } } @Override public void doscan(Long id, String type, String sitename) { // TrsSiteconfig trsSiteconfig = trsSiteconfigMapper.selectTrsSiteconfigById2(id); Map startTimeMap = new HashMap<>(); Map endTimeMap = new HashMap<>(); // 缓存敏感词的正确词和错误类型 Map correctWordMap = new HashMap<>(); Map errorTypeMap = new HashMap<>(); Map errorGradeMap = new HashMap<>(); // Map technicalErrorTypeMap = new HashMap<>(); Map policyErrorRuleMap = new HashMap<>(); Map policyErrorTypeMap = new HashMap<>(); Map nameCountryMap = new HashMap<>(); Map nameWholeMap = new HashMap<>(); Map nameShortMap = new HashMap<>(); Map nameSortMap = new HashMap<>(); Map domainMap = new HashMap<>(); loadWhitelist(); // 加载敏感词库 Trie trie = loadSensitiveWords(correctWordMap, errorTypeMap, errorGradeMap, startTimeMap, endTimeMap); // Trie technicalTrie = loadTechnicalWords(technicalErrorTypeMap); Trie policyTrie = loadPolicyWords(policyErrorRuleMap, policyErrorTypeMap, startTimeMap, endTimeMap); // Trie nameTrie = loadNameWords(nameCountryMap, nameWholeMap,nameShortMap,nameSortMap); // Trie domainTrie = loadDomainWords(domainMap); if("1".equals(type)){//第一次扫描 // 设置分页参数 int pageNum = 1; // 当前页码 int pageSize = 500; // 每页大小,根据实际情况调整 boolean hasMoreData = true; TrsSiteconfig siteconfig = new TrsSiteconfig(); siteconfig.setId(id); // List list = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig); List list = new ArrayList<>(); TrsSiteconfig trsSiteconfig = trsSiteconfigMapper.selectTrsSiteconfigById(id); list.add(trsSiteconfig); for (TrsSiteconfig config : list) { TrsArticleInfo trsArticleInfo = new TrsArticleInfo(); trsArticleInfo.setRecord(config.getRecord()); trsArticleInfo.setSitename(config.getSitename()); while (hasMoreData) { try { // 1. 设置分页参数(关键步骤) PageHelper.startPage(pageNum, pageSize); // 2. 紧跟着的第一个查询会被分页 List articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo); if (articles == null || articles.isEmpty()) { hasMoreData = false; } else { // 4. 处理当前页数据 detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, errorGradeMap, startTimeMap, endTimeMap, config); detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap, startTimeMap, endTimeMap, config); // 5. 准备查询下一页 // 重要:检查是否已经是最后一页 if (articles.size() < pageSize) { hasMoreData = false; } else { pageNum++; } } } finally { // 7. 确保每次循环后清除分页参数(重要!) PageHelper.clearPage(); } } } /*TrsArticleInfo trsArticleInfo = new TrsArticleInfo(); trsArticleInfo.setRecord(trsSiteconfig.getRecord()); List articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo); detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, errorGradeMap,startTimeMap,endTimeMap, trsSiteconfig); detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap,startTimeMap,endTimeMap, trsSiteconfig);*/ }else if("3".equals(type)){//定时任务自动更新 if("新媒体".equals(sitename)){ TrsSiteconfig siteconfig = new TrsSiteconfig(); siteconfig.setMsg("weixin"); List weixinList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig); for (TrsSiteconfig weixin : weixinList) { TrsArticleInfo trsArticleInfo = new TrsArticleInfo(); trsArticleInfo.setSitename(weixin.getSitename()); trsArticleInfo.setRecord(weixin.getRecord()); Map params = new HashMap<>(); params.put("beginTime","2025-08-01"); // params.put("beginTime","2000-08-01"); params.put("endTime","2050-04-20"); trsArticleInfo.setParams(params); List articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo); detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, errorGradeMap,startTimeMap,endTimeMap, weixin); detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap,startTimeMap,endTimeMap, weixin); } TrsSiteconfig siteconfig2 = new TrsSiteconfig(); siteconfig2.setMsg("weibo"); List weiboList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig2); for (TrsSiteconfig weixin : weiboList) { TrsArticleInfo trsArticleInfo = new TrsArticleInfo(); trsArticleInfo.setSitename(weixin.getSitename()); trsArticleInfo.setRecord(weixin.getRecord()); Map params = new HashMap<>(); params.put("beginTime","2025-08-01"); params.put("endTime","2050-04-20"); trsArticleInfo.setParams(params); List articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo); detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, errorGradeMap,startTimeMap,endTimeMap, weixin); detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap,startTimeMap,endTimeMap, weixin); } TrsSiteconfig siteconfig3 = new TrsSiteconfig(); siteconfig3.setMsg("dsp"); List dspList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig3); for (TrsSiteconfig dsp : dspList) { TrsArticleInfo trsArticleInfo = new TrsArticleInfo(); trsArticleInfo.setSitename(dsp.getSitename()); trsArticleInfo.setRecord(dsp.getRecord()); Map params = new HashMap<>(); params.put("beginTime","2025-08-01"); params.put("endTime","2050-04-20"); trsArticleInfo.setParams(params); List articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo); detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, errorGradeMap,startTimeMap,endTimeMap, dsp); detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap,startTimeMap,endTimeMap, dsp); } TrsSiteconfig siteconfig4 = new TrsSiteconfig(); siteconfig4.setMsg("toutiao"); List toutiaoList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig4); for (TrsSiteconfig toutiao : toutiaoList) { TrsArticleInfo trsArticleInfo = new TrsArticleInfo(); trsArticleInfo.setSitename(toutiao.getSitename()); trsArticleInfo.setRecord(toutiao.getRecord()); Map params = new HashMap<>(); params.put("beginTime","2025-08-01"); params.put("endTime","2050-04-20"); trsArticleInfo.setParams(params); List articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo); detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, errorGradeMap,startTimeMap,endTimeMap, toutiao); detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap,startTimeMap,endTimeMap, toutiao); } }else if("网站".equals(sitename)){ TrsSiteconfig siteconfig = new TrsSiteconfig(); siteconfig.setMsg("site"); List siteList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig); for (TrsSiteconfig site : siteList) { // 设置分页参数 int pageNum = 1; // 当前页码 int pageSize = 500; // 每页大小,根据实际情况调整 boolean hasMoreData = true; TrsArticleInfo trsArticleInfo = new TrsArticleInfo(); trsArticleInfo.setSitename(site.getSitename()); trsArticleInfo.setRecord(site.getRecord()); Map params = new HashMap<>(); params.put("beginTime","2025-08-01"); params.put("endTime","2050-04-20"); // trsArticleInfo.setTitle("习近平夫妇会见柬埔寨国王西哈莫尼和太后莫尼列"); trsArticleInfo.setParams(params); while (hasMoreData) { try { // 1. 设置分页参数(关键步骤) PageHelper.startPage(pageNum, pageSize); // 2. 紧跟着的第一个查询会被分页 List articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo); if (articles == null || articles.isEmpty()) { hasMoreData = false; } else { // 4. 处理当前页数据 detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, errorGradeMap,startTimeMap,endTimeMap, site); detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap,startTimeMap,endTimeMap, site); // 5. 准备查询下一页 // 重要:检查是否已经是最后一页 if (articles.size() < pageSize) { hasMoreData = false; } else { pageNum++; } } } finally { // 7. 确保每次循环后清除分页参数(重要!) PageHelper.clearPage(); } } } } }else if("4".equals(type)){//定时任务自动复核 if("新媒体".equals(sitename)){ TrsSiteconfig siteconfig = new TrsSiteconfig(); siteconfig.setMsg("weixin"); List weixinList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig); for (TrsSiteconfig weixin : weixinList) { fuhe(weixin.getSitename(),"新媒体"); } TrsSiteconfig siteconfig2 = new TrsSiteconfig(); siteconfig2.setMsg("weibo"); List weiboList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig2); for (TrsSiteconfig weixin : weiboList) { fuhe(weixin.getSitename(), ""); } TrsSiteconfig siteconfig3 = new TrsSiteconfig(); siteconfig3.setMsg("dsp"); List dspList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig3); for (TrsSiteconfig dsp : dspList) { fuhe(dsp.getSitename(), ""); } TrsSiteconfig siteconfig4 = new TrsSiteconfig(); siteconfig4.setMsg("toutiao"); List toutiaoList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig4); for (TrsSiteconfig toutiao : toutiaoList) { fuhe(toutiao.getSitename(), ""); } }else{ TrsSiteconfig siteconfig = new TrsSiteconfig(); siteconfig.setMsg("site"); List siteList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig); for (TrsSiteconfig site : siteList) { fuhe(site.getSitename(),""); } } }else if("2".equals(type)){//手动复核 //get请求调用第三方系统接口 String url = "http://10.0.8.14:8082/hycloud/start"; String resultStr = HttpUtils.sendGet(url); JSONObject jsonObject = JSONObject.parseObject(resultStr); if("任务执行成功".equals(jsonObject.get("msg"))){ List> articles = trsArticleInfoMapper.selectTrsArticleInfoReviewList(); for (Map article : articles) { try { String content = article.get("content")+""; String title = article.get("title")+""; String wrong_word = article.get("wrong_word")+""; String correct_word = article.get("correct_word")+""; String datakey = article.get("datakey")+""; String msg = article.get("msg")+""; String resultid = article.get("id")+""; /*Date time = null; if(article.get("time")!=null){ time = (Date) article.get("time"); }else { time = new Date(); }*/ List jiaodui = jiaoduiUtils.getJiaodui(content, title, type); boolean flag=true; TrsSensitiveResults trsresult = new TrsSensitiveResults(); trsresult.setDatakey(datakey); //trsresult.setTime(time); trsresult.setId(Long.parseLong(resultid)); for (TrsResults trsResults : jiaodui) { if(trsResults.getWrongWord().equals(wrong_word)) flag=false; if(trsResults.getWrongWord().equals(wrong_word)||trsResults.getCorrectWord().equals(correct_word)){ trsresult.setWrongWord(trsResults.getWrongWord()); trsresult.setCorrectWord(trsResults.getCorrectWord()); } } if(flag){ if("其他".equals(msg)){ trsresult.setStatus("其他"); }else { trsresult.setStatus("已处理"); } }else { trsresult.setStatus("待处理"); } trsSensitiveResultsMapper.updateTrsSensitiveResults(trsresult); }catch (Exception e){ e.printStackTrace(); } } List> articles2 = trsArticleInfoMapper.selectTrsArticleInfoReviewList2(); for (Map article : articles2) { try { String content = article.get("content")+""; String title = article.get("title")+""; String wrong_word = article.get("wrong_word")+""; String datakey = article.get("datakey")+""; String msg = article.get("msg")+""; String resultid = article.get("id")+""; /*Date time = null; if(article.get("time")!=null){ time = (Date) article.get("time"); }else { time = new Date(); }*/ List jiaodui = jiaoduiUtils.getJiaodui(content, title, type); boolean flag=true; TrsPolicyResults trsresult = new TrsPolicyResults(); trsresult.setDatakey(datakey); //trsresult.setTime(time); trsresult.setId(Long.parseLong(resultid)); for (TrsResults trsResults : jiaodui) { if(!trsResults.getWrongWord().equals(wrong_word))flag=false; } if(flag){ if("其他".equals(msg)){ trsresult.setStatus("其他"); }else { trsresult.setStatus("已处理"); } }else { trsresult.setStatus("待处理"); } trsPolicyResultsMapper.updateTrsPolicyResults(trsresult); }catch (Exception e){ e.printStackTrace(); } } } } /* // 初始化 Scroll String scrollId = initScroll(id); // 分批处理文章 while (true) { List articles = loadArticlesBatch(scrollId,id); TrsArticleInfo trsArticleInfo = new TrsArticleInfo(); trsArticleInfo.setRecord(id); List articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo); if (articles.isEmpty()) break; // 检测敏感词并生成结果 *//*List results = detect(trie, articles, correctWordMap, errorTypeMap, trsSiteconfig.getSitename()); // 保存结果到 MySQL results.forEach(result -> { trsSensitiveResultsMapper.insertTrsSensitiveResults(result); });*//* // 依次检测不同类型的敏感信息 detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, null, trsSiteconfig); // detectAndSaveResults(trsTechnicalResultsMapper, TrsTechnicalResults.class, technicalTrie, articles, null, technicalErrorTypeMap, null, null, trsSiteconfig); // detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, null, trsSiteconfig); // detectAndSaveResults(trsNameResultsMapper, TrsNameResults.class, nameTrie, articles, nameCountryMap, nameWholeMap, nameShortMap, nameSortMap, trsSiteconfig); // detectAndSaveResults(trsDomainResultsMapper, TrsDomainResults.class, domainTrie, articles, null, domainMap, null, null, trsSiteconfig); if (articles.size()<100) break; } // 清理 Scroll ClearScrollRequest clearRequest = new ClearScrollRequest(); clearRequest.addScrollId(scrollId); try { client.clearScroll(clearRequest, RequestOptions.DEFAULT); } catch (IOException e) { e.printStackTrace(); }*/ } private void fuhe(String sitename, String type) { TrsSensitiveResults trsSensitiveResults = new TrsSensitiveResults(); // trsSensitiveResults.setStatus("待复核"); trsSensitiveResults.setSitename(sitename); if("新媒体".equals(type)){ Map params = new HashMap<>(); params.put("beginTime","2025-08-01"); params.put("endTime","2050-04-20"); trsSensitiveResults.setParams(params); } List trsSensitiveResults1 = trsSensitiveResultsMapper.selectTrsSensitiveResultsListAll(trsSensitiveResults); for (TrsSensitiveResults trsSensitiveResults2 : trsSensitiveResults1) { if(!"待处理".equals(trsSensitiveResults2.getStatus())&&!"待复核".equals(trsSensitiveResults2.getStatus())) continue; /*long version = trsArticleInfoMapper.selectTrsArticleInfoMaxVersion();*/ TrsArticleInfo trsArticleInfo = new TrsArticleInfo(); /*trsArticleInfo.setRecord(trsSiteconfig.getRecord());*/ /*trsArticleInfo.setVersion(version);*/ trsArticleInfo.setTitle(trsSensitiveResults2.getTitle()); trsArticleInfo.setUrl(trsSensitiveResults2.getPage()); // List articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo); List articles = trsArticleInfoMapper.selectTrsArticleInfoFuheList(trsArticleInfo); String wrongWord = trsSensitiveResults2.getWrongWord(); if(articles.size()>0){ List jiaodui = jiaoduiUtils.getJiaodui(articles.get(0).getContent(), articles.get(0).getTitle(), "4"); boolean flag = true; for (TrsResults trsResults : jiaodui) { if(wrongWord.equals(trsResults.getWrongWord()))flag = false; } if(flag){ TrsSensitiveResults trsSensitiveResults_1 = new TrsSensitiveResults(); trsSensitiveResults_1.setStatus("已处理"); trsSensitiveResults_1.setId(trsSensitiveResults2.getId()); trsSensitiveResultsMapper.updateTrsSensitiveResults(trsSensitiveResults_1); } } } TrsPolicyResults trsPolicyResults = new TrsPolicyResults(); trsPolicyResults.setSitename(sitename); if("新媒体".equals(type)){ Map params = new HashMap<>(); params.put("beginTime","2025-08-01"); params.put("endTime","2050-04-20"); trsPolicyResults.setParams(params); } List trsPolicyResults1 = trsPolicyResultsMapper.selectTrsPolicyResultsList(trsPolicyResults); for (TrsPolicyResults trsPolicyResults2 : trsPolicyResults1) { TrsArticleInfo trsArticleInfo = new TrsArticleInfo(); /*trsArticleInfo.setRecord(trsSiteconfig.getRecord());*/ trsArticleInfo.setTitle(trsPolicyResults2.getTitle()); trsArticleInfo.setUrl(trsPolicyResults2.getPage()); List articles = trsArticleInfoMapper.selectTrsArticleInfoFuheList(trsArticleInfo); String wrongWord = trsPolicyResults2.getWrongWord(); if(articles.size()>0){ List jiaodui = jiaoduiUtils.getJiaodui(articles.get(0).getContent(), articles.get(0).getTitle(), "4"); boolean flag = true; for (TrsResults trsResults : jiaodui) { if(wrongWord.equals(trsResults.getWrongWord()))flag = false; } if(flag){ TrsPolicyResults trsSensitiveResults_1 = new TrsPolicyResults(); trsSensitiveResults_1.setStatus("已处理"); trsSensitiveResults_1.setId(trsPolicyResults2.getId()); trsPolicyResultsMapper.updateTrsPolicyResults(trsSensitiveResults_1); } } } } private void detectAndSaveResults(Object mapper, Class clazz, Trie trie, List articles, Map correctWordMap, Map errorTypeMap, Map extraMap1, Map extraMap2,Map startTimeMap,Map endTimeMap, TrsSiteconfig trsSiteconfig) { if (mapper == null) { throw new RuntimeException("Mapper 为空,无法插入数据:" + clazz.getName()); } Map selectedErrorMap = errorTypeMap; Map selectedExtraMap1 = extraMap1; Map selectedExtraMap2 = extraMap2; List results = detect(clazz, trie, articles, correctWordMap, selectedErrorMap, selectedExtraMap1, selectedExtraMap2,startTimeMap,endTimeMap, trsSiteconfig); // 通过反射调用 insert 方法 for (Object result : results) { if (mapper instanceof TrsSensitiveResultsMapper) { Long id = ((TrsSensitiveResultsMapper) mapper).selectId(((TrsSensitiveResults)result).getDatakey()); if (id != null) { TrsSensitiveResults result1 = (TrsSensitiveResults)result; result1.setId(id); ((TrsSensitiveResultsMapper) mapper).updateTrsSensitiveResults(result1); }else { TrsSensitiveResults result1 = (TrsSensitiveResults) result; result1.setStatus("待处理"); ((TrsSensitiveResultsMapper) mapper).insertTrsSensitiveResults(result1); } } else if (mapper instanceof TrsTechnicalResultsMapper) { ((TrsTechnicalResultsMapper) mapper).insertTrsTechnicalResults((TrsTechnicalResults) result); } else if (mapper instanceof TrsPolicyResultsMapper) { Long id = ((TrsPolicyResultsMapper) mapper).selectId(((TrsPolicyResults)result).getDatakey()); if (id != null) { TrsPolicyResults result1 = (TrsPolicyResults)result; result1.setId(id); ((TrsPolicyResultsMapper) mapper).updateTrsPolicyResults(result1); }else { TrsPolicyResults result1 = (TrsPolicyResults) result; result1.setStatus("待处理"); ((TrsPolicyResultsMapper) mapper).insertTrsPolicyResults(result1); } } else if (mapper instanceof TrsNameResultsMapper) { ((TrsNameResultsMapper) mapper).insertTrsNameResults((TrsNameResults) result); } else if (mapper instanceof TrsDomainResultsMapper) { ((TrsDomainResultsMapper) mapper).insertTrsDomainResults((TrsDomainResults) result); } else { throw new RuntimeException("未找到适配的 Mapper:" + mapper.getClass().getSimpleName()); } } } private List detect(Class clazz, Trie trie, List articles, Map correctWordMap, Map errorTypeMap, Map extraMap1, Map extraMap2,Map startTimeMap,Map endTimeMap, TrsSiteconfig trsSiteconfig) { List resultSet = new ArrayList<>(); for (TrsArticleInfo article : articles) { String content = article.getContent(); if (content == null) { continue; } String title = article.getTitle(); Date time = article.getTime(); Date scantime = article.getScantime(); String sitename = article.getSitename(); Long record = article.getRecord(); // resultSet.addAll(detectContent(clazz, trie, content, correctWordMap, errorTypeMap, sitename, article.getUrl())); // resultSet.addAll(detectSegmentedContent(clazz, trie, content, correctWordMap, errorTypeMap, sitename, article.getUrl())); List ts = detectContent(clazz, trie, content, correctWordMap, errorTypeMap, extraMap1, extraMap2, startTimeMap, endTimeMap, trsSiteconfig, article.getUrl(), title, time, scantime, sitename, record); resultSet.addAll(ts); System.out.println("执行几次"); // resultSet.addAll(detectSegmentedContent(clazz, trie, content, correctWordMap, errorTypeMap, extraMap1, extraMap2, trsSiteconfig, article.getUrl())); } return new ArrayList<>(resultSet); } private List detectContent(Class clazz, Trie trie, String content, Map correctWordMap, Map errorTypeMap, Map extraMap1, Map extraMap2,Map startTimeMap,Map endTimeMap, TrsSiteconfig trsSiteconfig, String url, String title, Date time, Date scantime, String sitename, Long record) { List results = new ArrayList<>(); for (Emit emit : trie.parseText(content)) { String wrongWord = emit.getKeyword(); String correctWord = ""; if(correctWordMap!=null){ correctWord = correctWordMap.getOrDefault(wrongWord, "未知"); } // 如果匹配到的词在白名单中,则跳过 if (isInWhitelist(content, emit.getStart(), emit.getEnd() + 1)) { continue; } if(!AdvancedSensitiveFilter.needsFiltering(content, wrongWord)){ continue; } // 只匹配独立的词,确保不是误匹配 Matcher matcher = Pattern.compile( Pattern.quote(wrongWord)).matcher(content); if (matcher.find()) { // 先对文章进行分词 boolean flag = true; //时效性 if(startTimeMap!=null && time!=null && startTimeMap.get(wrongWord)!=null && time.before(startTimeMap.get(wrongWord)) ){ flag = false; } if(endTimeMap!=null && time!=null && endTimeMap.get(wrongWord)!=null && time.after(endTimeMap.get(wrongWord)) ){ flag = false; } String subString = extractContext(content, emit.getStart(), emit.getEnd()); if(correctWordMap!=null){ if(subString!=null&&subString.contains(correctWord)&&subString.contains(wrongWord)&&!wrongWord.contains(correctWord)){ flag=false; } } List words = segmentText(wrongWord); for(String word:words){ if (!wrongWord.contains(word)) { flag=false; } } List subStringseg = segmentText(subString); /*if (!subStringseg.contains(wrongWord)&&!subStringseg.stream().anyMatch(words::contains)) { flag=false; }*/ if (!subStringseg.contains(wrongWord)) { if(!subStringseg.containsAll(words)) flag=false; } if(flag){ T result = createResultInstance(clazz, url, wrongWord, correctWordMap, errorTypeMap, extraMap1, extraMap2, trsSiteconfig, content, emit.getStart(), emit.getEnd(), title, time, scantime, sitename, record); results.add(result); } } } return results; } private List detectSegmentedContent(Class clazz, Trie trie, String content, Map correctWordMap, Map errorTypeMap, Map extraMap1, Map extraMap2, TrsSiteconfig trsSiteconfig, String url, String title, Date time, Date scantime, String sitename) { List results = new ArrayList<>(); // 先对文章进行分词 List words = segmentText(content); // 逐个分词进行敏感词匹配 for (String word : words) { for (Emit emit : trie.parseText(word)) { String wrongWord = emit.getKeyword(); // 如果匹配到的词在白名单中,则跳过 if (isInWhitelist(content, emit.getStart(), emit.getEnd() + 1)) { continue; } // 只匹配独立的词,确保不是误匹配 Matcher matcher = Pattern.compile("\\b" + Pattern.quote(wrongWord) + "\\b").matcher(word); if (matcher.find()) { // 确定分词的起始和结束位置 int start = content.indexOf(word); int end = start + word.length(); T result = createResultInstance(clazz, url, wrongWord, correctWordMap, errorTypeMap, extraMap1, extraMap2, trsSiteconfig, content, start, end, title, time,scantime, sitename, 99L); results.add(result); } } } return results; } private T createResultInstance(Class clazz, String url, String wrongWord, Map correctWordMap, Map errorTypeMap, Map extraMap1, Map extraMap2, TrsSiteconfig trsSiteconfig, String content, int start, int end, String title, Date time,Date scantime, String sitename, Long record) { try { T result = clazz.getDeclaredConstructor().newInstance(); if (result instanceof TrsSensitiveResults) { TrsSensitiveResults r = (TrsSensitiveResults) result; /*if("1".equals(type)){ }*/ r.setPage(url); r.setTitle(title); r.setTime(time); r.setScantime(scantime); r.setWrongWord(wrongWord); r.setCorrectWord(correctWordMap.getOrDefault(wrongWord, "")); r.setErrorType(errorTypeMap.getOrDefault(wrongWord, "")); r.setErrorGrade(extraMap2.getOrDefault(wrongWord, "一般性错误")); r.setContext(extractContext(content, start, end)); r.setAllcontext(content); r.setSitename(sitename); r.setTenantId(trsSiteconfig.getTenantId()); if(97==record){ r.setSitetag("weibo"); } else if (98==record) { r.setSitetag("weixin"); }else if (99==record) { r.setSitetag("site"); }else if (96==record) { r.setSitetag("dsp"); }else if (95==record) { r.setSitetag("toutiao"); } /*r.setSitetag(trsSiteconfig.getMsg());*/ //r.setStatus("待处理"); r.setDatakey(Md5Utils.hash(wrongWord + url )); return (T) r; } else if (result instanceof TrsTechnicalResults) { TrsTechnicalResults r = (TrsTechnicalResults) result; r.setPage(url); r.setWrongWord(wrongWord); r.setErrorType(errorTypeMap.getOrDefault(wrongWord, "")); r.setContext(extractContext(content, start, end)); r.setSitename(sitename); r.setTenantId(trsSiteconfig.getTenantId()); //r.setStatus("待处理"); return (T) r; } else if (result instanceof TrsPolicyResults) { TrsPolicyResults r = (TrsPolicyResults) result; r.setPage(url); r.setTitle(title); r.setTime(time); r.setScantime(scantime); r.setWrongWord(wrongWord); r.setErrorType(errorTypeMap.getOrDefault(wrongWord, "")); r.setErrorGrade(extraMap2.getOrDefault(wrongWord, "一般性错误")); r.setContext(extractContext(content, start, end)); r.setAllcontext(content); r.setSitename(sitename); r.setTenantId(trsSiteconfig.getTenantId()); if(97==record){ r.setSitetag("weibo"); } else if (98==record) { r.setSitetag("weixin"); }else if (99==record) { r.setSitetag("site"); } /*r.setSitetag(trsSiteconfig.getMsg());*/ //r.setStatus("待处理"); r.setWrongRule(extraMap1 != null ? extraMap1.getOrDefault(wrongWord, "") : ""); r.setDatakey(Md5Utils.hash(wrongWord + url )); return (T) r; } else if (result instanceof TrsNameResults) { TrsNameResults r = (TrsNameResults) result; r.setPage(url); r.setTitle(title); r.setTime(time); r.setScantime(scantime); r.setWrongWord(wrongWord); r.setSort(extraMap2 != null ? extraMap2.getOrDefault(wrongWord, "") : ""); r.setDutyShort(extraMap1 != null ? extraMap1.getOrDefault(wrongWord, "") : ""); r.setDutyAll(errorTypeMap != null ? errorTypeMap.getOrDefault(wrongWord, "") : ""); r.setCountry(correctWordMap != null ? correctWordMap.getOrDefault(wrongWord, "") : ""); r.setContext(extractContext(content, start, end)); r.setAllcontext(content); r.setSitename(sitename); r.setTenantId(trsSiteconfig.getTenantId()); if(97==record){ r.setSitetag("weibo"); } else if (98==record) { r.setSitetag("weixin"); }else if (99==record) { r.setSitetag("site"); } /*r.setSitetag(trsSiteconfig.getMsg());*/ //r.setStatus("待处理"); r.setDatakey(Md5Utils.hash(wrongWord + url )); return (T) r; } else if (result instanceof TrsDomainResults) { TrsDomainResults r = (TrsDomainResults) result; r.setPage(url); r.setWrongWord(wrongWord); r.setDomainType(errorTypeMap != null ? errorTypeMap.getOrDefault(wrongWord, "") : ""); r.setContext(extractContext(content, start, end)); r.setSitename(sitename); r.setTenantId(trsSiteconfig.getTenantId()); //r.setStatus("待处理"); return (T) r; } else { throw new RuntimeException("未支持的结果类型:" + clazz.getName()); } /*return result;*/ } catch (Exception e) { throw new RuntimeException("无法创建实例:" + clazz.getName(), e); } } public List segmentText(String text) { List termList = HanLP.segment(text); List words = new ArrayList<>(); for (Term term : termList) { words.add(term.word); } return words; } // 检测敏感词并生成结果 /* public List detect(Trie trie, List articles, Map correctWordMap, Map errorTypeMap, String sitename) { Set resultSet = new HashSet<>(); // 使用 Set 去重 for (TrsArticleInfo article : articles) { String content = article.getContent(); // 1. 先进行整体匹配 resultSet.addAll(detectWholeContent(trie, content, correctWordMap, errorTypeMap, sitename, article.getUrl())); // 2. 再进行分词后匹配 resultSet.addAll(detectSegmentedContent(trie, content, correctWordMap, errorTypeMap, sitename, article.getUrl())); } return new ArrayList<>(resultSet); // 转换为 List 返回 } *//** * 整体匹配:直接对文章内容进行敏感词匹配 *//* private List detectWholeContent(Trie trie, String content, Map correctWordMap, Map errorTypeMap, String sitename, String url) { List results = new ArrayList<>(); for (Emit emit : trie.parseText(content)) { String wrongWord = emit.getKeyword(); // 判断错词是否在白名单短语里 if (isInWhitelist(content, emit.getStart(), emit.getEnd() + 1)) { continue; // 跳过匹配 } // 使用正则表达式匹配独立的词 String regex = "\\b" + Pattern.quote(wrongWord) + "\\b"; Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(content); // 如果匹配到独立的词 if (matcher.find()) { TrsSensitiveResults trsSensitiveResults = new TrsSensitiveResults(); trsSensitiveResults.setPage(url); trsSensitiveResults.setWrongWord(wrongWord); trsSensitiveResults.setCorrectWord(correctWordMap.get(wrongWord)); trsSensitiveResults.setErrorType(errorTypeMap.get(wrongWord)); trsSensitiveResults.setContext(extractContext(content, emit.getStart(), emit.getEnd())); trsSensitiveResults.setSitename(sitename); trsSensitiveResults.setStatus("待处理"); results.add(trsSensitiveResults); } } return results; } *//** * 分词后匹配:对文章内容进行分词,然后对每个分词结果进行敏感词匹配 *//* private List detectSegmentedContent(Trie trie, String content, Map correctWordMap, Map errorTypeMap, String sitename, String url) { List results = new ArrayList<>(); // 对内容进行分词 List words = segmentText(content); // 对每个分词结果进行敏感词匹配 for (String word : words) { for (Emit emit : trie.parseText(word)) { String wrongWord = emit.getKeyword(); // 判断错词是否在白名单短语里 if (isInWhitelist(content, emit.getStart(), emit.getEnd() + 1)) { continue; // 跳过匹配 } // 使用正则表达式匹配独立的词 String regex = "\\b" + Pattern.quote(wrongWord) + "\\b"; Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(word); // 如果匹配到独立的词 if (matcher.find()) { // 找到分词结果在原始内容中的位置 int start = content.indexOf(word); // 分词结果在原始内容中的起始位置 int end = start + word.length(); // 分词结果在原始内容中的结束位置 TrsSensitiveResults trsSensitiveResults = new TrsSensitiveResults(); trsSensitiveResults.setPage(url); trsSensitiveResults.setWrongWord(wrongWord); trsSensitiveResults.setCorrectWord(correctWordMap.get(wrongWord)); trsSensitiveResults.setErrorType(errorTypeMap.get(wrongWord)); // trsSensitiveResults.setContext(extractContext(word, emit.getStart(), emit.getEnd())); trsSensitiveResults.setContext(extractContext(content, start, end)); // 从原始内容中截取上下文 trsSensitiveResults.setSitename(sitename); trsSensitiveResults.setStatus("待处理"); results.add(trsSensitiveResults); } } } return results; }*/ // 提取上下文(示例:前后 20 字符) private String extractContext(String text, int start, int end) { int contextSize = 20; int contextStart = Math.max(0, start - contextSize); int contextEnd = Math.min(text.length(), end + contextSize); return text.substring(contextStart, contextEnd); } public void doArticleInfo(Long id) { } // 初始化 Scroll 查询 public String initScroll(Long id) { SearchRequest searchRequest = new SearchRequest("articles"); SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery(); boolQueryBuilder.filter(QueryBuilders.termQuery("record", id)); sourceBuilder.query(boolQueryBuilder); // sourceBuilder.query(QueryBuilders.matchAllQuery()); sourceBuilder.size(100); // 每批加载 1000 篇文章 // sourceBuilder.timeout(TimeValue.timeValueSeconds(30)); // 增加超时时间 searchRequest.source(sourceBuilder); searchRequest.scroll(TimeValue.timeValueMinutes(1L)); SearchResponse response = null; try { response = client.search(searchRequest, RequestOptions.DEFAULT); } catch (IOException e) { throw new RuntimeException(e); } return response.getScrollId(); } // 从 MySQL 加载敏感词库 public Trie loadSensitiveWords(Map correctWordMap, Map errorTypeMap, Map errorGradeMap, Map startTimeMap, Map endTimeMap) { /* java Aho-Corasick算法 Aho-Corasick算法是一种用于多模式字符串匹配的算法,它可以同时在一个文本字符串中查找多个模式字符串。该算法由Alfred V. Aho和Margaret J. Corasick在1975年共同发明。相比于传统的多模式匹配方法(如KMP算法),Aho-Corasick算法在处理多个模式时具有更高的效率。 Aho-Corasick算法的工作原理 构建Trie树:首先,将所有模式字符串构建成一个Trie树(前缀树)。 构建失败链接:在Trie树上添加失败链接(也称为失效链接或故障链接),使得在匹配过程中遇到非匹配字符时可以跳转到Trie树上的其他节点,从而减少不必要的回溯。 输出链接:对于每个节点,标记其所有子节点对应的模式字符串的结束位置。这样,在遍历文本字符串时,一旦到达某个节点的输出链接,就意味着在该位置找到了一个模式字符串。 匹配过程:在文本字符串上从左到右进行扫描,利用Trie树和失败链接进行匹配。如果在某个节点上找到了一个模式字符串的结束,就记录下这个位置。 */ Trie.TrieBuilder builder = Trie.builder(); TrsSensitiveWords trsSensitiveWords = new TrsSensitiveWords(); trsSensitiveWords.setStatus("已通过"); List sensitiveWords = trsSensitiveWordsMapper.selectTrsSensitiveWordsList(trsSensitiveWords); for (TrsSensitiveWords sensitiveWord : sensitiveWords) { String wrongWord = sensitiveWord.getWrongWord(); builder.addKeyword(wrongWord); correctWordMap.put(wrongWord, sensitiveWord.getCorrectWord()); errorTypeMap.put(wrongWord, sensitiveWord.getErrorType()); errorGradeMap.put(wrongWord, sensitiveWord.getErrorGrade()); startTimeMap.put(wrongWord, sensitiveWord.getEffectstart()); endTimeMap.put(wrongWord, sensitiveWord.getEffectend()); } TrsSensitiveWordsTenant trsSensitiveWordsTenant = new TrsSensitiveWordsTenant(); trsSensitiveWordsTenant.setStatus("已通过"); List sensitiveWordsTenants = trsSensitiveWordsTenantMapper.selectTrsSensitiveWordsList(trsSensitiveWordsTenant); for (TrsSensitiveWordsTenant sensitiveWord : sensitiveWordsTenants) { String wrongWord = sensitiveWord.getWrongWord(); builder.addKeyword(wrongWord); correctWordMap.put(wrongWord, sensitiveWord.getCorrectWord()); errorTypeMap.put(wrongWord, sensitiveWord.getErrorType()); errorGradeMap.put(wrongWord, sensitiveWord.getErrorGrade()); startTimeMap.put(wrongWord, sensitiveWord.getEffectstart()); endTimeMap.put(wrongWord, sensitiveWord.getEffectend()); } return builder.build(); } public Trie loadTechnicalWords(Map errorTypeMap) { Trie.TrieBuilder builder = Trie.builder(); TrsTechnicalWords results = new TrsTechnicalWords(); results.setStatus("已通过"); List results1 = trsTechnicalWordsMapper.selectTrsTechnicalWordsList(results); for (TrsTechnicalWords sensitiveWord : results1) { String wrongWord = sensitiveWord.getWrongWord(); builder.addKeyword(wrongWord); errorTypeMap.put(wrongWord, sensitiveWord.getErrorType()); } return builder.build(); } public Trie loadPolicyWords(Map policyErrorRuleMap, Map policyErrorTypeMap, Map startTimeMap, Map endTimeMap) { Trie.TrieBuilder builder = Trie.builder(); TrsPolicyWords results = new TrsPolicyWords(); results.setStatus("已通过"); List results1 = trsPolicyWordsMapper.selectTrsPolicyWordsList(results); for (TrsPolicyWords sensitiveWord : results1) { String wrongWord = sensitiveWord.getWrongWord(); builder.addKeyword(wrongWord); policyErrorRuleMap.put(wrongWord, sensitiveWord.getErrorRule()); policyErrorTypeMap.put(wrongWord, sensitiveWord.getErrorType()); startTimeMap.put(wrongWord, sensitiveWord.getEffectstart()); endTimeMap.put(wrongWord, sensitiveWord.getEffectend()); } TrsPolicyWordsTenant trsPolicyWordsTenant = new TrsPolicyWordsTenant(); trsPolicyWordsTenant.setStatus("已通过"); List trsPolicyWordsTenants = trsPolicyWordsTenantMapper.selectTrsPolicyWordsList(trsPolicyWordsTenant); for (TrsPolicyWordsTenant sensitiveWord : trsPolicyWordsTenants) { String wrongWord = sensitiveWord.getWrongWord(); builder.addKeyword(wrongWord); policyErrorRuleMap.put(wrongWord, sensitiveWord.getErrorRule()); policyErrorTypeMap.put(wrongWord, sensitiveWord.getErrorType()); startTimeMap.put(wrongWord, sensitiveWord.getEffectstart()); endTimeMap.put(wrongWord, sensitiveWord.getEffectend()); } return builder.build(); } public Trie loadNameWords(Map nameCountryMap, Map nameWholeMap, Map nameShortMap, Map nameSortMap) { Trie.TrieBuilder builder = Trie.builder(); TrsNameWords results = new TrsNameWords(); results.setStatus("已通过"); List results1 = trsNameWordsMapper.selectTrsNameWordsList(results); for (TrsNameWords sensitiveWord : results1) { String wrongWord = sensitiveWord.getWrongWord(); builder.addKeyword(wrongWord); nameCountryMap.put(wrongWord, sensitiveWord.getCountry()); nameWholeMap.put(wrongWord, sensitiveWord.getDutyAll()); nameShortMap.put(wrongWord, sensitiveWord.getDutyShort()); nameSortMap.put(wrongWord, sensitiveWord.getSort()); } return builder.build(); } public Trie loadDomainWords(Map domainMap) { Trie.TrieBuilder builder = Trie.builder(); TrsDomainWords results = new TrsDomainWords(); results.setStatus("已通过"); List results1 = trsDomainWordsMapper.selectTrsDomainWordsList(results); for (TrsDomainWords sensitiveWord : results1) { String wrongWord = sensitiveWord.getWrongWord(); builder.addKeyword(wrongWord); domainMap.put(wrongWord, sensitiveWord.getDomainType()); } return builder.build(); } // 加载一批文章 /* public List loadArticlesBatch(String scrollId, Long id) { List articles = new ArrayList<>(); SearchScrollRequest scrollRequest = new SearchScrollRequest(scrollId); scrollRequest.scroll(TimeValue.timeValueMinutes(1L)); SearchResponse response = null; try { response = client.scroll(scrollRequest, RequestOptions.DEFAULT); } catch (IOException e) { e.printStackTrace(); } //通过response获取命中的数量 System.out.println("response获取命中的数量:" + response.getHits().getTotalHits().value); long totalHits = response.getHits().getTotalHits().value; if (totalHits < 100 && totalHits>0) { SearchRequest searchRequest = new SearchRequest("articles"); SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery(); boolQueryBuilder.filter(QueryBuilders.termQuery("record", id)); searchRequest.source(sourceBuilder); sourceBuilder.query(boolQueryBuilder); try { response = client.search(searchRequest, RequestOptions.DEFAULT); } catch (IOException e) { e.printStackTrace(); } } for (SearchHit hit : response.getHits().getHits()) { Map source = hit.getSourceAsMap(); TrsArticleInfo article =new TrsArticleInfo(); article.setContent((String) source.get("content")); article.setUrl((String) source.get("page")); articles.add(article); } return articles; }*/ private Set whitelistPhrases = new HashSet<>(); public void loadWhitelist() { // 这里可以从数据库或配置文件加载白名单 /*whitelistPhrases.add("内蒙古自治区"); whitelistPhrases.add("中国银行"); whitelistPhrases.add("中华人民共和国");*/ TrsSensitiveWords trsSensitiveWords = new TrsSensitiveWords(); trsSensitiveWords.setFlag(0); trsSensitiveWordsMapper.selectTrsSensitiveWordsListBy(new TrsSensitiveWords()).forEach(sensitiveWord -> { whitelistPhrases.add(sensitiveWord.getWrongWord()); }); } private boolean isInWhitelist(String text, int start, int end) { for (String phrase : whitelistPhrases) { int phraseStart = start - (phrase.length() - (end - start)); // 计算完整短语的起点 if (phraseStart >= 0 && phraseStart + phrase.length() <= text.length()) { String subText = text.substring(phraseStart, phraseStart + phrase.length()); if (whitelistPhrases.contains(subText)) { return true; // 如果匹配到白名单短语,则跳过 } } } return false; } @Override public void docon(Long id) { //TrsSiteconfig trsSiteconfig = trsSiteconfigMapper.selectTrsSiteconfigById(id); //loadAllowedDomains(trsSiteconfig.getDomain()); loadTrsErrorUrls(); TrsSiteconfig siteconfig = new TrsSiteconfig(); List siteList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig); for (TrsSiteconfig site : siteList) { // 设置分页参数 int pageNum = 1; // 当前页码 int pageSize = 500; // 每页大小,根据实际情况调整 boolean hasMoreData = true; TrsArticleInfo trsArticleInfo = new TrsArticleInfo(); trsArticleInfo.setSitename(site.getSitename()); trsArticleInfo.setRecord(site.getRecord()); while (hasMoreData) { try { // 1. 设置分页参数(关键步骤) PageHelper.startPage(pageNum, pageSize); // 2. 紧跟着的第一个查询会被分页 List articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo); if (articles == null || articles.isEmpty()) { hasMoreData = false; } else { // 4. 处理当前页数据 processUrls(articles,site,id); // 5. 准备查询下一页 // 重要:检查是否已经是最后一页 if (articles.size() < pageSize) { hasMoreData = false; } else { pageNum++; } } } finally { // 7. 确保每次循环后清除分页参数(重要!) PageHelper.clearPage(); } } } } private Set TRS_ERROR_URLS = new HashSet<>(); private void loadTrsErrorUrls() { TrsErrorUrl trsErrorUrl = new TrsErrorUrl(); List trsErrorUrls = trsErrorUrlMapper.selectTrsErrorUrlList(trsErrorUrl); for (TrsErrorUrl url : trsErrorUrls) { TRS_ERROR_URLS.add(url.getErrorurl()); } } private boolean isErrorLink(String url) { try { if (url == null || url.isEmpty()) { return false; } URL parsedUrl = new URL(url); String host = parsedUrl.getHost(); return TRS_ERROR_URLS.contains(host); } catch (Exception e) { e.printStackTrace(); for (String url2 : TRS_ERROR_URLS) { if (url.contains(url2)) { return true; } } return false; } } private Set ALLOWED_DOMAINS = new HashSet<>(); private ArrayList ALLOWED_DOMAINS2 = new ArrayList<>(); public void loadAllowedDomains(String domain) { ALLOWED_DOMAINS.add(domain); ALLOWED_DOMAINS2.add(domain); } /** * 判断 URL 是否是外链 * * @param url URL 地址 * @param domain * @return true 是外链,false 不是外链 */ private boolean isExternalLink(String url, String domain) { try { if (url == null || url.isEmpty()) { return false; } if ("mp.weixin.qq.com".equals(domain)&&(url.contains("res.wx.qq.com")||url.contains("captcha.gtimg.com"))) { return false; } URL parsedUrl = new URL(url); String host = parsedUrl.getHost(); // return !ALLOWED_DOMAINS.contains(host); return !host.contains(domain); } catch (Exception e) { e.printStackTrace(); return true; // 如果解析失败,默认认为是外链 } } /** * 判断 URL 是否是错断链 * * @param url URL 地址 * @return true 是错断链,false 不是错断链 */ public boolean isBrokenLink(String url) { Request request = new Request.Builder() .url(url) .head() .build(); try (Response response = okHttpClient.newCall(request).execute()) { int responseCode = response.code(); // return responseCode == 404 || responseCode >= 400; return responseCode == 404 || responseCode == 500 || responseCode == 502; } catch (IOException e) { return false; } } public String fetchHtmlContent(String url) { Request request = new Request.Builder() .url(url) .get() // 使用 GET 请求(默认方法,可省略) .build(); try (Response response = okHttpClient.newCall(request).execute()) { if (!response.isSuccessful()) { throw new IOException("请求失败,HTTP 状态码: " + response.code()); } // 返回 HTML 内容(假设响应是文本/HTML) return response.body().string(); } catch (IOException e) { e.printStackTrace(); } return null; } /* private boolean isBrokenLink(String url) { HttpURLConnection connection = null; try { URL parsedUrl = new URL(url); // Configure to ignore SSL verification for HTTPS URLs if (parsedUrl.getProtocol().equalsIgnoreCase("https")) { // Create a trust manager that does not validate certificate chains TrustManager[] trustAllCerts = new TrustManager[]{ new X509TrustManager() { public X509Certificate[] getAcceptedIssuers() { return new X509Certificate[0]; } public void checkClientTrusted(X509Certificate[] certs, String authType) { } public void checkServerTrusted(X509Certificate[] certs, String authType) { } } }; // Install the all-trusting trust manager SSLContext sc = SSLContext.getInstance("SSL"); sc.init(null, trustAllCerts, new java.security.SecureRandom()); HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory()); // Create all-trusting host name verifier HostnameVerifier allHostsValid = (hostname, session) -> true; HttpsURLConnection.setDefaultHostnameVerifier(allHostsValid); } connection = (HttpURLConnection) parsedUrl.openConnection(); connection.setRequestMethod("HEAD"); // Use HEAD method to only get headers connection.setConnectTimeout(60000); // Set connection timeout connection.setReadTimeout(60000); // Set read timeout connection.setInstanceFollowRedirects(true); // Follow redirects int responseCode = connection.getResponseCode(); if(responseCode == HttpURLConnection.HTTP_NOT_FOUND || responseCode >= 400){ System.out.println(1111111); } return responseCode == HttpURLConnection.HTTP_NOT_FOUND || responseCode >= 400; } catch (IOException e) { return false; // If request fails, consider it a broken link } catch (Exception e) { return false; // For any other exception, consider it a broken link } finally { if (connection != null) { connection.disconnect(); // Close connection } } }*/ /** * 获取 URL 的内容 * * @param url URL 地址 * @return URL 的内容,如果获取失败则返回 null */ private String getUrlContent(String url) { HttpURLConnection connection = null; try { URL parsedUrl = new URL(url); connection = (HttpURLConnection) parsedUrl.openConnection(); connection.setRequestMethod("GET"); // 使用 GET 方法获取内容 connection.setConnectTimeout(5000); // 设置连接超时 connection.setReadTimeout(5000); // 设置读取超时 int responseCode = connection.getResponseCode(); if (responseCode == HttpURLConnection.HTTP_OK) { InputStream inputStream = connection.getInputStream(); Scanner scanner = new Scanner(inputStream).useDelimiter("\\A"); return scanner.hasNext() ? scanner.next() : null; } else { return null; // 如果状态码不是 200,返回 null } } catch (IOException e) { e.printStackTrace(); return null; // 如果请求失败,返回 null } finally { if (connection != null) { connection.disconnect(); // 关闭连接 } } } /** * 处理单个 URL * * @param * @param id * @return 处理结果 */ private String processSingleUrl(TrsArticleInfo trsArticleInfo, TrsSiteconfig siteconfig, Long id) { TrsUrlResult trsUrlResult = new TrsUrlResult(); trsUrlResult.setUrl(trsArticleInfo.getUrl()); trsUrlResult.setRecord(trsArticleInfo.getRecord()); trsUrlResult.setSitename(trsArticleInfo.getSitename()); trsUrlResult.setTitle(trsArticleInfo.getTitle()); trsUrlResult.setSitetag(siteconfig.getMsg()); trsUrlResult.setMsg("-"); List trsUrlResults = trsUrlResultMapper.selectTrsUrlResultList2(trsUrlResult); boolean isExist = trsUrlResults.size() < 1; if(isExist && isErrorLink(trsArticleInfo.getUrl())){ trsUrlResult.setTenantId(siteconfig.getTenantId()); trsUrlResult.setTime(new Date()); trsUrlResult.setLinkurl(trsArticleInfo.getUrl()); trsUrlResult.setType("敏感链接"); trsUrlResultMapper.insertTrsUrlResult(trsUrlResult); } else if (isExist && isBrokenLink(trsArticleInfo.getUrl())) { trsUrlResult.setTenantId(siteconfig.getTenantId()); trsUrlResult.setTime(new Date()); trsUrlResult.setLinkurl(trsArticleInfo.getUrl()); trsUrlResult.setType("错断链"); trsUrlResultMapper.insertTrsUrlResult(trsUrlResult); return "错断链: " + trsArticleInfo.getUrl(); }else if (isExist && isExternalLink(trsArticleInfo.getUrl(),siteconfig.getDomain())) { trsUrlResult.setTenantId(siteconfig.getTenantId()); trsUrlResult.setTime(new Date()); trsUrlResult.setLinkurl(trsArticleInfo.getUrl()); trsUrlResult.setType("外链"); trsUrlResultMapper.insertTrsUrlResult(trsUrlResult); return "外链: " + trsArticleInfo.getUrl(); } else { // String content = getUrlContent(trsArticleInfo.getUrl()); // String content = trsArticleInfo.getContent(); String content = ""; if(id != null && 100 == id){ content = fetchHtmlContent(trsArticleInfo.getUrl()); }else { content = trsArticleInfo.getContent(); } if (content != null && !"".equals(content)) { for (String link : TRS_ERROR_URLS) { if(content.contains(link)){ trsUrlResult.setTenantId(siteconfig.getTenantId()); trsUrlResult.setMsg("-"); trsUrlResult.setType("敏感链接"); trsUrlResult.setLinkurl(link); List trsUrlResults2 = trsUrlResultMapper.selectTrsUrlResultList2(trsUrlResult); boolean isExist2 = trsUrlResults2.size() < 1; if(isExist2) trsUrlResultMapper.insertTrsUrlResult(trsUrlResult); } } String contentNoHtml = HtmlUtils.removeAllTags(content); if(SensitiveInfoDetector.containsIdCard(contentNoHtml)){ for (String idCard : SensitiveInfoDetector.extractIdCards(contentNoHtml)){ if(SensitiveInfoDetector.extractContext2(contentNoHtml, idCard).contains("."))continue; TrsPersonResult trsPersonResult = new TrsPersonResult(); trsPersonResult.setPage(trsArticleInfo.getUrl()); trsPersonResult.setSitename(trsArticleInfo.getSitename()); trsPersonResult.setSitetag(siteconfig.getMsg()); trsPersonResult.setTitle(trsArticleInfo.getTitle()); trsPersonResult.setWrongWord(idCard); trsPersonResult.setWrongType("身份证号码"); trsPersonResult.setStatus("待处理"); trsPersonResult.setContext(SensitiveInfoDetector.extractContext(contentNoHtml, idCard)); trsPersonResult.setAllcontext(content); trsPersonResult.setDatakey(Md5Utils.hash(idCard + trsArticleInfo.getUrl())); trsPersonResult.setTenantId(siteconfig.getTenantId()); List trsUrlResults2 = trsPersonResultMapper.selectTrsPersonResultList2(trsPersonResult); trsPersonResult.setTime(new Date()); boolean isExist2 = trsUrlResults2.size() < 1; if(isExist2) trsPersonResultMapper.insertTrsPersonResult(trsPersonResult); } } if(SensitiveInfoDetector.containsPhone(contentNoHtml)){ for (String idCard : SensitiveInfoDetector.extractPhones(contentNoHtml)){ if(SensitiveInfoDetector.extractContext2(contentNoHtml, idCard).contains("."))continue; TrsPersonResult trsPersonResult = new TrsPersonResult(); trsPersonResult.setPage(trsArticleInfo.getUrl()); trsPersonResult.setSitename(trsArticleInfo.getSitename()); trsPersonResult.setSitetag(siteconfig.getMsg()); trsPersonResult.setTitle(trsArticleInfo.getTitle()); trsPersonResult.setWrongWord(idCard); trsPersonResult.setWrongType("手机号码"); trsPersonResult.setStatus("待处理"); trsPersonResult.setContext(SensitiveInfoDetector.extractContext(contentNoHtml, idCard)); trsPersonResult.setAllcontext(content); trsPersonResult.setDatakey(Md5Utils.hash(idCard + trsArticleInfo.getUrl())); trsPersonResult.setTenantId(siteconfig.getTenantId()); List trsUrlResults2 = trsPersonResultMapper.selectTrsPersonResultList2(trsPersonResult); trsPersonResult.setTime(new Date()); boolean isExist2 = trsUrlResults2.size() < 1; if(isExist2) trsPersonResultMapper.insertTrsPersonResult(trsPersonResult); } } if(SensitiveInfoDetector.containsBankCard(contentNoHtml)){ for (String idCard : SensitiveInfoDetector.extractBankCards(contentNoHtml)){ if(SensitiveInfoDetector.extractContext2(contentNoHtml, idCard).contains("."))continue; TrsPersonResult trsPersonResult = new TrsPersonResult(); trsPersonResult.setPage(trsArticleInfo.getUrl()); trsPersonResult.setSitename(trsArticleInfo.getSitename()); trsPersonResult.setSitetag(siteconfig.getMsg()); trsPersonResult.setTitle(trsArticleInfo.getTitle()); trsPersonResult.setWrongWord(idCard); trsPersonResult.setWrongType("银行卡号"); trsPersonResult.setStatus("待处理"); trsPersonResult.setContext(SensitiveInfoDetector.extractContext(contentNoHtml, idCard)); trsPersonResult.setAllcontext(content); trsPersonResult.setDatakey(Md5Utils.hash(idCard + trsArticleInfo.getUrl())); trsPersonResult.setTenantId(siteconfig.getTenantId()); List trsUrlResults2 = trsPersonResultMapper.selectTrsPersonResultList2(trsPersonResult); trsPersonResult.setTime(new Date()); boolean isExist2 = trsUrlResults2.size() < 1; if(isExist2) trsPersonResultMapper.insertTrsPersonResult(trsPersonResult); } } // 检查内容中的链接 checkContentLinks(trsArticleInfo, content, trsUrlResult, siteconfig.getDomain(), siteconfig.getTenantId()); //更新到数据库中 // trsArticleInfo.setContent(content); // trsArticleInfoMapper.updateTrsArticleInfo(trsArticleInfo); return "URL 内容: " + content.substring(0, Math.min(content.length(), 100)) + "..."; } else { return "无法获取 URL 内容: " + trsArticleInfo.getUrl(); } } return ""; } /** * 多线程处理 URL 列表 * * @param urls URL 列表 * @return 处理结果列表 */ // 线程池 private final ExecutorService executorService = Executors.newFixedThreadPool(1); public ConcurrentHashMap processUrls(List trsArticleInfos, TrsSiteconfig siteconfig, Long id) { ConcurrentHashMap resultMap = new ConcurrentHashMap<>(); CountDownLatch latch = new CountDownLatch(trsArticleInfos.size()); // 用于等待所有任务完成 // 提交任务到线程池 for (TrsArticleInfo articleInfo : trsArticleInfos) { executorService.submit(() -> { try { String result = processSingleUrl(articleInfo,siteconfig,id); resultMap.put(articleInfo.getUrl(), result); }catch (Exception e){ e.printStackTrace(); }finally { latch.countDown(); // 任务完成,计数器减一 } }); } try { latch.await(); // 等待所有任务完成 } catch (InterruptedException e) { e.printStackTrace(); } return resultMap; } /** * 关闭线程池 */ @PreDestroy public void shutdown() { executorService.shutdown(); try { if (!executorService.awaitTermination(1, TimeUnit.MINUTES)) { executorService.shutdownNow(); } } catch (InterruptedException e) { executorService.shutdownNow(); } } private List extractLinksFromContent(String content) { List links = new ArrayList<>(); // 正则表达式匹配href和src链接 Pattern pattern = Pattern.compile("(href|src)=\"([^\"]*)\""); Matcher matcher = pattern.matcher(content); while (matcher.find()) { String link = matcher.group(2); if (link != null && !link.isEmpty() && !link.startsWith("#") && !link.startsWith("javascript:")&& !link.equals("./")) { links.add(link); } } return links; } private void checkContentLinks(TrsArticleInfo articleInfo, String content, TrsUrlResult trsUrlResult, String domain, String tenantId) { List links = extractLinksFromContent(content); for (String link : links) { try { // 处理相对路径 String absoluteLink = makeAbsoluteUrl(articleInfo.getUrl(), link); trsUrlResult.setMsg("-"); trsUrlResult.setLinkurl(absoluteLink); List trsUrlResults = trsUrlResultMapper.selectTrsUrlResultList2(trsUrlResult); boolean isExist = trsUrlResults.size() < 1; trsUrlResult.setTime(new Date()); if(isExist&&isErrorLink(absoluteLink)){ trsUrlResult.setTenantId(tenantId); trsUrlResult.setType("敏感链接"); trsUrlResultMapper.insertTrsUrlResult(trsUrlResult); } else if (isExist&&isBrokenLink(absoluteLink)) { // 记录错断链 trsUrlResult.setTenantId(tenantId); trsUrlResult.setType("错断链"); if(absoluteLink.contains("exPlay")||absoluteLink.contains("mp4"))continue; if(absoluteLink.contains("beian"))continue; if(absoluteLink.contains("img.henan"))continue; if(absoluteLink.contains("baike.baidu.com"))continue; trsUrlResultMapper.insertTrsUrlResult(trsUrlResult); //recordLinkIssue(articleInfo, absoluteLink, "错断链"); }else if (isExist&&isExternalLink(absoluteLink, domain)) { // 记录外链 trsUrlResult.setType("外链"); trsUrlResultMapper.insertTrsUrlResult(trsUrlResult); //recordLinkIssue(articleInfo, absoluteLink, "外链"); } } catch (Exception e) { // 记录无效链接 recordLinkIssue(articleInfo, link, "无效链接"); } } } private String makeAbsoluteUrl(String baseUrl, String link) throws MalformedURLException { if (link.startsWith("http://") || link.startsWith("https://")) { return link; } URL base = null; try { base = new URL(baseUrl); } catch (MalformedURLException e) { throw new RuntimeException(e); } return new URL(base, link).toString(); } private void recordLinkIssue(TrsArticleInfo articleInfo, String link, String issueType) { // 这里可以记录到数据库或日志中 System.out.println("文章ID: " + articleInfo.getId() + ", URL: " + articleInfo.getUrl() + ", 问题链接: " + link + ", 问题类型: " + issueType); // 如果需要保存到数据库,可以创建一个新的表来存储这些信息 // linkIssueMapper.insert(new LinkIssue(articleInfo.getId(), link, issueType)); } }