| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829 |
- package com.ruoyi.project.service.impl;
- import java.io.IOException;
- import java.io.InputStream;
- import java.net.HttpURLConnection;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.util.*;
- import java.util.concurrent.*;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import com.alibaba.fastjson.JSONObject;
- import com.github.pagehelper.Page;
- import com.github.pagehelper.PageHelper;
- import com.hankcs.hanlp.HanLP;
- import com.hankcs.hanlp.seg.common.Term;
- import com.ruoyi.common.utils.SensitiveInfoDetector;
- import com.ruoyi.common.utils.http.HttpUtils;
- import com.ruoyi.common.utils.security.Md5Utils;
- import com.ruoyi.project.domain.*;
- import com.ruoyi.project.mapper.*;
- import com.ruoyi.project.utils.AdvancedSensitiveFilter;
- import com.ruoyi.project.utils.HtmlUtils;
- import com.ruoyi.project.utils.JiaoduiUtils;
- import okhttp3.OkHttpClient;
- import okhttp3.Request;
- import okhttp3.Response;
- import org.ahocorasick.trie.Emit;
- import org.ahocorasick.trie.Trie;
- import org.elasticsearch.action.bulk.BulkRequest;
- import org.elasticsearch.action.bulk.BulkResponse;
- import org.elasticsearch.action.index.IndexRequest;
- import org.elasticsearch.action.search.SearchRequest;
- import org.elasticsearch.action.search.SearchResponse;
- import org.elasticsearch.client.RequestOptions;
- import org.elasticsearch.client.RestHighLevelClient;
- import org.elasticsearch.common.unit.TimeValue;
- import org.elasticsearch.index.query.BoolQueryBuilder;
- import org.elasticsearch.index.query.QueryBuilders;
- import org.elasticsearch.search.builder.SearchSourceBuilder;
- import org.springframework.beans.factory.annotation.Autowired;
- import org.springframework.stereotype.Service;
- import com.ruoyi.project.service.ITrsSiteconfigService;
- import com.ruoyi.common.core.text.Convert;
- import javax.annotation.PreDestroy;
- /**
- * 配置管理Service业务层处理
- *
- * @author ruoyi
- * @date 2025-03-19
- */
- @Service
- public class TrsSiteconfigServiceImpl implements ITrsSiteconfigService
- {
- @Autowired
- private TrsSiteconfigMapper trsSiteconfigMapper;
- @Autowired
- private TrsArticleInfoMapper trsArticleInfoMapper;
- @Autowired
- private TrsSensitiveWordsMapper trsSensitiveWordsMapper;
- @Autowired
- private TrsSensitiveResultsMapper trsSensitiveResultsMapper;
- // @Autowired
- // private RestHighLevelClientUtils restHighLevelClientUtils;
- @Autowired
- private RestHighLevelClient client;
- @Autowired
- private TrsTechnicalResultsMapper trsTechnicalResultsMapper;
- @Autowired
- private TrsPolicyResultsMapper trsPolicyResultsMapper;
- @Autowired
- private TrsNameResultsMapper trsNameResultsMapper;
- @Autowired
- private TrsDomainResultsMapper trsDomainResultsMapper;
- @Autowired
- private TrsSensitiveWordsTenantMapper trsSensitiveWordsTenantMapper;
- @Autowired
- private TrsPolicyWordsTenantMapper trsPolicyWordsTenantMapper;
- @Autowired
- private TrsTechnicalWordsMapper trsTechnicalWordsMapper;
- @Autowired
- private TrsPolicyWordsMapper trsPolicyWordsMapper;
- @Autowired
- private TrsNameWordsMapper trsNameWordsMapper;
- @Autowired
- private TrsDomainWordsMapper trsDomainWordsMapper;
- @Autowired
- private JiaoduiUtils jiaoduiUtils;
- @Autowired
- private OkHttpClient okHttpClient;
- @Autowired
- private TrsUrlResultMapper trsUrlResultMapper;
- @Autowired
- private TrsErrorUrlMapper trsErrorUrlMapper;
- @Autowired
- private TrsPersonResultMapper trsPersonResultMapper;
- /**
- * 查询配置管理
- *
- * @param id 配置管理主键
- * @return 配置管理
- */
- @Override
- public TrsSiteconfig selectTrsSiteconfigById(Long id)
- {
- return trsSiteconfigMapper.selectTrsSiteconfigById(id);
- }
- /**
- * 查询配置管理列表
- *
- * @param trsSiteconfig 配置管理
- * @return 配置管理
- */
- @Override
- public List<TrsSiteconfig> selectTrsSiteconfigList(TrsSiteconfig trsSiteconfig)
- {
- return trsSiteconfigMapper.selectTrsSiteconfigList(trsSiteconfig);
- }
- /**
- * 新增配置管理
- *
- * @param trsSiteconfig 配置管理
- * @return 结果
- */
- @Override
- public int insertTrsSiteconfig(TrsSiteconfig trsSiteconfig)
- {
- return trsSiteconfigMapper.insertTrsSiteconfig(trsSiteconfig);
- }
- /**
- * 修改配置管理
- *
- * @param trsSiteconfig 配置管理
- * @return 结果
- */
- @Override
- public int updateTrsSiteconfig(TrsSiteconfig trsSiteconfig)
- {
- return trsSiteconfigMapper.updateTrsSiteconfig(trsSiteconfig);
- }
- /**
- * 批量删除配置管理
- *
- * @param ids 需要删除的配置管理主键
- * @return 结果
- */
- @Override
- public int deleteTrsSiteconfigByIds(String ids)
- {
- return trsSiteconfigMapper.deleteTrsSiteconfigByIds(Convert.toStrArray(ids));
- }
- /**
- * 删除配置管理信息
- *
- * @param id 配置管理主键
- * @return 结果
- */
- @Override
- public int deleteTrsSiteconfigById(Long id)
- {
- return trsSiteconfigMapper.deleteTrsSiteconfigById(id);
- }
- @Override
- public Integer dosysn(Long id) {
- int currentPage = 1;
- int BATCH_SIZE = 1000;
- while (true) {
- // 使用 PageHelper 分页
- Page<TrsArticleInfo> page = PageHelper.startPage(currentPage, BATCH_SIZE);
- // 执行查询
- TrsArticleInfo trsArticleInfo = new TrsArticleInfo();
- trsArticleInfo.setRecord(id);
- List<TrsArticleInfo> articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo);
- if (articles.isEmpty()) {
- break; // 数据已全部处理完毕
- }
- // 处理数据(例如插入 Elasticsearch)
- processArticles(articles);
- // 更新页码
- // 重要:检查是否已经是最后一页
- if (articles.size() < BATCH_SIZE) {
- break; // 数据已全部处理完毕
- } else {
- currentPage++;
- }
- System.out.println("已处理 " + (currentPage - 1) * BATCH_SIZE + " 条数据");
- }
- return 1;
- }
- private void processArticles(List<TrsArticleInfo> articles) {
- BulkRequest bulkRequest = new BulkRequest();
- for (TrsArticleInfo article : articles) {
- IndexRequest indexRequest = new IndexRequest("articles")
- .id(String.valueOf(article.getId()))
- .source(
- "content", article.getContent(),
- "page", article.getUrl(),
- "record", article.getRecord()
- );
- //.source("content", article.getContent(), XContentType.JSON);
- bulkRequest.add(indexRequest);
- }
- try {
- BulkResponse bulkResponse = client.bulk(bulkRequest, RequestOptions.DEFAULT);
- if (bulkResponse.hasFailures()) {
- System.err.println("批量插入失败:" + bulkResponse.buildFailureMessage());
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- @Override
- public void doscan(Long id, String type, String sitename) {
- // TrsSiteconfig trsSiteconfig = trsSiteconfigMapper.selectTrsSiteconfigById2(id);
- Map<String, Date> startTimeMap = new HashMap<>();
- Map<String, Date> endTimeMap = new HashMap<>();
- // 缓存敏感词的正确词和错误类型
- Map<String, String> correctWordMap = new HashMap<>();
- Map<String, String> errorTypeMap = new HashMap<>();
- Map<String, String> errorGradeMap = new HashMap<>();
- //
- Map<String, String> technicalErrorTypeMap = new HashMap<>();
- Map<String, String> policyErrorRuleMap = new HashMap<>();
- Map<String, String> policyErrorTypeMap = new HashMap<>();
- Map<String, String> nameCountryMap = new HashMap<>();
- Map<String, String> nameWholeMap = new HashMap<>();
- Map<String, String> nameShortMap = new HashMap<>();
- Map<String, String> nameSortMap = new HashMap<>();
- Map<String, String> domainMap = new HashMap<>();
- loadWhitelist();
- // 加载敏感词库
- Trie trie = loadSensitiveWords(correctWordMap, errorTypeMap, errorGradeMap, startTimeMap, endTimeMap);
- // Trie technicalTrie = loadTechnicalWords(technicalErrorTypeMap);
- Trie policyTrie = loadPolicyWords(policyErrorRuleMap, policyErrorTypeMap, startTimeMap, endTimeMap);
- // Trie nameTrie = loadNameWords(nameCountryMap, nameWholeMap,nameShortMap,nameSortMap);
- // Trie domainTrie = loadDomainWords(domainMap);
- if("1".equals(type)){//第一次扫描
- // 设置分页参数
- int pageNum = 1; // 当前页码
- int pageSize = 500; // 每页大小,根据实际情况调整
- boolean hasMoreData = true;
- TrsSiteconfig siteconfig = new TrsSiteconfig();
- siteconfig.setId(id);
- // List<TrsSiteconfig> list = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig);
- List<TrsSiteconfig> list = new ArrayList<>();
- TrsSiteconfig trsSiteconfig = trsSiteconfigMapper.selectTrsSiteconfigById(id);
- list.add(trsSiteconfig);
- for (TrsSiteconfig config : list) {
- TrsArticleInfo trsArticleInfo = new TrsArticleInfo();
- trsArticleInfo.setRecord(config.getRecord());
- trsArticleInfo.setSitename(config.getSitename());
- while (hasMoreData) {
- try {
- // 1. 设置分页参数(关键步骤)
- PageHelper.startPage(pageNum, pageSize);
- // 2. 紧跟着的第一个查询会被分页
- List<TrsArticleInfo> articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo);
- if (articles == null || articles.isEmpty()) {
- hasMoreData = false;
- } else {
- // 4. 处理当前页数据
- detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles,
- correctWordMap, errorTypeMap, null, errorGradeMap, startTimeMap, endTimeMap, config);
- detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles,
- null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap, startTimeMap, endTimeMap, config);
- // 5. 准备查询下一页
- // 重要:检查是否已经是最后一页
- if (articles.size() < pageSize) {
- hasMoreData = false;
- } else {
- pageNum++;
- }
- }
- } finally {
- // 7. 确保每次循环后清除分页参数(重要!)
- PageHelper.clearPage();
- }
- }
- }
- /*TrsArticleInfo trsArticleInfo = new TrsArticleInfo();
- trsArticleInfo.setRecord(trsSiteconfig.getRecord());
- List<TrsArticleInfo> articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo);
- detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, errorGradeMap,startTimeMap,endTimeMap, trsSiteconfig);
- detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap,startTimeMap,endTimeMap, trsSiteconfig);*/
- }else if("3".equals(type)){//定时任务自动更新
- if("新媒体".equals(sitename)){
- TrsSiteconfig siteconfig = new TrsSiteconfig();
- siteconfig.setMsg("weixin");
- List<TrsSiteconfig> weixinList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig);
- for (TrsSiteconfig weixin : weixinList) {
- TrsArticleInfo trsArticleInfo = new TrsArticleInfo();
- trsArticleInfo.setSitename(weixin.getSitename());
- trsArticleInfo.setRecord(weixin.getRecord());
- Map<String, Object> params = new HashMap<>();
- params.put("beginTime","2025-08-01");
- // params.put("beginTime","2000-08-01");
- params.put("endTime","2050-04-20");
- trsArticleInfo.setParams(params);
- List<TrsArticleInfo> articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo);
- detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, errorGradeMap,startTimeMap,endTimeMap, weixin);
- detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap,startTimeMap,endTimeMap, weixin);
- }
- TrsSiteconfig siteconfig2 = new TrsSiteconfig();
- siteconfig2.setMsg("weibo");
- List<TrsSiteconfig> weiboList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig2);
- for (TrsSiteconfig weixin : weiboList) {
- TrsArticleInfo trsArticleInfo = new TrsArticleInfo();
- trsArticleInfo.setSitename(weixin.getSitename());
- trsArticleInfo.setRecord(weixin.getRecord());
- Map<String, Object> params = new HashMap<>();
- params.put("beginTime","2025-08-01");
- params.put("endTime","2050-04-20");
- trsArticleInfo.setParams(params);
- List<TrsArticleInfo> articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo);
- detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, errorGradeMap,startTimeMap,endTimeMap, weixin);
- detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap,startTimeMap,endTimeMap, weixin);
- }
- TrsSiteconfig siteconfig3 = new TrsSiteconfig();
- siteconfig3.setMsg("dsp");
- List<TrsSiteconfig> dspList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig3);
- for (TrsSiteconfig dsp : dspList) {
- TrsArticleInfo trsArticleInfo = new TrsArticleInfo();
- trsArticleInfo.setSitename(dsp.getSitename());
- trsArticleInfo.setRecord(dsp.getRecord());
- Map<String, Object> params = new HashMap<>();
- params.put("beginTime","2025-08-01");
- params.put("endTime","2050-04-20");
- trsArticleInfo.setParams(params);
- List<TrsArticleInfo> articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo);
- detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, errorGradeMap,startTimeMap,endTimeMap, dsp);
- detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap,startTimeMap,endTimeMap, dsp);
- }
- TrsSiteconfig siteconfig4 = new TrsSiteconfig();
- siteconfig4.setMsg("toutiao");
- List<TrsSiteconfig> toutiaoList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig4);
- for (TrsSiteconfig toutiao : toutiaoList) {
- TrsArticleInfo trsArticleInfo = new TrsArticleInfo();
- trsArticleInfo.setSitename(toutiao.getSitename());
- trsArticleInfo.setRecord(toutiao.getRecord());
- Map<String, Object> params = new HashMap<>();
- params.put("beginTime","2025-08-01");
- params.put("endTime","2050-04-20");
- trsArticleInfo.setParams(params);
- List<TrsArticleInfo> articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo);
- detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, errorGradeMap,startTimeMap,endTimeMap, toutiao);
- detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap,startTimeMap,endTimeMap, toutiao);
- }
- }else if("网站".equals(sitename)){
- TrsSiteconfig siteconfig = new TrsSiteconfig();
- siteconfig.setMsg("site");
- List<TrsSiteconfig> siteList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig);
- for (TrsSiteconfig site : siteList) {
- // 设置分页参数
- int pageNum = 1; // 当前页码
- int pageSize = 500; // 每页大小,根据实际情况调整
- boolean hasMoreData = true;
- TrsArticleInfo trsArticleInfo = new TrsArticleInfo();
- trsArticleInfo.setSitename(site.getSitename());
- trsArticleInfo.setRecord(site.getRecord());
- Map<String, Object> params = new HashMap<>();
- params.put("beginTime","2025-08-01");
- params.put("endTime","2050-04-20");
- // trsArticleInfo.setTitle("习近平夫妇会见柬埔寨国王西哈莫尼和太后莫尼列");
- trsArticleInfo.setParams(params);
- while (hasMoreData) {
- try {
- // 1. 设置分页参数(关键步骤)
- PageHelper.startPage(pageNum, pageSize);
- // 2. 紧跟着的第一个查询会被分页
- List<TrsArticleInfo> articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo);
- if (articles == null || articles.isEmpty()) {
- hasMoreData = false;
- } else {
- // 4. 处理当前页数据
- detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, errorGradeMap,startTimeMap,endTimeMap, site);
- detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, errorGradeMap,startTimeMap,endTimeMap, site);
- // 5. 准备查询下一页
- // 重要:检查是否已经是最后一页
- if (articles.size() < pageSize) {
- hasMoreData = false;
- } else {
- pageNum++;
- }
- }
- } finally {
- // 7. 确保每次循环后清除分页参数(重要!)
- PageHelper.clearPage();
- }
- }
- }
- }
- }else if("4".equals(type)){//定时任务自动复核
- if("新媒体".equals(sitename)){
- TrsSiteconfig siteconfig = new TrsSiteconfig();
- siteconfig.setMsg("weixin");
- List<TrsSiteconfig> weixinList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig);
- for (TrsSiteconfig weixin : weixinList) {
- fuhe(weixin.getSitename(),"新媒体");
- }
- TrsSiteconfig siteconfig2 = new TrsSiteconfig();
- siteconfig2.setMsg("weibo");
- List<TrsSiteconfig> weiboList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig2);
- for (TrsSiteconfig weixin : weiboList) {
- fuhe(weixin.getSitename(), "");
- }
- TrsSiteconfig siteconfig3 = new TrsSiteconfig();
- siteconfig3.setMsg("dsp");
- List<TrsSiteconfig> dspList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig3);
- for (TrsSiteconfig dsp : dspList) {
- fuhe(dsp.getSitename(), "");
- }
- TrsSiteconfig siteconfig4 = new TrsSiteconfig();
- siteconfig4.setMsg("toutiao");
- List<TrsSiteconfig> toutiaoList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig4);
- for (TrsSiteconfig toutiao : toutiaoList) {
- fuhe(toutiao.getSitename(), "");
- }
- }else{
- TrsSiteconfig siteconfig = new TrsSiteconfig();
- siteconfig.setMsg("site");
- List<TrsSiteconfig> siteList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig);
- for (TrsSiteconfig site : siteList) {
- fuhe(site.getSitename(),"");
- }
- }
- }else if("2".equals(type)){//手动复核
- //get请求调用第三方系统接口
- String url = "http://10.0.8.14:8082/hycloud/start";
- String resultStr = HttpUtils.sendGet(url);
- JSONObject jsonObject = JSONObject.parseObject(resultStr);
- if("任务执行成功".equals(jsonObject.get("msg"))){
- List<Map<String, Object>> articles = trsArticleInfoMapper.selectTrsArticleInfoReviewList();
- for (Map<String, Object> article : articles) {
- try {
- String content = article.get("content")+"";
- String title = article.get("title")+"";
- String wrong_word = article.get("wrong_word")+"";
- String correct_word = article.get("correct_word")+"";
- String datakey = article.get("datakey")+"";
- String msg = article.get("msg")+"";
- String resultid = article.get("id")+"";
- /*Date time = null;
- if(article.get("time")!=null){
- time = (Date) article.get("time");
- }else {
- time = new Date();
- }*/
- List<TrsResults> jiaodui = jiaoduiUtils.getJiaodui(content, title, type);
- boolean flag=true;
- TrsSensitiveResults trsresult = new TrsSensitiveResults();
- trsresult.setDatakey(datakey);
- //trsresult.setTime(time);
- trsresult.setId(Long.parseLong(resultid));
- for (TrsResults trsResults : jiaodui) {
- if(trsResults.getWrongWord().equals(wrong_word))
- flag=false;
- if(trsResults.getWrongWord().equals(wrong_word)||trsResults.getCorrectWord().equals(correct_word)){
- trsresult.setWrongWord(trsResults.getWrongWord());
- trsresult.setCorrectWord(trsResults.getCorrectWord());
- }
- }
- if(flag){
- if("其他".equals(msg)){
- trsresult.setStatus("其他");
- }else {
- trsresult.setStatus("已处理");
- }
- }else {
- trsresult.setStatus("待处理");
- }
- trsSensitiveResultsMapper.updateTrsSensitiveResults(trsresult);
- }catch (Exception e){
- e.printStackTrace();
- }
- }
- List<Map<String, Object>> articles2 = trsArticleInfoMapper.selectTrsArticleInfoReviewList2();
- for (Map<String, Object> article : articles2) {
- try {
- String content = article.get("content")+"";
- String title = article.get("title")+"";
- String wrong_word = article.get("wrong_word")+"";
- String datakey = article.get("datakey")+"";
- String msg = article.get("msg")+"";
- String resultid = article.get("id")+"";
- /*Date time = null;
- if(article.get("time")!=null){
- time = (Date) article.get("time");
- }else {
- time = new Date();
- }*/
- List<TrsResults> jiaodui = jiaoduiUtils.getJiaodui(content, title, type);
- boolean flag=true;
- TrsPolicyResults trsresult = new TrsPolicyResults();
- trsresult.setDatakey(datakey);
- //trsresult.setTime(time);
- trsresult.setId(Long.parseLong(resultid));
- for (TrsResults trsResults : jiaodui) {
- if(!trsResults.getWrongWord().equals(wrong_word))flag=false;
- }
- if(flag){
- if("其他".equals(msg)){
- trsresult.setStatus("其他");
- }else {
- trsresult.setStatus("已处理");
- }
- }else {
- trsresult.setStatus("待处理");
- }
- trsPolicyResultsMapper.updateTrsPolicyResults(trsresult);
- }catch (Exception e){
- e.printStackTrace();
- }
- }
- }
- }
- /* // 初始化 Scroll
- String scrollId = initScroll(id);
- // 分批处理文章
- while (true) {
- List<TrsArticleInfo> articles = loadArticlesBatch(scrollId,id);
- TrsArticleInfo trsArticleInfo = new TrsArticleInfo();
- trsArticleInfo.setRecord(id);
- List<TrsArticleInfo> articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo);
- if (articles.isEmpty()) break;
- // 检测敏感词并生成结果
- *//*List<TrsSensitiveResults> results = detect(trie, articles, correctWordMap, errorTypeMap, trsSiteconfig.getSitename());
- // 保存结果到 MySQL
- results.forEach(result -> {
- trsSensitiveResultsMapper.insertTrsSensitiveResults(result);
- });*//*
- // 依次检测不同类型的敏感信息
- detectAndSaveResults(trsSensitiveResultsMapper, TrsSensitiveResults.class, trie, articles, correctWordMap, errorTypeMap, null, null, trsSiteconfig);
- // detectAndSaveResults(trsTechnicalResultsMapper, TrsTechnicalResults.class, technicalTrie, articles, null, technicalErrorTypeMap, null, null, trsSiteconfig);
- // detectAndSaveResults(trsPolicyResultsMapper, TrsPolicyResults.class, policyTrie, articles, null, policyErrorTypeMap, policyErrorRuleMap, null, trsSiteconfig);
- // detectAndSaveResults(trsNameResultsMapper, TrsNameResults.class, nameTrie, articles, nameCountryMap, nameWholeMap, nameShortMap, nameSortMap, trsSiteconfig);
- // detectAndSaveResults(trsDomainResultsMapper, TrsDomainResults.class, domainTrie, articles, null, domainMap, null, null, trsSiteconfig);
- if (articles.size()<100) break;
- }
- // 清理 Scroll
- ClearScrollRequest clearRequest = new ClearScrollRequest();
- clearRequest.addScrollId(scrollId);
- try {
- client.clearScroll(clearRequest, RequestOptions.DEFAULT);
- } catch (IOException e) {
- e.printStackTrace();
- }*/
- }
- private void fuhe(String sitename, String type) {
- TrsSensitiveResults trsSensitiveResults = new TrsSensitiveResults();
- // trsSensitiveResults.setStatus("待复核");
- trsSensitiveResults.setSitename(sitename);
- if("新媒体".equals(type)){
- Map<String, Object> params = new HashMap<>();
- params.put("beginTime","2025-08-01");
- params.put("endTime","2050-04-20");
- trsSensitiveResults.setParams(params);
- }
- List<TrsSensitiveResults> trsSensitiveResults1 = trsSensitiveResultsMapper.selectTrsSensitiveResultsListAll(trsSensitiveResults);
- for (TrsSensitiveResults trsSensitiveResults2 : trsSensitiveResults1) {
- if(!"待处理".equals(trsSensitiveResults2.getStatus())&&!"待复核".equals(trsSensitiveResults2.getStatus()))
- continue;
- /*long version = trsArticleInfoMapper.selectTrsArticleInfoMaxVersion();*/
- TrsArticleInfo trsArticleInfo = new TrsArticleInfo();
- /*trsArticleInfo.setRecord(trsSiteconfig.getRecord());*/
- /*trsArticleInfo.setVersion(version);*/
- trsArticleInfo.setTitle(trsSensitiveResults2.getTitle());
- trsArticleInfo.setUrl(trsSensitiveResults2.getPage());
- // List<TrsArticleInfo> articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo);
- List<TrsArticleInfo> articles = trsArticleInfoMapper.selectTrsArticleInfoFuheList(trsArticleInfo);
- String wrongWord = trsSensitiveResults2.getWrongWord();
- if(articles.size()>0){
- List<TrsResults> jiaodui = jiaoduiUtils.getJiaodui(articles.get(0).getContent(), articles.get(0).getTitle(), "4");
- boolean flag = true;
- for (TrsResults trsResults : jiaodui) {
- if(wrongWord.equals(trsResults.getWrongWord()))flag = false;
- }
- if(flag){
- TrsSensitiveResults trsSensitiveResults_1 = new TrsSensitiveResults();
- trsSensitiveResults_1.setStatus("已处理");
- trsSensitiveResults_1.setId(trsSensitiveResults2.getId());
- trsSensitiveResultsMapper.updateTrsSensitiveResults(trsSensitiveResults_1);
- }
- }
- }
- TrsPolicyResults trsPolicyResults = new TrsPolicyResults();
- trsPolicyResults.setSitename(sitename);
- if("新媒体".equals(type)){
- Map<String, Object> params = new HashMap<>();
- params.put("beginTime","2025-08-01");
- params.put("endTime","2050-04-20");
- trsPolicyResults.setParams(params);
- }
- List<TrsPolicyResults> trsPolicyResults1 = trsPolicyResultsMapper.selectTrsPolicyResultsList(trsPolicyResults);
- for (TrsPolicyResults trsPolicyResults2 : trsPolicyResults1) {
- TrsArticleInfo trsArticleInfo = new TrsArticleInfo();
- /*trsArticleInfo.setRecord(trsSiteconfig.getRecord());*/
- trsArticleInfo.setTitle(trsPolicyResults2.getTitle());
- trsArticleInfo.setUrl(trsPolicyResults2.getPage());
- List<TrsArticleInfo> articles = trsArticleInfoMapper.selectTrsArticleInfoFuheList(trsArticleInfo);
- String wrongWord = trsPolicyResults2.getWrongWord();
- if(articles.size()>0){
- List<TrsResults> jiaodui = jiaoduiUtils.getJiaodui(articles.get(0).getContent(), articles.get(0).getTitle(), "4");
- boolean flag = true;
- for (TrsResults trsResults : jiaodui) {
- if(wrongWord.equals(trsResults.getWrongWord()))flag = false;
- }
- if(flag){
- TrsPolicyResults trsSensitiveResults_1 = new TrsPolicyResults();
- trsSensitiveResults_1.setStatus("已处理");
- trsSensitiveResults_1.setId(trsPolicyResults2.getId());
- trsPolicyResultsMapper.updateTrsPolicyResults(trsSensitiveResults_1);
- }
- }
- }
- }
- private <T> void detectAndSaveResults(Object mapper, Class<T> clazz, Trie trie, List<TrsArticleInfo> articles,
- Map<String, String> correctWordMap, Map<String, String> errorTypeMap,
- Map<String, String> extraMap1, Map<String, String> extraMap2,Map<String, Date> startTimeMap,Map<String, Date> endTimeMap,
- TrsSiteconfig trsSiteconfig) {
- if (mapper == null) {
- throw new RuntimeException("Mapper 为空,无法插入数据:" + clazz.getName());
- }
- Map<String, String> selectedErrorMap = errorTypeMap;
- Map<String, String> selectedExtraMap1 = extraMap1;
- Map<String, String> selectedExtraMap2 = extraMap2;
- List<T> results = detect(clazz, trie, articles, correctWordMap, selectedErrorMap, selectedExtraMap1, selectedExtraMap2,startTimeMap,endTimeMap, trsSiteconfig);
- // 通过反射调用 insert 方法
- for (Object result : results) {
- if (mapper instanceof TrsSensitiveResultsMapper) {
- Long id = ((TrsSensitiveResultsMapper) mapper).selectId(((TrsSensitiveResults)result).getDatakey());
- if (id != null) {
- TrsSensitiveResults result1 = (TrsSensitiveResults)result;
- result1.setId(id);
- ((TrsSensitiveResultsMapper) mapper).updateTrsSensitiveResults(result1);
- }else {
- TrsSensitiveResults result1 = (TrsSensitiveResults) result;
- result1.setStatus("待处理");
- ((TrsSensitiveResultsMapper) mapper).insertTrsSensitiveResults(result1);
- }
- } else if (mapper instanceof TrsTechnicalResultsMapper) {
- ((TrsTechnicalResultsMapper) mapper).insertTrsTechnicalResults((TrsTechnicalResults) result);
- } else if (mapper instanceof TrsPolicyResultsMapper) {
- Long id = ((TrsPolicyResultsMapper) mapper).selectId(((TrsPolicyResults)result).getDatakey());
- if (id != null) {
- TrsPolicyResults result1 = (TrsPolicyResults)result;
- result1.setId(id);
- ((TrsPolicyResultsMapper) mapper).updateTrsPolicyResults(result1);
- }else {
- TrsPolicyResults result1 = (TrsPolicyResults) result;
- result1.setStatus("待处理");
- ((TrsPolicyResultsMapper) mapper).insertTrsPolicyResults(result1);
- }
- } else if (mapper instanceof TrsNameResultsMapper) {
- ((TrsNameResultsMapper) mapper).insertTrsNameResults((TrsNameResults) result);
- } else if (mapper instanceof TrsDomainResultsMapper) {
- ((TrsDomainResultsMapper) mapper).insertTrsDomainResults((TrsDomainResults) result);
- } else {
- throw new RuntimeException("未找到适配的 Mapper:" + mapper.getClass().getSimpleName());
- }
- }
- }
- private <T> List<T> detect(Class<T> clazz, Trie trie, List<TrsArticleInfo> articles,
- Map<String, String> correctWordMap, Map<String, String> errorTypeMap,
- Map<String, String> extraMap1, Map<String, String> extraMap2,Map<String, Date> startTimeMap,Map<String, Date> endTimeMap,
- TrsSiteconfig trsSiteconfig) {
- List<T> resultSet = new ArrayList<>();
- for (TrsArticleInfo article : articles) {
- String content = article.getContent();
- if (content == null) {
- continue;
- }
- String title = article.getTitle();
- Date time = article.getTime();
- Date scantime = article.getScantime();
- String sitename = article.getSitename();
- Long record = article.getRecord();
- // resultSet.addAll(detectContent(clazz, trie, content, correctWordMap, errorTypeMap, sitename, article.getUrl()));
- // resultSet.addAll(detectSegmentedContent(clazz, trie, content, correctWordMap, errorTypeMap, sitename, article.getUrl()));
- List<T> ts = detectContent(clazz, trie, content, correctWordMap, errorTypeMap, extraMap1, extraMap2, startTimeMap, endTimeMap, trsSiteconfig, article.getUrl(), title, time, scantime, sitename, record);
- resultSet.addAll(ts);
- System.out.println("执行几次");
- // resultSet.addAll(detectSegmentedContent(clazz, trie, content, correctWordMap, errorTypeMap, extraMap1, extraMap2, trsSiteconfig, article.getUrl()));
- }
- return new ArrayList<>(resultSet);
- }
- private <T> List<T> detectContent(Class<T> clazz, Trie trie, String content,
- Map<String, String> correctWordMap, Map<String, String> errorTypeMap,
- Map<String, String> extraMap1, Map<String, String> extraMap2,Map<String, Date> startTimeMap,Map<String, Date> endTimeMap,
- TrsSiteconfig trsSiteconfig, String url, String title, Date time, Date scantime, String sitename, Long record) {
- List<T> results = new ArrayList<>();
- for (Emit emit : trie.parseText(content)) {
- String wrongWord = emit.getKeyword();
- String correctWord = "";
- if(correctWordMap!=null){
- correctWord = correctWordMap.getOrDefault(wrongWord, "未知");
- }
- // 如果匹配到的词在白名单中,则跳过
- if (isInWhitelist(content, emit.getStart(), emit.getEnd() + 1)) {
- continue;
- }
- if(!AdvancedSensitiveFilter.needsFiltering(content, wrongWord)){
- continue;
- }
- // 只匹配独立的词,确保不是误匹配
- Matcher matcher = Pattern.compile( Pattern.quote(wrongWord)).matcher(content);
- if (matcher.find()) {
- // 先对文章进行分词
- boolean flag = true;
- //时效性
- if(startTimeMap!=null && time!=null && startTimeMap.get(wrongWord)!=null && time.before(startTimeMap.get(wrongWord)) ){
- flag = false;
- }
- if(endTimeMap!=null && time!=null && endTimeMap.get(wrongWord)!=null && time.after(endTimeMap.get(wrongWord)) ){
- flag = false;
- }
- String subString = extractContext(content, emit.getStart(), emit.getEnd());
- if(correctWordMap!=null){
- if(subString!=null&&subString.contains(correctWord)&&subString.contains(wrongWord)&&!wrongWord.contains(correctWord)){
- flag=false;
- }
- }
- List<String> words = segmentText(wrongWord);
- for(String word:words){
- if (!wrongWord.contains(word)) {
- flag=false;
- }
- }
- List<String> subStringseg = segmentText(subString);
- /*if (!subStringseg.contains(wrongWord)&&!subStringseg.stream().anyMatch(words::contains)) {
- flag=false;
- }*/
- if (!subStringseg.contains(wrongWord)) {
- if(!subStringseg.containsAll(words))
- flag=false;
- }
- if(flag){
- T result = createResultInstance(clazz, url, wrongWord, correctWordMap, errorTypeMap, extraMap1, extraMap2, trsSiteconfig, content, emit.getStart(), emit.getEnd(), title, time, scantime, sitename, record);
- results.add(result);
- }
- }
- }
- return results;
- }
- private <T> List<T> detectSegmentedContent(Class<T> clazz, Trie trie, String content,
- Map<String, String> correctWordMap, Map<String, String> errorTypeMap,
- Map<String, String> extraMap1, Map<String, String> extraMap2,
- TrsSiteconfig trsSiteconfig, String url, String title, Date time, Date scantime, String sitename) {
- List<T> results = new ArrayList<>();
- // 先对文章进行分词
- List<String> words = segmentText(content);
- // 逐个分词进行敏感词匹配
- for (String word : words) {
- for (Emit emit : trie.parseText(word)) {
- String wrongWord = emit.getKeyword();
- // 如果匹配到的词在白名单中,则跳过
- if (isInWhitelist(content, emit.getStart(), emit.getEnd() + 1)) {
- continue;
- }
- // 只匹配独立的词,确保不是误匹配
- Matcher matcher = Pattern.compile("\\b" + Pattern.quote(wrongWord) + "\\b").matcher(word);
- if (matcher.find()) {
- // 确定分词的起始和结束位置
- int start = content.indexOf(word);
- int end = start + word.length();
- T result = createResultInstance(clazz, url, wrongWord, correctWordMap, errorTypeMap, extraMap1, extraMap2, trsSiteconfig, content, start, end, title, time,scantime, sitename, 99L);
- results.add(result);
- }
- }
- }
- return results;
- }
- private <T> T createResultInstance(Class<T> clazz, String url, String wrongWord,
- Map<String, String> correctWordMap, Map<String, String> errorTypeMap,
- Map<String, String> extraMap1, Map<String, String> extraMap2,
- TrsSiteconfig trsSiteconfig, String content, int start, int end, String title, Date time,Date scantime, String sitename, Long record) {
- try {
- T result = clazz.getDeclaredConstructor().newInstance();
- if (result instanceof TrsSensitiveResults) {
- TrsSensitiveResults r = (TrsSensitiveResults) result;
- /*if("1".equals(type)){
- }*/
- r.setPage(url);
- r.setTitle(title);
- r.setTime(time);
- r.setScantime(scantime);
- r.setWrongWord(wrongWord);
- r.setCorrectWord(correctWordMap.getOrDefault(wrongWord, ""));
- r.setErrorType(errorTypeMap.getOrDefault(wrongWord, ""));
- r.setErrorGrade(extraMap2.getOrDefault(wrongWord, "一般性错误"));
- r.setContext(extractContext(content, start, end));
- r.setAllcontext(content);
- r.setSitename(sitename);
- r.setTenantId(trsSiteconfig.getTenantId());
- if(97==record){
- r.setSitetag("weibo");
- } else if (98==record) {
- r.setSitetag("weixin");
- }else if (99==record) {
- r.setSitetag("site");
- }else if (96==record) {
- r.setSitetag("dsp");
- }else if (95==record) {
- r.setSitetag("toutiao");
- }
- /*r.setSitetag(trsSiteconfig.getMsg());*/
- //r.setStatus("待处理");
- r.setDatakey(Md5Utils.hash(wrongWord + url ));
- return (T) r;
- } else if (result instanceof TrsTechnicalResults) {
- TrsTechnicalResults r = (TrsTechnicalResults) result;
- r.setPage(url);
- r.setWrongWord(wrongWord);
- r.setErrorType(errorTypeMap.getOrDefault(wrongWord, ""));
- r.setContext(extractContext(content, start, end));
- r.setSitename(sitename);
- r.setTenantId(trsSiteconfig.getTenantId());
- //r.setStatus("待处理");
- return (T) r;
- } else if (result instanceof TrsPolicyResults) {
- TrsPolicyResults r = (TrsPolicyResults) result;
- r.setPage(url);
- r.setTitle(title);
- r.setTime(time);
- r.setScantime(scantime);
- r.setWrongWord(wrongWord);
- r.setErrorType(errorTypeMap.getOrDefault(wrongWord, ""));
- r.setErrorGrade(extraMap2.getOrDefault(wrongWord, "一般性错误"));
- r.setContext(extractContext(content, start, end));
- r.setAllcontext(content);
- r.setSitename(sitename);
- r.setTenantId(trsSiteconfig.getTenantId());
- if(97==record){
- r.setSitetag("weibo");
- } else if (98==record) {
- r.setSitetag("weixin");
- }else if (99==record) {
- r.setSitetag("site");
- }
- /*r.setSitetag(trsSiteconfig.getMsg());*/
- //r.setStatus("待处理");
- r.setWrongRule(extraMap1 != null ? extraMap1.getOrDefault(wrongWord, "") : "");
- r.setDatakey(Md5Utils.hash(wrongWord + url ));
- return (T) r;
- } else if (result instanceof TrsNameResults) {
- TrsNameResults r = (TrsNameResults) result;
- r.setPage(url);
- r.setTitle(title);
- r.setTime(time);
- r.setScantime(scantime);
- r.setWrongWord(wrongWord);
- r.setSort(extraMap2 != null ? extraMap2.getOrDefault(wrongWord, "") : "");
- r.setDutyShort(extraMap1 != null ? extraMap1.getOrDefault(wrongWord, "") : "");
- r.setDutyAll(errorTypeMap != null ? errorTypeMap.getOrDefault(wrongWord, "") : "");
- r.setCountry(correctWordMap != null ? correctWordMap.getOrDefault(wrongWord, "") : "");
- r.setContext(extractContext(content, start, end));
- r.setAllcontext(content);
- r.setSitename(sitename);
- r.setTenantId(trsSiteconfig.getTenantId());
- if(97==record){
- r.setSitetag("weibo");
- } else if (98==record) {
- r.setSitetag("weixin");
- }else if (99==record) {
- r.setSitetag("site");
- }
- /*r.setSitetag(trsSiteconfig.getMsg());*/
- //r.setStatus("待处理");
- r.setDatakey(Md5Utils.hash(wrongWord + url ));
- return (T) r;
- } else if (result instanceof TrsDomainResults) {
- TrsDomainResults r = (TrsDomainResults) result;
- r.setPage(url);
- r.setWrongWord(wrongWord);
- r.setDomainType(errorTypeMap != null ? errorTypeMap.getOrDefault(wrongWord, "") : "");
- r.setContext(extractContext(content, start, end));
- r.setSitename(sitename);
- r.setTenantId(trsSiteconfig.getTenantId());
- //r.setStatus("待处理");
- return (T) r;
- } else {
- throw new RuntimeException("未支持的结果类型:" + clazz.getName());
- }
- /*return result;*/
- } catch (Exception e) {
- throw new RuntimeException("无法创建实例:" + clazz.getName(), e);
- }
- }
- public List<String> segmentText(String text) {
- List<Term> termList = HanLP.segment(text);
- List<String> words = new ArrayList<>();
- for (Term term : termList) {
- words.add(term.word);
- }
- return words;
- }
- // 检测敏感词并生成结果
- /* public List<TrsSensitiveResults> detect(Trie trie, List<TrsArticleInfo> articles,
- Map<String, String> correctWordMap,
- Map<String, String> errorTypeMap, String sitename) {
- Set<TrsSensitiveResults> resultSet = new HashSet<>(); // 使用 Set 去重
- for (TrsArticleInfo article : articles) {
- String content = article.getContent();
- // 1. 先进行整体匹配
- resultSet.addAll(detectWholeContent(trie, content, correctWordMap, errorTypeMap, sitename, article.getUrl()));
- // 2. 再进行分词后匹配
- resultSet.addAll(detectSegmentedContent(trie, content, correctWordMap, errorTypeMap, sitename, article.getUrl()));
- }
- return new ArrayList<>(resultSet); // 转换为 List 返回
- }
- *//**
- * 整体匹配:直接对文章内容进行敏感词匹配
- *//*
- private List<TrsSensitiveResults> detectWholeContent(Trie trie, String content,
- Map<String, String> correctWordMap,
- Map<String, String> errorTypeMap, String sitename, String url) {
- List<TrsSensitiveResults> results = new ArrayList<>();
- for (Emit emit : trie.parseText(content)) {
- String wrongWord = emit.getKeyword();
- // 判断错词是否在白名单短语里
- if (isInWhitelist(content, emit.getStart(), emit.getEnd() + 1)) {
- continue; // 跳过匹配
- }
- // 使用正则表达式匹配独立的词
- String regex = "\\b" + Pattern.quote(wrongWord) + "\\b";
- Pattern pattern = Pattern.compile(regex);
- Matcher matcher = pattern.matcher(content);
- // 如果匹配到独立的词
- if (matcher.find()) {
- TrsSensitiveResults trsSensitiveResults = new TrsSensitiveResults();
- trsSensitiveResults.setPage(url);
- trsSensitiveResults.setWrongWord(wrongWord);
- trsSensitiveResults.setCorrectWord(correctWordMap.get(wrongWord));
- trsSensitiveResults.setErrorType(errorTypeMap.get(wrongWord));
- trsSensitiveResults.setContext(extractContext(content, emit.getStart(), emit.getEnd()));
- trsSensitiveResults.setSitename(sitename);
- trsSensitiveResults.setStatus("待处理");
- results.add(trsSensitiveResults);
- }
- }
- return results;
- }
- *//**
- * 分词后匹配:对文章内容进行分词,然后对每个分词结果进行敏感词匹配
- *//*
- private List<TrsSensitiveResults> detectSegmentedContent(Trie trie, String content,
- Map<String, String> correctWordMap,
- Map<String, String> errorTypeMap, String sitename, String url) {
- List<TrsSensitiveResults> results = new ArrayList<>();
- // 对内容进行分词
- List<String> words = segmentText(content);
- // 对每个分词结果进行敏感词匹配
- for (String word : words) {
- for (Emit emit : trie.parseText(word)) {
- String wrongWord = emit.getKeyword();
- // 判断错词是否在白名单短语里
- if (isInWhitelist(content, emit.getStart(), emit.getEnd() + 1)) {
- continue; // 跳过匹配
- }
- // 使用正则表达式匹配独立的词
- String regex = "\\b" + Pattern.quote(wrongWord) + "\\b";
- Pattern pattern = Pattern.compile(regex);
- Matcher matcher = pattern.matcher(word);
- // 如果匹配到独立的词
- if (matcher.find()) {
- // 找到分词结果在原始内容中的位置
- int start = content.indexOf(word); // 分词结果在原始内容中的起始位置
- int end = start + word.length(); // 分词结果在原始内容中的结束位置
- TrsSensitiveResults trsSensitiveResults = new TrsSensitiveResults();
- trsSensitiveResults.setPage(url);
- trsSensitiveResults.setWrongWord(wrongWord);
- trsSensitiveResults.setCorrectWord(correctWordMap.get(wrongWord));
- trsSensitiveResults.setErrorType(errorTypeMap.get(wrongWord));
- // trsSensitiveResults.setContext(extractContext(word, emit.getStart(), emit.getEnd()));
- trsSensitiveResults.setContext(extractContext(content, start, end)); // 从原始内容中截取上下文
- trsSensitiveResults.setSitename(sitename);
- trsSensitiveResults.setStatus("待处理");
- results.add(trsSensitiveResults);
- }
- }
- }
- return results;
- }*/
- // 提取上下文(示例:前后 20 字符)
- private String extractContext(String text, int start, int end) {
- int contextSize = 20;
- int contextStart = Math.max(0, start - contextSize);
- int contextEnd = Math.min(text.length(), end + contextSize);
- return text.substring(contextStart, contextEnd);
- }
- public void doArticleInfo(Long id) {
- }
- // 初始化 Scroll 查询
- public String initScroll(Long id) {
- SearchRequest searchRequest = new SearchRequest("articles");
- SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
- BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery();
- boolQueryBuilder.filter(QueryBuilders.termQuery("record", id));
- sourceBuilder.query(boolQueryBuilder);
- // sourceBuilder.query(QueryBuilders.matchAllQuery());
- sourceBuilder.size(100); // 每批加载 1000 篇文章
- // sourceBuilder.timeout(TimeValue.timeValueSeconds(30)); // 增加超时时间
- searchRequest.source(sourceBuilder);
- searchRequest.scroll(TimeValue.timeValueMinutes(1L));
- SearchResponse response = null;
- try {
- response = client.search(searchRequest, RequestOptions.DEFAULT);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- return response.getScrollId();
- }
- // 从 MySQL 加载敏感词库
- public Trie loadSensitiveWords(Map<String, String> correctWordMap, Map<String, String> errorTypeMap, Map<String, String> errorGradeMap, Map<String, Date> startTimeMap, Map<String, Date> endTimeMap) {
- /*
- java Aho-Corasick算法
- Aho-Corasick算法是一种用于多模式字符串匹配的算法,它可以同时在一个文本字符串中查找多个模式字符串。该算法由Alfred V. Aho和Margaret J. Corasick在1975年共同发明。相比于传统的多模式匹配方法(如KMP算法),Aho-Corasick算法在处理多个模式时具有更高的效率。
- Aho-Corasick算法的工作原理
- 构建Trie树:首先,将所有模式字符串构建成一个Trie树(前缀树)。
- 构建失败链接:在Trie树上添加失败链接(也称为失效链接或故障链接),使得在匹配过程中遇到非匹配字符时可以跳转到Trie树上的其他节点,从而减少不必要的回溯。
- 输出链接:对于每个节点,标记其所有子节点对应的模式字符串的结束位置。这样,在遍历文本字符串时,一旦到达某个节点的输出链接,就意味着在该位置找到了一个模式字符串。
- 匹配过程:在文本字符串上从左到右进行扫描,利用Trie树和失败链接进行匹配。如果在某个节点上找到了一个模式字符串的结束,就记录下这个位置。
- */
- Trie.TrieBuilder builder = Trie.builder();
- TrsSensitiveWords trsSensitiveWords = new TrsSensitiveWords();
- trsSensitiveWords.setStatus("已通过");
- List<TrsSensitiveWords> sensitiveWords = trsSensitiveWordsMapper.selectTrsSensitiveWordsList(trsSensitiveWords);
- for (TrsSensitiveWords sensitiveWord : sensitiveWords) {
- String wrongWord = sensitiveWord.getWrongWord();
- builder.addKeyword(wrongWord);
- correctWordMap.put(wrongWord, sensitiveWord.getCorrectWord());
- errorTypeMap.put(wrongWord, sensitiveWord.getErrorType());
- errorGradeMap.put(wrongWord, sensitiveWord.getErrorGrade());
- startTimeMap.put(wrongWord, sensitiveWord.getEffectstart());
- endTimeMap.put(wrongWord, sensitiveWord.getEffectend());
- }
- TrsSensitiveWordsTenant trsSensitiveWordsTenant = new TrsSensitiveWordsTenant();
- trsSensitiveWordsTenant.setStatus("已通过");
- List<TrsSensitiveWordsTenant> sensitiveWordsTenants = trsSensitiveWordsTenantMapper.selectTrsSensitiveWordsList(trsSensitiveWordsTenant);
- for (TrsSensitiveWordsTenant sensitiveWord : sensitiveWordsTenants) {
- String wrongWord = sensitiveWord.getWrongWord();
- builder.addKeyword(wrongWord);
- correctWordMap.put(wrongWord, sensitiveWord.getCorrectWord());
- errorTypeMap.put(wrongWord, sensitiveWord.getErrorType());
- errorGradeMap.put(wrongWord, sensitiveWord.getErrorGrade());
- startTimeMap.put(wrongWord, sensitiveWord.getEffectstart());
- endTimeMap.put(wrongWord, sensitiveWord.getEffectend());
- }
- return builder.build();
- }
- public Trie loadTechnicalWords(Map<String, String> errorTypeMap) {
- Trie.TrieBuilder builder = Trie.builder();
- TrsTechnicalWords results = new TrsTechnicalWords();
- results.setStatus("已通过");
- List<TrsTechnicalWords> results1 = trsTechnicalWordsMapper.selectTrsTechnicalWordsList(results);
- for (TrsTechnicalWords sensitiveWord : results1) {
- String wrongWord = sensitiveWord.getWrongWord();
- builder.addKeyword(wrongWord);
- errorTypeMap.put(wrongWord, sensitiveWord.getErrorType());
- }
- return builder.build();
- }
- public Trie loadPolicyWords(Map<String, String> policyErrorRuleMap, Map<String, String> policyErrorTypeMap, Map<String, Date> startTimeMap, Map<String, Date> endTimeMap) {
- Trie.TrieBuilder builder = Trie.builder();
- TrsPolicyWords results = new TrsPolicyWords();
- results.setStatus("已通过");
- List<TrsPolicyWords> results1 = trsPolicyWordsMapper.selectTrsPolicyWordsList(results);
- for (TrsPolicyWords sensitiveWord : results1) {
- String wrongWord = sensitiveWord.getWrongWord();
- builder.addKeyword(wrongWord);
- policyErrorRuleMap.put(wrongWord, sensitiveWord.getErrorRule());
- policyErrorTypeMap.put(wrongWord, sensitiveWord.getErrorType());
- startTimeMap.put(wrongWord, sensitiveWord.getEffectstart());
- endTimeMap.put(wrongWord, sensitiveWord.getEffectend());
- }
- TrsPolicyWordsTenant trsPolicyWordsTenant = new TrsPolicyWordsTenant();
- trsPolicyWordsTenant.setStatus("已通过");
- List<TrsPolicyWordsTenant> trsPolicyWordsTenants = trsPolicyWordsTenantMapper.selectTrsPolicyWordsList(trsPolicyWordsTenant);
- for (TrsPolicyWordsTenant sensitiveWord : trsPolicyWordsTenants) {
- String wrongWord = sensitiveWord.getWrongWord();
- builder.addKeyword(wrongWord);
- policyErrorRuleMap.put(wrongWord, sensitiveWord.getErrorRule());
- policyErrorTypeMap.put(wrongWord, sensitiveWord.getErrorType());
- startTimeMap.put(wrongWord, sensitiveWord.getEffectstart());
- endTimeMap.put(wrongWord, sensitiveWord.getEffectend());
- }
- return builder.build();
- }
- public Trie loadNameWords(Map<String, String> nameCountryMap, Map<String, String> nameWholeMap, Map<String, String> nameShortMap, Map<String, String> nameSortMap) {
- Trie.TrieBuilder builder = Trie.builder();
- TrsNameWords results = new TrsNameWords();
- results.setStatus("已通过");
- List<TrsNameWords> results1 = trsNameWordsMapper.selectTrsNameWordsList(results);
- for (TrsNameWords sensitiveWord : results1) {
- String wrongWord = sensitiveWord.getWrongWord();
- builder.addKeyword(wrongWord);
- nameCountryMap.put(wrongWord, sensitiveWord.getCountry());
- nameWholeMap.put(wrongWord, sensitiveWord.getDutyAll());
- nameShortMap.put(wrongWord, sensitiveWord.getDutyShort());
- nameSortMap.put(wrongWord, sensitiveWord.getSort());
- }
- return builder.build();
- }
- public Trie loadDomainWords(Map<String, String> domainMap) {
- Trie.TrieBuilder builder = Trie.builder();
- TrsDomainWords results = new TrsDomainWords();
- results.setStatus("已通过");
- List<TrsDomainWords> results1 = trsDomainWordsMapper.selectTrsDomainWordsList(results);
- for (TrsDomainWords sensitiveWord : results1) {
- String wrongWord = sensitiveWord.getWrongWord();
- builder.addKeyword(wrongWord);
- domainMap.put(wrongWord, sensitiveWord.getDomainType());
- }
- return builder.build();
- }
- // 加载一批文章
- /* public List<TrsArticleInfo> loadArticlesBatch(String scrollId, Long id) {
- List<TrsArticleInfo> articles = new ArrayList<>();
- SearchScrollRequest scrollRequest = new SearchScrollRequest(scrollId);
- scrollRequest.scroll(TimeValue.timeValueMinutes(1L));
- SearchResponse response = null;
- try {
- response = client.scroll(scrollRequest, RequestOptions.DEFAULT);
- } catch (IOException e) {
- e.printStackTrace();
- }
- //通过response获取命中的数量
- System.out.println("response获取命中的数量:" + response.getHits().getTotalHits().value);
- long totalHits = response.getHits().getTotalHits().value;
- if (totalHits < 100 && totalHits>0) {
- SearchRequest searchRequest = new SearchRequest("articles");
- SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
- BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery();
- boolQueryBuilder.filter(QueryBuilders.termQuery("record", id));
- searchRequest.source(sourceBuilder);
- sourceBuilder.query(boolQueryBuilder);
- try {
- response = client.search(searchRequest, RequestOptions.DEFAULT);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- for (SearchHit hit : response.getHits().getHits()) {
- Map<String, Object> source = hit.getSourceAsMap();
- TrsArticleInfo article =new TrsArticleInfo();
- article.setContent((String) source.get("content"));
- article.setUrl((String) source.get("page"));
- articles.add(article);
- }
- return articles;
- }*/
- private Set<String> whitelistPhrases = new HashSet<>();
- public void loadWhitelist() {
- // 这里可以从数据库或配置文件加载白名单
- /*whitelistPhrases.add("内蒙古自治区");
- whitelistPhrases.add("中国银行");
- whitelistPhrases.add("中华人民共和国");*/
- TrsSensitiveWords trsSensitiveWords = new TrsSensitiveWords();
- trsSensitiveWords.setFlag(0);
- trsSensitiveWordsMapper.selectTrsSensitiveWordsListBy(new TrsSensitiveWords()).forEach(sensitiveWord -> {
- whitelistPhrases.add(sensitiveWord.getWrongWord());
- });
- }
- private boolean isInWhitelist(String text, int start, int end) {
- for (String phrase : whitelistPhrases) {
- int phraseStart = start - (phrase.length() - (end - start)); // 计算完整短语的起点
- if (phraseStart >= 0 && phraseStart + phrase.length() <= text.length()) {
- String subText = text.substring(phraseStart, phraseStart + phrase.length());
- if (whitelistPhrases.contains(subText)) {
- return true; // 如果匹配到白名单短语,则跳过
- }
- }
- }
- return false;
- }
- @Override
- public void docon(Long id) {
- //TrsSiteconfig trsSiteconfig = trsSiteconfigMapper.selectTrsSiteconfigById(id);
- //loadAllowedDomains(trsSiteconfig.getDomain());
- loadTrsErrorUrls();
- TrsSiteconfig siteconfig = new TrsSiteconfig();
- List<TrsSiteconfig> siteList = trsSiteconfigMapper.selectTrsSiteconfigList2(siteconfig);
- for (TrsSiteconfig site : siteList) {
- // 设置分页参数
- int pageNum = 1; // 当前页码
- int pageSize = 500; // 每页大小,根据实际情况调整
- boolean hasMoreData = true;
- TrsArticleInfo trsArticleInfo = new TrsArticleInfo();
- trsArticleInfo.setSitename(site.getSitename());
- trsArticleInfo.setRecord(site.getRecord());
- while (hasMoreData) {
- try {
- // 1. 设置分页参数(关键步骤)
- PageHelper.startPage(pageNum, pageSize);
- // 2. 紧跟着的第一个查询会被分页
- List<TrsArticleInfo> articles = trsArticleInfoMapper.selectTrsArticleInfoList(trsArticleInfo);
- if (articles == null || articles.isEmpty()) {
- hasMoreData = false;
- } else {
- // 4. 处理当前页数据
- processUrls(articles,site,id);
- // 5. 准备查询下一页
- // 重要:检查是否已经是最后一页
- if (articles.size() < pageSize) {
- hasMoreData = false;
- } else {
- pageNum++;
- }
- }
- } finally {
- // 7. 确保每次循环后清除分页参数(重要!)
- PageHelper.clearPage();
- }
- }
- }
- }
- private Set<String> TRS_ERROR_URLS = new HashSet<>();
- private void loadTrsErrorUrls() {
- TrsErrorUrl trsErrorUrl = new TrsErrorUrl();
- List<TrsErrorUrl> trsErrorUrls = trsErrorUrlMapper.selectTrsErrorUrlList(trsErrorUrl);
- for (TrsErrorUrl url : trsErrorUrls) {
- TRS_ERROR_URLS.add(url.getErrorurl());
- }
- }
- private boolean isErrorLink(String url) {
- try {
- if (url == null || url.isEmpty()) {
- return false;
- }
- URL parsedUrl = new URL(url);
- String host = parsedUrl.getHost();
- return TRS_ERROR_URLS.contains(host);
- } catch (Exception e) {
- e.printStackTrace();
- for (String url2 : TRS_ERROR_URLS) {
- if (url.contains(url2)) {
- return true;
- }
- }
- return false;
- }
- }
- private Set<String> ALLOWED_DOMAINS = new HashSet<>();
- private ArrayList<String> ALLOWED_DOMAINS2 = new ArrayList<>();
- public void loadAllowedDomains(String domain) {
- ALLOWED_DOMAINS.add(domain);
- ALLOWED_DOMAINS2.add(domain);
- }
- /**
- * 判断 URL 是否是外链
- *
- * @param url URL 地址
- * @param domain
- * @return true 是外链,false 不是外链
- */
- private boolean isExternalLink(String url, String domain) {
- try {
- if (url == null || url.isEmpty()) {
- return false;
- }
- if ("mp.weixin.qq.com".equals(domain)&&(url.contains("res.wx.qq.com")||url.contains("captcha.gtimg.com"))) {
- return false;
- }
- URL parsedUrl = new URL(url);
- String host = parsedUrl.getHost();
- // return !ALLOWED_DOMAINS.contains(host);
- return !host.contains(domain);
- } catch (Exception e) {
- e.printStackTrace();
- return true; // 如果解析失败,默认认为是外链
- }
- }
- /**
- * 判断 URL 是否是错断链
- *
- * @param url URL 地址
- * @return true 是错断链,false 不是错断链
- */
- public boolean isBrokenLink(String url) {
- Request request = new Request.Builder()
- .url(url)
- .head()
- .build();
- try (Response response = okHttpClient.newCall(request).execute()) {
- int responseCode = response.code();
- // return responseCode == 404 || responseCode >= 400;
- return responseCode == 404 || responseCode == 500 || responseCode == 502;
- } catch (IOException e) {
- return false;
- }
- }
- public String fetchHtmlContent(String url) {
- Request request = new Request.Builder()
- .url(url)
- .get() // 使用 GET 请求(默认方法,可省略)
- .build();
- try (Response response = okHttpClient.newCall(request).execute()) {
- if (!response.isSuccessful()) {
- throw new IOException("请求失败,HTTP 状态码: " + response.code());
- }
- // 返回 HTML 内容(假设响应是文本/HTML)
- return response.body().string();
- } catch (IOException e) {
- e.printStackTrace();
- }
- return null;
- }
- /* private boolean isBrokenLink(String url) {
- HttpURLConnection connection = null;
- try {
- URL parsedUrl = new URL(url);
- // Configure to ignore SSL verification for HTTPS URLs
- if (parsedUrl.getProtocol().equalsIgnoreCase("https")) {
- // Create a trust manager that does not validate certificate chains
- TrustManager[] trustAllCerts = new TrustManager[]{
- new X509TrustManager() {
- public X509Certificate[] getAcceptedIssuers() {
- return new X509Certificate[0];
- }
- public void checkClientTrusted(X509Certificate[] certs, String authType) {
- }
- public void checkServerTrusted(X509Certificate[] certs, String authType) {
- }
- }
- };
- // Install the all-trusting trust manager
- SSLContext sc = SSLContext.getInstance("SSL");
- sc.init(null, trustAllCerts, new java.security.SecureRandom());
- HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
- // Create all-trusting host name verifier
- HostnameVerifier allHostsValid = (hostname, session) -> true;
- HttpsURLConnection.setDefaultHostnameVerifier(allHostsValid);
- }
- connection = (HttpURLConnection) parsedUrl.openConnection();
- connection.setRequestMethod("HEAD"); // Use HEAD method to only get headers
- connection.setConnectTimeout(60000); // Set connection timeout
- connection.setReadTimeout(60000); // Set read timeout
- connection.setInstanceFollowRedirects(true); // Follow redirects
- int responseCode = connection.getResponseCode();
- if(responseCode == HttpURLConnection.HTTP_NOT_FOUND || responseCode >= 400){
- System.out.println(1111111);
- }
- return responseCode == HttpURLConnection.HTTP_NOT_FOUND || responseCode >= 400;
- } catch (IOException e) {
- return false; // If request fails, consider it a broken link
- } catch (Exception e) {
- return false; // For any other exception, consider it a broken link
- } finally {
- if (connection != null) {
- connection.disconnect(); // Close connection
- }
- }
- }*/
- /**
- * 获取 URL 的内容
- *
- * @param url URL 地址
- * @return URL 的内容,如果获取失败则返回 null
- */
- private String getUrlContent(String url) {
- HttpURLConnection connection = null;
- try {
- URL parsedUrl = new URL(url);
- connection = (HttpURLConnection) parsedUrl.openConnection();
- connection.setRequestMethod("GET"); // 使用 GET 方法获取内容
- connection.setConnectTimeout(5000); // 设置连接超时
- connection.setReadTimeout(5000); // 设置读取超时
- int responseCode = connection.getResponseCode();
- if (responseCode == HttpURLConnection.HTTP_OK) {
- InputStream inputStream = connection.getInputStream();
- Scanner scanner = new Scanner(inputStream).useDelimiter("\\A");
- return scanner.hasNext() ? scanner.next() : null;
- } else {
- return null; // 如果状态码不是 200,返回 null
- }
- } catch (IOException e) {
- e.printStackTrace();
- return null; // 如果请求失败,返回 null
- } finally {
- if (connection != null) {
- connection.disconnect(); // 关闭连接
- }
- }
- }
- /**
- * 处理单个 URL
- *
- * @param
- * @param id
- * @return 处理结果
- */
- private String processSingleUrl(TrsArticleInfo trsArticleInfo, TrsSiteconfig siteconfig, Long id) {
- TrsUrlResult trsUrlResult = new TrsUrlResult();
- trsUrlResult.setUrl(trsArticleInfo.getUrl());
- trsUrlResult.setRecord(trsArticleInfo.getRecord());
- trsUrlResult.setSitename(trsArticleInfo.getSitename());
- trsUrlResult.setTitle(trsArticleInfo.getTitle());
- trsUrlResult.setSitetag(siteconfig.getMsg());
- trsUrlResult.setMsg("-");
- List<TrsUrlResult> trsUrlResults = trsUrlResultMapper.selectTrsUrlResultList2(trsUrlResult);
- boolean isExist = trsUrlResults.size() < 1;
- if(isExist && isErrorLink(trsArticleInfo.getUrl())){
- trsUrlResult.setTenantId(siteconfig.getTenantId());
- trsUrlResult.setTime(new Date());
- trsUrlResult.setLinkurl(trsArticleInfo.getUrl());
- trsUrlResult.setType("敏感链接");
- trsUrlResultMapper.insertTrsUrlResult(trsUrlResult);
- } else if (isExist && isBrokenLink(trsArticleInfo.getUrl())) {
- trsUrlResult.setTenantId(siteconfig.getTenantId());
- trsUrlResult.setTime(new Date());
- trsUrlResult.setLinkurl(trsArticleInfo.getUrl());
- trsUrlResult.setType("错断链");
- trsUrlResultMapper.insertTrsUrlResult(trsUrlResult);
- return "错断链: " + trsArticleInfo.getUrl();
- }else if (isExist && isExternalLink(trsArticleInfo.getUrl(),siteconfig.getDomain())) {
- trsUrlResult.setTenantId(siteconfig.getTenantId());
- trsUrlResult.setTime(new Date());
- trsUrlResult.setLinkurl(trsArticleInfo.getUrl());
- trsUrlResult.setType("外链");
- trsUrlResultMapper.insertTrsUrlResult(trsUrlResult);
- return "外链: " + trsArticleInfo.getUrl();
- } else {
- // String content = getUrlContent(trsArticleInfo.getUrl());
- // String content = trsArticleInfo.getContent();
- String content = "";
- if(id != null && 100 == id){
- content = fetchHtmlContent(trsArticleInfo.getUrl());
- }else {
- content = trsArticleInfo.getContent();
- }
- if (content != null && !"".equals(content)) {
- for (String link : TRS_ERROR_URLS) {
- if(content.contains(link)){
- trsUrlResult.setTenantId(siteconfig.getTenantId());
- trsUrlResult.setMsg("-");
- trsUrlResult.setType("敏感链接");
- trsUrlResult.setLinkurl(link);
- List<TrsUrlResult> trsUrlResults2 = trsUrlResultMapper.selectTrsUrlResultList2(trsUrlResult);
- boolean isExist2 = trsUrlResults2.size() < 1;
- if(isExist2)
- trsUrlResultMapper.insertTrsUrlResult(trsUrlResult);
- }
- }
- String contentNoHtml = HtmlUtils.removeAllTags(content);
- if(SensitiveInfoDetector.containsIdCard(contentNoHtml)){
- for (String idCard : SensitiveInfoDetector.extractIdCards(contentNoHtml)){
- if(SensitiveInfoDetector.extractContext2(contentNoHtml, idCard).contains("."))continue;
- TrsPersonResult trsPersonResult = new TrsPersonResult();
- trsPersonResult.setPage(trsArticleInfo.getUrl());
- trsPersonResult.setSitename(trsArticleInfo.getSitename());
- trsPersonResult.setSitetag(siteconfig.getMsg());
- trsPersonResult.setTitle(trsArticleInfo.getTitle());
- trsPersonResult.setWrongWord(idCard);
- trsPersonResult.setWrongType("身份证号码");
- trsPersonResult.setStatus("待处理");
- trsPersonResult.setContext(SensitiveInfoDetector.extractContext(contentNoHtml, idCard));
- trsPersonResult.setAllcontext(content);
- trsPersonResult.setDatakey(Md5Utils.hash(idCard + trsArticleInfo.getUrl()));
- trsPersonResult.setTenantId(siteconfig.getTenantId());
- List<TrsPersonResult> trsUrlResults2 = trsPersonResultMapper.selectTrsPersonResultList2(trsPersonResult);
- trsPersonResult.setTime(new Date());
- boolean isExist2 = trsUrlResults2.size() < 1;
- if(isExist2)
- trsPersonResultMapper.insertTrsPersonResult(trsPersonResult);
- }
- }
- if(SensitiveInfoDetector.containsPhone(contentNoHtml)){
- for (String idCard : SensitiveInfoDetector.extractPhones(contentNoHtml)){
- if(SensitiveInfoDetector.extractContext2(contentNoHtml, idCard).contains("."))continue;
- TrsPersonResult trsPersonResult = new TrsPersonResult();
- trsPersonResult.setPage(trsArticleInfo.getUrl());
- trsPersonResult.setSitename(trsArticleInfo.getSitename());
- trsPersonResult.setSitetag(siteconfig.getMsg());
- trsPersonResult.setTitle(trsArticleInfo.getTitle());
- trsPersonResult.setWrongWord(idCard);
- trsPersonResult.setWrongType("手机号码");
- trsPersonResult.setStatus("待处理");
- trsPersonResult.setContext(SensitiveInfoDetector.extractContext(contentNoHtml, idCard));
- trsPersonResult.setAllcontext(content);
- trsPersonResult.setDatakey(Md5Utils.hash(idCard + trsArticleInfo.getUrl()));
- trsPersonResult.setTenantId(siteconfig.getTenantId());
- List<TrsPersonResult> trsUrlResults2 = trsPersonResultMapper.selectTrsPersonResultList2(trsPersonResult);
- trsPersonResult.setTime(new Date());
- boolean isExist2 = trsUrlResults2.size() < 1;
- if(isExist2)
- trsPersonResultMapper.insertTrsPersonResult(trsPersonResult);
- }
- }
- if(SensitiveInfoDetector.containsBankCard(contentNoHtml)){
- for (String idCard : SensitiveInfoDetector.extractBankCards(contentNoHtml)){
- if(SensitiveInfoDetector.extractContext2(contentNoHtml, idCard).contains("."))continue;
- TrsPersonResult trsPersonResult = new TrsPersonResult();
- trsPersonResult.setPage(trsArticleInfo.getUrl());
- trsPersonResult.setSitename(trsArticleInfo.getSitename());
- trsPersonResult.setSitetag(siteconfig.getMsg());
- trsPersonResult.setTitle(trsArticleInfo.getTitle());
- trsPersonResult.setWrongWord(idCard);
- trsPersonResult.setWrongType("银行卡号");
- trsPersonResult.setStatus("待处理");
- trsPersonResult.setContext(SensitiveInfoDetector.extractContext(contentNoHtml, idCard));
- trsPersonResult.setAllcontext(content);
- trsPersonResult.setDatakey(Md5Utils.hash(idCard + trsArticleInfo.getUrl()));
- trsPersonResult.setTenantId(siteconfig.getTenantId());
- List<TrsPersonResult> trsUrlResults2 = trsPersonResultMapper.selectTrsPersonResultList2(trsPersonResult);
- trsPersonResult.setTime(new Date());
- boolean isExist2 = trsUrlResults2.size() < 1;
- if(isExist2)
- trsPersonResultMapper.insertTrsPersonResult(trsPersonResult);
- }
- }
- // 检查内容中的链接
- checkContentLinks(trsArticleInfo, content, trsUrlResult, siteconfig.getDomain(), siteconfig.getTenantId());
- //更新到数据库中
- // trsArticleInfo.setContent(content);
- // trsArticleInfoMapper.updateTrsArticleInfo(trsArticleInfo);
- return "URL 内容: " + content.substring(0, Math.min(content.length(), 100)) + "...";
- } else {
- return "无法获取 URL 内容: " + trsArticleInfo.getUrl();
- }
- }
- return "";
- }
- /**
- * 多线程处理 URL 列表
- *
- * @param urls URL 列表
- * @return 处理结果列表
- */
- // 线程池
- private final ExecutorService executorService = Executors.newFixedThreadPool(1);
- public ConcurrentHashMap<String, String> processUrls(List<TrsArticleInfo> trsArticleInfos, TrsSiteconfig siteconfig, Long id) {
- ConcurrentHashMap<String, String> resultMap = new ConcurrentHashMap<>();
- CountDownLatch latch = new CountDownLatch(trsArticleInfos.size()); // 用于等待所有任务完成
- // 提交任务到线程池
- for (TrsArticleInfo articleInfo : trsArticleInfos) {
- executorService.submit(() -> {
- try {
- String result = processSingleUrl(articleInfo,siteconfig,id);
- resultMap.put(articleInfo.getUrl(), result);
- }catch (Exception e){
- e.printStackTrace();
- }finally {
- latch.countDown(); // 任务完成,计数器减一
- }
- });
- }
- try {
- latch.await(); // 等待所有任务完成
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- return resultMap;
- }
- /**
- * 关闭线程池
- */
- @PreDestroy
- public void shutdown() {
- executorService.shutdown();
- try {
- if (!executorService.awaitTermination(1, TimeUnit.MINUTES)) {
- executorService.shutdownNow();
- }
- } catch (InterruptedException e) {
- executorService.shutdownNow();
- }
- }
- private List<String> extractLinksFromContent(String content) {
- List<String> links = new ArrayList<>();
- // 正则表达式匹配href和src链接
- Pattern pattern = Pattern.compile("(href|src)=\"([^\"]*)\"");
- Matcher matcher = pattern.matcher(content);
- while (matcher.find()) {
- String link = matcher.group(2);
- if (link != null && !link.isEmpty() && !link.startsWith("#") && !link.startsWith("javascript:")&& !link.equals("./")) {
- links.add(link);
- }
- }
- return links;
- }
- private void checkContentLinks(TrsArticleInfo articleInfo, String content, TrsUrlResult trsUrlResult, String domain, String tenantId) {
- List<String> links = extractLinksFromContent(content);
- for (String link : links) {
- try {
- // 处理相对路径
- String absoluteLink = makeAbsoluteUrl(articleInfo.getUrl(), link);
- trsUrlResult.setMsg("-");
- trsUrlResult.setLinkurl(absoluteLink);
- List<TrsUrlResult> trsUrlResults = trsUrlResultMapper.selectTrsUrlResultList2(trsUrlResult);
- boolean isExist = trsUrlResults.size() < 1;
- trsUrlResult.setTime(new Date());
- if(isExist&&isErrorLink(absoluteLink)){
- trsUrlResult.setTenantId(tenantId);
- trsUrlResult.setType("敏感链接");
- trsUrlResultMapper.insertTrsUrlResult(trsUrlResult);
- } else if (isExist&&isBrokenLink(absoluteLink)) {
- // 记录错断链
- trsUrlResult.setTenantId(tenantId);
- trsUrlResult.setType("错断链");
- if(absoluteLink.contains("exPlay")||absoluteLink.contains("mp4"))continue;
- if(absoluteLink.contains("beian"))continue;
- if(absoluteLink.contains("img.henan"))continue;
- if(absoluteLink.contains("baike.baidu.com"))continue;
- trsUrlResultMapper.insertTrsUrlResult(trsUrlResult);
- //recordLinkIssue(articleInfo, absoluteLink, "错断链");
- }else if (isExist&&isExternalLink(absoluteLink, domain)) {
- // 记录外链
- trsUrlResult.setType("外链");
- trsUrlResultMapper.insertTrsUrlResult(trsUrlResult);
- //recordLinkIssue(articleInfo, absoluteLink, "外链");
- }
- } catch (Exception e) {
- // 记录无效链接
- recordLinkIssue(articleInfo, link, "无效链接");
- }
- }
- }
- private String makeAbsoluteUrl(String baseUrl, String link) throws MalformedURLException {
- if (link.startsWith("http://") || link.startsWith("https://")) {
- return link;
- }
- URL base = null;
- try {
- base = new URL(baseUrl);
- } catch (MalformedURLException e) {
- throw new RuntimeException(e);
- }
- return new URL(base, link).toString();
- }
- private void recordLinkIssue(TrsArticleInfo articleInfo, String link, String issueType) {
- // 这里可以记录到数据库或日志中
- System.out.println("文章ID: " + articleInfo.getId() + ", URL: " + articleInfo.getUrl() +
- ", 问题链接: " + link + ", 问题类型: " + issueType);
- // 如果需要保存到数据库,可以创建一个新的表来存储这些信息
- // linkIssueMapper.insert(new LinkIssue(articleInfo.getId(), link, issueType));
- }
- }
|