|
|
@@ -166,6 +166,53 @@ public class DocumentUtil {
|
|
|
return content;
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * 智能提取文件内容,优先根据扩展名判断
|
|
|
+ */
|
|
|
+ public String smartExtractContent(MultipartFile file, String fileType) throws IOException {
|
|
|
+ String fileName = file.getOriginalFilename();
|
|
|
+
|
|
|
+ // 优先根据扩展名判断
|
|
|
+ if (fileName != null) {
|
|
|
+ String lowerFileName = fileName.toLowerCase();
|
|
|
+
|
|
|
+ // 处理Excel文件
|
|
|
+ if (lowerFileName.endsWith(".xlsx")) {
|
|
|
+ try (InputStream inputStream = file.getInputStream()) {
|
|
|
+ return extractXLSXContent(inputStream);
|
|
|
+ }
|
|
|
+ } else if (lowerFileName.endsWith(".xls")) {
|
|
|
+ // 如果是.xls文件,即使魔数识别为wps,也尝试按xls处理
|
|
|
+ try (InputStream inputStream = file.getInputStream()) {
|
|
|
+ return extractXLSContent(inputStream);
|
|
|
+ } catch (Exception e) {
|
|
|
+ // 如果按xls处理失败,且魔数识别为wps,则尝试按wps表格处理
|
|
|
+ if ("wps".equals(fileType)) {
|
|
|
+ try (InputStream inputStream = file.getInputStream()) {
|
|
|
+ return extractWPSETContent(inputStream);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ throw e;
|
|
|
+ }
|
|
|
+ } else if (lowerFileName.endsWith(".et")) {
|
|
|
+ // WPS表格文件
|
|
|
+ try (InputStream inputStream = file.getInputStream()) {
|
|
|
+ return extractWPSETContent(inputStream);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 处理其他文件
|
|
|
+ try (InputStream inputStream = file.getInputStream()) {
|
|
|
+ return extractContent(fileType, inputStream);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 如果没有扩展名,按原逻辑处理
|
|
|
+ try (InputStream inputStream = file.getInputStream()) {
|
|
|
+ return extractContent(fileType, inputStream);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* 提取WPS文档内容(.wps, .wpt格式)
|
|
|
*/
|
|
|
@@ -485,7 +532,8 @@ public class DocumentUtil {
|
|
|
extractedText.append("\n");
|
|
|
}
|
|
|
//
|
|
|
- return extractedText.toString().replaceAll("\\r?\\n", "</br>").replaceAll("\\t", " ");
|
|
|
+// return extractedText.toString().replaceAll("\\r?\\n", "</br>").replaceAll("\\t", " ");
|
|
|
+ return extractedText.toString().replaceAll("\\r?\\n", "</br>").replaceAll(" ", "");
|
|
|
}catch (Exception e){
|
|
|
e.printStackTrace();
|
|
|
}finally {
|
|
|
@@ -538,7 +586,8 @@ public class DocumentUtil {
|
|
|
}
|
|
|
extractedText.append("\n");
|
|
|
}
|
|
|
- return extractedText.toString().replaceAll("\\r?\\n", "</br>").replaceAll("\\t", " ");
|
|
|
+// return extractedText.toString().replaceAll("\\r?\\n", "</br>").replaceAll("\\t", " ");
|
|
|
+ return extractedText.toString().replaceAll("\\r?\\n", "</br>").replaceAll(" ", "");
|
|
|
}catch (Exception e){
|
|
|
e.printStackTrace();
|
|
|
}finally {
|