// SplitTools.java
  1. package org.cnnlp.data.splitter;
  2. import org.cnnlp.data.document.GDocument;
  3. import org.cnnlp.data.util.BaseParameters;
  4. import java.io.File;
  5. import java.io.IOException;
  6. import java.nio.file.Files;
  7. import java.nio.file.Path;
  8. import java.nio.file.Paths;
  9. import java.util.List;
  10. import java.util.stream.Stream;
  11. public class SplitTools {
  12. public SplitTools() {
  13. }
  14. public void splitFaq(Path path) throws IOException {
  15. System.out.println("Process " + path.toString());
  16. String json = path.toString() + ".json";
  17. FaqMdSplitter splitter = new FaqMdSplitter();
  18. BaseParameters params = BaseParameters.defaultParams();
  19. List<GDocument> docs = splitter.split(path, params);
  20. SplitUtils.toJsonFile(new File(json), docs);
  21. }
  22. public static boolean isMd(Path p) {
  23. if (Files.isRegularFile(p)) {
  24. boolean b = p.toString().endsWith(".md");
  25. return b;
  26. }
  27. return false;
  28. }
  29. public void splitSimple(Path path) throws IOException {
  30. System.out.println("Process " + path.toString());
  31. String json = path.toString() + ".json";
  32. SimpleMdSplitter splitter = new SimpleMdSplitter();
  33. splitter.setCommentProcessor(KvCommentProcessor.build());
  34. BaseParameters params = BaseParameters.defaultParams();
  35. // Path path = Paths.get(md);
  36. //String baseName = SplitUtils.getFileBaseName(path);
  37. //System.out.println("baseName=" + baseName);
  38. List<GDocument> docs = splitter.split(path, params);
  39. SplitUtils.toJsonFile(new File(json), docs);
  40. }
  41. public void splitSimpleDir(Path p) throws IOException {
  42. Stream<Path> fl = Files.list(p);
  43. fl.forEach(p1 -> {
  44. try {
  45. // if (isMd(f)) {
  46. // tools.splitSimple(f);
  47. // }
  48. //System.out.println(p1);
  49. File file = p1.toFile();
  50. if (file.isDirectory()) {
  51. splitSimpleDir(p1);
  52. } else {
  53. System.out.println(p1);
  54. splitSimple(p1);
  55. }
  56. } catch (Exception e) {
  57. throw new RuntimeException(e);
  58. }
  59. });
  60. }
  61. public static void main(String[] args) throws IOException {
  62. String md = "D:\\testdata\\md\\官网银行卡知识.1.md";
  63. md = "D:\\testdata\\md\\海底捞员工手册.md";
  64. md = "D:\\testdata\\md\\列表1.md";
  65. md = "D:\\data\\乙烯\\乙烯装置技术与运行测试.md";
  66. md = "D:\\data\\乙烯\\乙烯1.md";
  67. md = "D:\\data\\乙烯\\催化裂化操作规程2024.4.28.md";
  68. // String json = md + ".json";
  69. SplitTools tools = new SplitTools();
  70. // tools.splitSimple(Paths.get(md));
  71. // String p = "D:\\data\\乙烯\\target";
  72. // tools.splitSimpleDir(Paths.get(p));
  73. String p = "D:\\data\\乙烯\\规章制度\\中国石油四川石化有限责任公司原油采购管理办法.md";
  74. tools.splitSimple(Paths.get(p));
  75. // String dir = "D:\\data\\乙烯\\0624\\安全技术说明书完善\\";
  76. // tools.splitSimpleDir(Paths.get(dir));
  77. // String p = "D:\\data\\乙烯\\2\\";
  78. // Stream<Path> fl = Files.list(Path.of(p));
  79. // fl.forEach(f->{
  80. // try {
  81. // if (isMd(f)){
  82. // tools.splitSimple(f);
  83. // }
  84. // } catch (IOException e) {
  85. // throw new RuntimeException(e);
  86. // }
  87. // });
  88. //
  89. // String p2 = "D:\\data\\乙烯\\2\\faq\\";
  90. // Stream<Path> fl2 = Files.list(Path.of(p2));
  91. // fl2.forEach(f->{
  92. // try {
  93. // if (isMd(f)){
  94. // tools.splitFaq(f);
  95. // }
  96. // } catch (IOException e) {
  97. // throw new RuntimeException(e);
  98. // }
  99. // });
  100. }
  101. }