SimpleMdSplitter.java 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. package org.cnnlp.data.splitter;
  2. import com.vladsch.flexmark.util.ast.Node;
  3. import org.cnnlp.data.book.GTBookConstants;
  4. import org.cnnlp.data.book.IElement;
  5. import org.cnnlp.data.document.GDocConstants;
  6. import org.cnnlp.data.document.GDocument;
  7. import org.cnnlp.data.util.BaseParameters;
  8. import org.cnnlp.data.util.id.IdGenerator;
  9. import org.cnnlp.data.util.id.UlidGenerator;
  10. import java.io.File;
  11. import java.io.IOException;
  12. import java.nio.file.Files;
  13. import java.nio.file.Path;
  14. import java.nio.file.Paths;
  15. import java.util.ArrayList;
  16. import java.util.LinkedHashMap;
  17. import java.util.List;
  18. import java.util.Map;
  19. // MdSimpleSplitter 用于将 md 按照 headline 的outline拆分,同时 把<!--key: value --> 解析到metadata中
  20. public class SimpleMdSplitter extends BaseMdParser implements ISplitter {
  21. private static final int MIN_PARAGRAPH_LEN = 32;
  22. protected IdGenerator idGen = new UlidGenerator();
  23. protected ICommentProcessor commentProcessor;
  24. protected BaseParameters params;
  25. public SimpleMdSplitter() {
  26. }
  27. public void setIdGenerator(IdGenerator idGen) {
  28. this.idGen = idGen;
  29. }
  30. public void setCommentProcessor(ICommentProcessor commentProcessor) {
  31. this.commentProcessor = commentProcessor;
  32. }
  33. private void mergeMap(Map<String, Object> dest, Map<String, Object> src) {
  34. if (src == null || src.size() <= 0) return;
  35. src.forEach((k, v) -> {
  36. Object o = dest.get(k);
  37. if (o == null) {
  38. dest.put(k, v);
  39. } else {
  40. if (o instanceof List) {
  41. List ls = (List) o;
  42. ls.add(v);
  43. dest.put(k, ls);
  44. } else {
  45. List ls = new ArrayList();
  46. ls.add(o);
  47. ls.add(v);
  48. dest.put(k, ls);
  49. }
  50. }
  51. });
  52. }
  53. protected Map<String, Object> processComment(int beginIdx, int endIdx) {
  54. Map<String, Object> rets = new LinkedHashMap<>();
  55. if (commentProcessor != null) {
  56. if (beginIdx >= 1) {
  57. GTNode2 nd = nodes[beginIdx - 1];
  58. if (nd.getType() == GTBookConstants.MD_COMMENTS) {
  59. Map<String, Object> meta1 = processComment1(beginIdx - 1, beginIdx);
  60. mergeMap(rets, meta1);
  61. }
  62. }
  63. for (int i = beginIdx; i < endIdx; i++) {
  64. GTNode2 nd = nodes[i];
  65. if (nd.getType() == GTBookConstants.MD_COMMENTS) {
  66. Map<String, Object> meta1 = processComment1(i, i + 1);
  67. mergeMap(rets, meta1);
  68. }
  69. }
  70. }
  71. return rets;
  72. }
  73. protected Map<String, Object> processComment1(int beginIdx, int endIdx) {
  74. List<String> hs = getHtmls(beginIdx, endIdx);
  75. if (hs != null && hs.size() > 0) {
  76. List<String> hs2 = SplitUtils.mdStringToList(hs);
  77. Map<String, Object> meta1 = HtmlCommentParser.parseKv(hs2);
  78. return meta1;
  79. } else {
  80. return Map.of();
  81. }
  82. }
  83. protected Map<String, Object> getMetadata(int beginIdx, int endIdx) {
  84. Map<String, Object> metadata = new LinkedHashMap<>();
  85. Object o = params.get(GDocConstants.FILE_NAME);
  86. if (o != null) {
  87. metadata.put(GDocConstants.FILE_NAME, o);
  88. }
  89. metadata.put(GDocConstants.FROM_IDX, beginIdx);
  90. metadata.put(GDocConstants.TO_IDX, endIdx);
  91. Map<String, Object> cm = processComment(beginIdx, endIdx);
  92. if (cm != null) {
  93. metadata.putAll(cm);
  94. }
  95. return metadata;
  96. }
  97. @Override
  98. public List<GDocument> split(String md, BaseParameters params) {
  99. this.params = params;
  100. List<GDocument> docs = new ArrayList<>();
  101. parse(md, params);
  102. root.countingTextSize();
  103. int totalSize = root.countingTextSize();
  104. int[] depthes = docTree.getDepthes();
  105. int[] fathers = docTree.getFathers();
  106. //List<String> outline = getOutline(true);
  107. //System.out.println(outline);
  108. int docLen = depthes.length;
  109. int i = 0;
  110. int nowJ = 0;
  111. while (i < docLen) {
  112. GTNode2 nd = nodes[i];
  113. //int blockSize = nd.getBlockSize();
  114. if (nd.getType() == GTBookConstants.MD_HEADING) {
  115. int nextJ = docLen;
  116. for (int j = i + 1; j < docLen; j++) {
  117. if (nodes[j].getType() == GTBookConstants.MD_HEADING) {
  118. nextJ = j;
  119. break;
  120. }
  121. }
  122. // i..nextJ 之间 的 text 是否大于 minChunkLen
  123. int textLen = getTextLen(nowJ + 1, nextJ);
  124. if (textLen > MIN_PARAGRAPH_LEN) {
  125. List<String> fatherLabels = getFatherLabels(nowJ);
  126. //List<String> htmls = getHtmls(nowJ, nextJ);
  127. List<String> texts = getTexts(nowJ, nextJ);
  128. GDocument.Builder builder = GDocument.builder().id(idGen.generateId()).text(SplitUtils.listToString(texts));
  129. builder.metadata(GDocConstants.CURRENT_SECTION_TOC, fatherLabels);
  130. //builder.metadata(GDocConstants.RAW_CONTENT, SplitUtils.listToString(htmls));
  131. builder.metadata(GDocConstants.FIRST_HEADING, nd.getLabel());
  132. getCharOffset(builder, nowJ, nextJ);
  133. Map<String, Object> metadata = getMetadata(nowJ, nextJ);
  134. metadata.forEach((k, v) -> builder.metadata(k, v));
  135. GDocument doc = builder.build();
  136. docs.add(doc);
  137. //Object o = doc.getMetadata().get(GDocConstants.RAW_CONTENT);
  138. //System.out.println(o);
  139. nowJ = nextJ;
  140. }
  141. i = nextJ;
  142. } else {
  143. i++;
  144. }
  145. }
  146. // 没有任何标题
  147. if (docs.size() <= 0 && i > 0) {
  148. List<String> texts = getTexts(0, docLen);
  149. GDocument.Builder builder = GDocument.builder().id(idGen.generateId()).text(SplitUtils.listToString(texts));
  150. builder.metadata(GDocConstants.CURRENT_SECTION_TOC, List.of(""));
  151. //builder.metadata(GDocConstants.FIRST_HEADING, nd.getLabel());
  152. getCharOffset(builder, 0, docLen);
  153. Map<String, Object> metadata = getMetadata(0, docLen);
  154. metadata.forEach((k, v) -> builder.metadata(k, v));
  155. GDocument doc = builder.build();
  156. docs.add(doc);
  157. }
  158. return docs;
  159. }
  160. protected void getCharOffset(GDocument.Builder builder, int startIndex, int endIndex) {
  161. String text = docTree.getText();
  162. Node mdNode = nodes[startIndex].getMdNode();
  163. int startOffset = mdNode.getStartOffset();
  164. int endOffset = mdNode.getEndOffset();
  165. if (endIndex < nodes.length) {
  166. mdNode = nodes[endIndex].getMdNode();
  167. endOffset = mdNode.getStartOffset();
  168. } else {
  169. endOffset = text.length();
  170. }
  171. String s = text.substring(startOffset, endOffset);
  172. //System.out.println("s="+s);
  173. builder.metadata(GDocConstants.RAW_CONTENT, s);
  174. builder.metadata(GDocConstants.START_OFFSET, startOffset);
  175. builder.metadata(GDocConstants.END_OFFSET, endOffset);
  176. }
  177. private int getNodeTextLength(int index) {
  178. int len = ((IElement) nodes[index].getValue()).getText().get(0).length();
  179. return len;
  180. }
  181. protected int getTextLen(int startIndex, int endIndex) {
  182. int count = 0;
  183. for (int i = startIndex; i < endIndex; i++) {
  184. count = count + getNodeTextLength(i);
  185. }
  186. return count;
  187. }
  188. protected List<String> getHtmls(int startIndex, int endIndex) {
  189. List<String> htmls = new ArrayList<>();
  190. for (int i = startIndex; i < endIndex; i++) {
  191. IElement value = (IElement) nodes[i].getValue();
  192. List<String> html = value.getHtml();
  193. htmls.addAll(html);
  194. }
  195. return htmls;
  196. }
  197. protected List<String> getTexts(int startIndex, int endIndex) {
  198. List<String> texts = new ArrayList<>();
  199. for (int i = startIndex; i < endIndex; i++) {
  200. List<String> txt = ((IElement) nodes[i].getValue()).getText();
  201. texts.addAll(txt);
  202. }
  203. return texts;
  204. }
  205. // 从上至下
  206. protected List<String> getFatherLabels(int index) {
  207. int[] fathers = docTree.getFather(index);
  208. List<String> labels = new ArrayList<>();
  209. if (fathers != null && fathers.length > 0) {
  210. for (int i = fathers.length - 1; i >= 0; i--) {
  211. labels.add(nodes[fathers[i]].getLabel());
  212. }
  213. }
  214. return labels;
  215. }
  216. protected void preProcess(Path path, BaseParameters params) {
  217. if (params.get(GDocConstants.FILE_NAME) == null) {
  218. params.put(GDocConstants.FILE_NAME, SplitUtils.getFileBaseName(path));
  219. }
  220. }
  221. // 留做扩展
  222. protected void postProcess(List<GDocument> docs) {
  223. }
  224. @Override
  225. public List<GDocument> split(Path path, BaseParameters params) throws IOException {
  226. preProcess(path, params);
  227. String md = Files.readString(path);
  228. List<GDocument> rets = split(md, params);
  229. postProcess(rets);
  230. return rets;
  231. }
  232. // public static void main(String[] args) throws IOException {
  233. // String md = "D:\\testdata\\md\\官网银行卡知识.1.md";
  234. // md = "D:\\testdata\\md\\海底捞员工手册.md";
  235. // md = "D:\\testdata\\md\\列表1.md";
  236. // md = "D:\\data\\乙烯\\乙烯装置技术与运行测试.md";
  237. // md = "D:\\data\\乙烯\\乙烯1.md";
  238. // String json = md + ".json";
  239. // SimpleMdSplitter splitter = new SimpleMdSplitter();
  240. // splitter.setCommentProcessor(KvCommentProcessor.build());
  241. // BaseParameters params = BaseParameters.defaultParams();
  242. // Path path = Paths.get(md);
  243. // //String baseName = SplitUtils.getFileBaseName(path);
  244. // //System.out.println("baseName=" + baseName);
  245. // List<GDocument> docs = splitter.split(path, params);
  246. // SplitUtils.toJsonFile(new File(json), docs);
  247. // }
  248. }