package org.cnnlp.data.splitter; import com.vladsch.flexmark.util.ast.Node; import org.cnnlp.data.book.GTBookConstants; import org.cnnlp.data.book.IElement; import org.cnnlp.data.document.GDocConstants; import org.cnnlp.data.document.GDocument; import org.cnnlp.data.util.BaseParameters; import org.cnnlp.data.util.id.IdGenerator; import org.cnnlp.data.util.id.UlidGenerator; import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; // MdSimpleSplitter 用于将 md 按照 headline 的outline拆分,同时 把 解析到metadata中 public class SimpleMdSplitter extends BaseMdParser implements ISplitter { private static final int MIN_PARAGRAPH_LEN = 32; protected IdGenerator idGen = new UlidGenerator(); protected ICommentProcessor commentProcessor; protected BaseParameters params; public SimpleMdSplitter() { } public void setIdGenerator(IdGenerator idGen) { this.idGen = idGen; } public void setCommentProcessor(ICommentProcessor commentProcessor) { this.commentProcessor = commentProcessor; } private void mergeMap(Map dest, Map src) { if (src == null || src.size() <= 0) return; src.forEach((k, v) -> { Object o = dest.get(k); if (o == null) { dest.put(k, v); } else { if (o instanceof List) { List ls = (List) o; ls.add(v); dest.put(k, ls); } else { List ls = new ArrayList(); ls.add(o); ls.add(v); dest.put(k, ls); } } }); } protected Map processComment(int beginIdx, int endIdx) { Map rets = new LinkedHashMap<>(); if (commentProcessor != null) { if (beginIdx >= 1) { GTNode2 nd = nodes[beginIdx - 1]; if (nd.getType() == GTBookConstants.MD_COMMENTS) { Map meta1 = processComment1(beginIdx - 1, beginIdx); mergeMap(rets, meta1); } } for (int i = beginIdx; i < endIdx; i++) { GTNode2 nd = nodes[i]; if (nd.getType() == GTBookConstants.MD_COMMENTS) { Map meta1 = processComment1(i, i + 1); mergeMap(rets, meta1); } } } return rets; } protected Map processComment1(int beginIdx, int endIdx) { List hs = getHtmls(beginIdx, endIdx); if (hs != null && hs.size() > 0) { List hs2 = SplitUtils.mdStringToList(hs); Map meta1 = HtmlCommentParser.parseKv(hs2); return meta1; } else { return Map.of(); } } protected Map getMetadata(int beginIdx, int endIdx) { Map metadata = new LinkedHashMap<>(); Object o = params.get(GDocConstants.FILE_NAME); if (o != null) { metadata.put(GDocConstants.FILE_NAME, o); } metadata.put(GDocConstants.FROM_IDX, beginIdx); metadata.put(GDocConstants.TO_IDX, endIdx); Map cm = processComment(beginIdx, endIdx); if (cm != null) { metadata.putAll(cm); } return metadata; } @Override public List split(String md, BaseParameters params) { this.params = params; List docs = new ArrayList<>(); parse(md, params); root.countingTextSize(); int totalSize = root.countingTextSize(); int[] depthes = docTree.getDepthes(); int[] fathers = docTree.getFathers(); //List outline = getOutline(true); //System.out.println(outline); int docLen = depthes.length; int i = 0; int nowJ = 0; while (i < docLen) { GTNode2 nd = nodes[i]; //int blockSize = nd.getBlockSize(); if (nd.getType() == GTBookConstants.MD_HEADING) { int nextJ = docLen; for (int j = i + 1; j < docLen; j++) { if (nodes[j].getType() == GTBookConstants.MD_HEADING) { nextJ = j; break; } } // i..nextJ 之间 的 text 是否大于 minChunkLen int textLen = getTextLen(nowJ + 1, nextJ); if (textLen > MIN_PARAGRAPH_LEN) { List fatherLabels = getFatherLabels(nowJ); //List htmls = getHtmls(nowJ, nextJ); List texts = getTexts(nowJ, nextJ); GDocument.Builder builder = GDocument.builder().id(idGen.generateId()).text(SplitUtils.listToString(texts)); builder.metadata(GDocConstants.CURRENT_SECTION_TOC, fatherLabels); //builder.metadata(GDocConstants.RAW_CONTENT, SplitUtils.listToString(htmls)); builder.metadata(GDocConstants.FIRST_HEADING, nd.getLabel()); getCharOffset(builder, nowJ, nextJ); Map metadata = getMetadata(nowJ, nextJ); metadata.forEach((k, v) -> builder.metadata(k, v)); GDocument doc = builder.build(); docs.add(doc); //Object o = doc.getMetadata().get(GDocConstants.RAW_CONTENT); //System.out.println(o); nowJ = nextJ; } i = nextJ; } else { i++; } } // 没有任何标题 if (docs.size() <= 0 && i > 0) { List texts = getTexts(0, docLen); GDocument.Builder builder = GDocument.builder().id(idGen.generateId()).text(SplitUtils.listToString(texts)); builder.metadata(GDocConstants.CURRENT_SECTION_TOC, List.of("")); //builder.metadata(GDocConstants.FIRST_HEADING, nd.getLabel()); getCharOffset(builder, 0, docLen); Map metadata = getMetadata(0, docLen); metadata.forEach((k, v) -> builder.metadata(k, v)); GDocument doc = builder.build(); docs.add(doc); } return docs; } protected void getCharOffset(GDocument.Builder builder, int startIndex, int endIndex) { String text = docTree.getText(); Node mdNode = nodes[startIndex].getMdNode(); int startOffset = mdNode.getStartOffset(); int endOffset = mdNode.getEndOffset(); if (endIndex < nodes.length) { mdNode = nodes[endIndex].getMdNode(); endOffset = mdNode.getStartOffset(); } else { endOffset = text.length(); } String s = text.substring(startOffset, endOffset); //System.out.println("s="+s); builder.metadata(GDocConstants.RAW_CONTENT, s); builder.metadata(GDocConstants.START_OFFSET, startOffset); builder.metadata(GDocConstants.END_OFFSET, endOffset); } private int getNodeTextLength(int index) { int len = ((IElement) nodes[index].getValue()).getText().get(0).length(); return len; } protected int getTextLen(int startIndex, int endIndex) { int count = 0; for (int i = startIndex; i < endIndex; i++) { count = count + getNodeTextLength(i); } return count; } protected List getHtmls(int startIndex, int endIndex) { List htmls = new ArrayList<>(); for (int i = startIndex; i < endIndex; i++) { IElement value = (IElement) nodes[i].getValue(); List html = value.getHtml(); htmls.addAll(html); } return htmls; } protected List getTexts(int startIndex, int endIndex) { List texts = new ArrayList<>(); for (int i = startIndex; i < endIndex; i++) { List txt = ((IElement) nodes[i].getValue()).getText(); texts.addAll(txt); } return texts; } // 从上至下 protected List getFatherLabels(int index) { int[] fathers = docTree.getFather(index); List labels = new ArrayList<>(); if (fathers != null && fathers.length > 0) { for (int i = fathers.length - 1; i >= 0; i--) { labels.add(nodes[fathers[i]].getLabel()); } } return labels; } protected void preProcess(Path path, BaseParameters params) { if (params.get(GDocConstants.FILE_NAME) == null) { params.put(GDocConstants.FILE_NAME, SplitUtils.getFileBaseName(path)); } } // 留做扩展 protected void postProcess(List docs) { } @Override public List split(Path path, BaseParameters params) throws IOException { preProcess(path, params); String md = Files.readString(path); List rets = split(md, params); postProcess(rets); return rets; } // public static void main(String[] args) throws IOException { // String md = "D:\\testdata\\md\\官网银行卡知识.1.md"; // md = "D:\\testdata\\md\\海底捞员工手册.md"; // md = "D:\\testdata\\md\\列表1.md"; // md = "D:\\data\\乙烯\\乙烯装置技术与运行测试.md"; // md = "D:\\data\\乙烯\\乙烯1.md"; // String json = md + ".json"; // SimpleMdSplitter splitter = new SimpleMdSplitter(); // splitter.setCommentProcessor(KvCommentProcessor.build()); // BaseParameters params = BaseParameters.defaultParams(); // Path path = Paths.get(md); // //String baseName = SplitUtils.getFileBaseName(path); // //System.out.println("baseName=" + baseName); // List docs = splitter.split(path, params); // SplitUtils.toJsonFile(new File(json), docs); // } }