package org.cnnlp.data.splitter; import com.vladsch.flexmark.ast.*; import com.vladsch.flexmark.ext.obs.comments.Comments; import com.vladsch.flexmark.ext.tables.TableBlock; import com.vladsch.flexmark.ext.yaml.front.matter.YamlFrontMatterBlock; import com.vladsch.flexmark.util.ast.Node; import com.vladsch.flexmark.util.ast.NodeVisitor; import com.vladsch.flexmark.util.ast.TextCollectingVisitor; import com.vladsch.flexmark.util.ast.VisitHandler; import com.vladsch.flexmark.util.collection.iteration.ReversiblePeekingIterable; import com.vladsch.flexmark.util.sequence.BasedSequence; import gnu.trove.TIntArrayList; import org.cnnlp.data.book.*; import org.cnnlp.data.md.DocTree; import org.cnnlp.data.md.HtmlToPlainText; import org.cnnlp.data.md.MDHelper; import org.cnnlp.data.md.MDRegxUtil; import org.cnnlp.data.util.BaseParameters; import org.cnnlp.data.util.SenUtil; import org.jsoup.Jsoup; import java.util.*; //org.cnnlp.data.md.DocRender2.java copy过来 // GTNode -> GTNode2 public class BaseMdParser { public static final String DEFALT_LABEL = ""; private String DEFALT_ROOT_LABEL = ""; // 用于解决 writeUTF(String),太长引起的exception public static final int MAX_STRING_LEN = 20000; TextCollectingVisitor textr; HtmlToPlainText formatter = null; protected MDElement faqComments; protected Map metadata; protected DocTree docTree; protected GTNode2 root; //= GTNode2.buildRoot(""); protected GTNode2[] nodes; //= new GTNode2[depthes.length]; public BaseMdParser() { init(); } protected void init() { // stripNewlines(true) 去掉回车 //TextContentRenderer text = TextContentRenderer.builder().stripNewlines(true).build(); TextCollectingVisitor textCollectingVisitor = new TextCollectingVisitor(); this.textr = textCollectingVisitor; metadata = new LinkedHashMap<>(); } public Map getMetadata() { return metadata; } // 2025.3.15 发生错误时,用此方法 //Accumulated range [1379874, 1379876) overlaps Transformed Range[2]: [1379874, 1379876) private String extractText(Node node) { StringBuilder text = new StringBuilder(); NodeVisitor visitor = new NodeVisitor( new VisitHandler<>(Text.class, textNode -> { //System.out.println("Text: " + textNode.getChars()); text.append(textNode.getChars()).append("\n"); }) ); visitor.visit(node); return text.toString(); } public String getText(Node nd) { //System.out.println("======="); //System.out.println(nd.getChars()); String s = ""; try { s = textr.collectAndGetText(nd); }catch(Exception e) { s=extractText(nd); } s = s.replaceAll("\r\n", "\n"); return s; } protected String getSub(String text) { if (text == null || text.length() < 10) { return text; } else { return text.substring(0, 10); } } private List splitLongString(String s) { int maxLineLen = MAX_STRING_LEN; int len = s.length(); int its = len / maxLineLen; List ls = new ArrayList<>(); int nowOffset = 0; for (int i = 0; i < its; i++) { String s1 = s.substring(nowOffset, nowOffset + maxLineLen); nowOffset = nowOffset + maxLineLen; ls.add(s1); } if (len == nowOffset) { } else { String s1 = s.substring(nowOffset); ls.add(s1); } return ls; } public String getPath(TIntArrayList path) { StringBuilder sb = new StringBuilder(); //sb.append(cid).append("/"); if (path.size() > 0) { sb.append(path.getQuick(0)); for (int i = 1; i < path.size(); i++) { sb.append("-").append(path.getQuick(i)); } return sb.toString(); } else { // 没有父节点,例如 "前言",有时不带#号 return ""; } } private String getPathId(int idx) { return String.valueOf(idx); } protected void processComments(List comments, int idx) { if (faqComments == null) { faqComments = new MDElement(); } comments.forEach(k -> faqComments.add(k, getPathId(idx))); } protected String processHtmlBlock(String s) { if (formatter == null) { formatter = new HtmlToPlainText(); } String converted = formatter.getPlainText(Jsoup.parse(s)); return converted; } // 增加path,主要是 Comments 需要指向路径 // 2025.2.19 增加 HtmlCommentBlock处理 // copy from DocRender2.java protected void processNode(Node nd1, GTNode gn, int idx) { List ts2 = new ArrayList(); List hs2 = new ArrayList(); //基本上就 这 三种类型 //Paragraph{} /HtmlBlock{} /OrderedList{} /TableBlock{} /HtmlInline{}/BulletList{} if (nd1 instanceof HtmlBlock) { HtmlBlock p1 = (HtmlBlock) nd1; List ls1 = p1.getContentLines(); //System.out.println(p1.getContentLines()); //System.out.println("=="); for (int i = 0; i < ls1.size(); i++) { hs2.add(ls1.get(i).toString()); } String txt = processHtmlBlock(SplitUtils.getMDTxt(p1)); ts2.add(txt); gn.setType(GTBookConstants.MD_HTMLBLOCK); } else if (nd1 instanceof TableBlock) { TableBlock p1 = (TableBlock) nd1; String md = SplitUtils.getMDTxt(p1); hs2.add(md); String txt = getText(nd1); ts2.add(txt); gn.setType(GTBookConstants.MD_TABLE); } else if (nd1 instanceof Comments) { Comments p1 = (Comments) nd1; String md = SplitUtils.getMDTxt(p1); hs2.add(md); String txt = getText(nd1); ts2.add(txt); gn.setType(GTBookConstants.MD_COMMENTS); processComments(List.of(txt), idx); } else if (nd1 instanceof HtmlCommentBlock) { HtmlCommentBlock p1 = (HtmlCommentBlock) nd1; String md = SplitUtils.getMDTxt(p1); hs2.add(md); String txt = getText(nd1); ts2.add(txt); gn.setType(GTBookConstants.MD_COMMENTS); processComments(List.of(txt), idx); } else if (nd1 instanceof Paragraph) { Paragraph p1 = (Paragraph) nd1; Node firstChild = p1.getFirstChild(); if (firstChild instanceof Comments) { List comments = new ArrayList<>(); StringBuilder sb = new StringBuilder(); ReversiblePeekingIterable children = p1.getChildren(); children.forEach(n -> { String t1 = getText(n); if (n instanceof Comments) { comments.add(t1); } else { sb.append(t1); } }); String htm = SplitUtils.getMDTxt(p1); String txt = comments.get(0) + sb.toString() + "\n"; hs2.add(htm); ts2.add(txt); processComments(comments, idx); } else if (firstChild instanceof Image) { StringBuilder sb = new StringBuilder(); ReversiblePeekingIterable children = p1.getChildren(); children.forEach(n -> { String t1 = getText(n); if (n instanceof Image) { if (SenUtil.isMeaninglessString(t1)) { } else { sb.append(t1); } } else { sb.append(t1); } }); String htm = SplitUtils.getMDTxt(p1); String txt = sb.toString() + "\n"; hs2.add(htm); ts2.add(txt); } else { String md = SplitUtils.getMDTxt(p1); //[下划线]{.underline} if (md.length() > 12) { String md1 = MDRegxUtil.convertUnderline(md); if (md1.length() != md.length()) { hs2.add(md1); String txt = processHtmlBlock(md1); ts2.add(txt); gn.setType(GTBookConstants.MD_HTMLBLOCK); } } } } else if (nd1 instanceof OrderedList) { String htm = SplitUtils.getMDTxt(nd1); String txt = getText(nd1); txt = txt.replaceAll("\n\n\n", "\n\n"); hs2.add(htm); ts2.add(txt); } else if (nd1 instanceof BulletList) { // 2025.3.7 增加 String htm = SplitUtils.getMDTxt(nd1); String txt = getText(nd1); txt = txt.replaceAll("\n\n", "\n"); hs2.add(htm); ts2.add(txt); }else if (nd1 instanceof YamlFrontMatterBlock) { // 第一行的元数据 YamlFrontMatterBlock p1 = (YamlFrontMatterBlock) nd1; String md = SplitUtils.getMDTxt(p1); hs2.add(md); String txt = ""; ts2.add(txt); gn.setType(GTBookConstants.MD_FRONTMATTER); } if (ts2.size() <= 0) { // 2024.1.7 这部分用于pandoc转出的md的图片杂乱信息的处理,暂时注释掉 // String htm = getMDTxt(nd1); // hs2 = MDRegxUtil.splitByBrackets(htm); // //[!, [C:\\Users\\kevin-pc\\Desktop\\2020\\index.png], (/images/1237763635937497090/resource/image9.png), ] // boolean isRepl = false; // if (hs2.size() >= 2) { // isRepl = processImageNote(hs2); // } // // //ts2 = new ArrayList(); // String txt = null; // if (isRepl) { // String mdTxt1 = GTBookUtil.listToString(hs2, ""); // Node nnd1 = MDHelper.PARSER.parse(mdTxt1); // txt = getText(nnd1); // } else { // txt = getText(nd1); // } // ts2.add(txt); String htm = SplitUtils.getMDTxt(nd1); String txt = getText(nd1); hs2.add(htm); ts2.add(txt); } //// 2020.8.26 要进行长串的处理 String txt = ts2.get(0); if (txt.length() > MAX_STRING_LEN) { List ls = splitLongString(txt); ts2 = ls; } boolean isTooLong = false; for (String s : hs2) { if (s.length() > MAX_STRING_LEN) { isTooLong = true; break; } } if (isTooLong) { List ls = new ArrayList<>(); for (String s : hs2) { if (s.length() > MAX_STRING_LEN) { List ls1 = splitLongString(s); ls.addAll(ls1); } else { ls.add(s); } } hs2 = ls; } MDElement v = new MDElement(ts2, hs2); gn.setValue(v); } protected String getHeadingTitle(Node nd, List hs1) { boolean isRemoved = false; if (hs1.size() > 1) { isRemoved = SplitUtils.processHeadingNote(hs1); } String txt1 = null; if (isRemoved) { String mdTxt1 = GTBookUtil.listToString(hs1, ""); Node nnd1 = MDHelper.PARSER.parse(mdTxt1); txt1 = getText(nnd1); } else { txt1 = getText(nd); } String title = SplitUtils.stripLast(txt1); return title; } //没有 章节 headings 的处理 private void processNoHeadings(Map params) { int[] depthes = docTree.getDepthes(); //int[] fathers = docTree.getFathers(); List ls = docTree.getSource(); String[] titles = new String[ls.size()]; GTNode2[] bns = new GTNode2[depthes.length]; this.nodes = bns; GTNode2 ch1 = GTNode2.buildNode(null, 1); ch1.setLabel(DEFALT_ROOT_LABEL); root.addChild(ch1); int baseNo = 1; TIntArrayList path = new TIntArrayList(); int i1 = root.size() - 1; path.add(i1); for (int i = 0; i < depthes.length; i++) { Node nd = ls.get(i); GTNode2 gn = GTNode2.buildLeaf(i + baseNo); ch1.addChild(gn); processNode(nd, gn, i); bns[i] = gn; Object obj = gn.getValue(); if (obj != null) { MDElement e = (MDElement) obj; List ts = e.getText(); String txt1 = null; if (ts != null && ts.size() > 0) { txt1 = ts.get(0); } titles[i] = SplitUtils.stripLast(txt1); gn.setLabel(getSub(titles[i])); //gn.setDepth(depthes[i]); gn.setDepth(1); } } } protected String getHtml(IElement v) { List hs = v.getHtml(); if (hs != null && hs.size() > 0) { return hs.get(0); } return null; } private String getTxt(IElement v) { List hs = v.getText(); if (hs != null && hs.size() > 0) { return hs.get(0); } return null; } public List getOutline(boolean isIndented){ List outline = new ArrayList<>(); if (nodes != null){ if (isIndented){ String spaces = " "; int minDepth = docTree.getMinLevel(); int[] depthes = docTree.getDepthes(); for (int i = 0; i < nodes.length; i++) { if (nodes[i].getType() == GTBookConstants.MD_HEADING) { //System.out.println(nodes[i].getMdNode().getChars().toString()); String space = ""; if (depthes[i] > minDepth) { space = spaces.substring(0, depthes[i] - minDepth); } String title = space+((IElement) nodes[i].getValue()).getHtml().get(0); outline.add(title); } } }else { for (int i = 0; i < nodes.length; i++) { if (nodes[i].getType() == GTBookConstants.MD_HEADING) { //System.out.println(nodes[i].getMdNode().getChars().toString()); String title = ((IElement) nodes[i].getValue()).getHtml().get(0); outline.add(title); } } } } return outline; } // 改造自 DocRender2.java的 renderToBook() public void parse(DocTree dt, Map params) { this.docTree = dt; List ls = dt.getSource(); Map> metaMap = new HashMap>(); if (ls.size() > 0 && (ls.get(0) instanceof YamlFrontMatterBlock)) { //YamlFrontMatterBlock metaBlock = (YamlFrontMatterBlock) ls.remove(0); YamlFrontMatterBlock metaBlock = (YamlFrontMatterBlock) ls.get(0); metaMap = SplitUtils.processMetaBlock(metaBlock); metaMap.forEach((k, v) -> { metadata.put(k,v); }); } int[] depthes = dt.getDepthes(); int[] fathers = dt.getFathers(); GTNode2 root = GTNode2.buildRoot(""); GTNode2[] bns = new GTNode2[depthes.length]; this.root = root; this.nodes = bns; if (dt.getHeadings() <= 0) { // 没有章节的处理 processNoHeadings(params); return; } String[] titles = new String[ls.size()]; int baseNo = 1; for (int i = 0; i < depthes.length; i++) { Node nd = ls.get(i); if (depthes[i] >= 0) { String htm1 = SplitUtils.getMDTxt(nd); List hs1 = MDRegxUtil.splitByBrackets(htm1); String title = getHeadingTitle(nd, hs1); titles[i] = title; List ts1 = new ArrayList(); ts1.add(title); MDElement v1 = new MDElement(ts1, hs1); GTNode2 gn = GTNode2.buildNode(null, i + baseNo); gn.setLabel(titles[i]); gn.setDepth(depthes[i]); gn.setValue(v1); gn.setType(GTBookConstants.MD_HEADING); gn.setMdNode(nd); bns[i] = gn; if (fathers[i] >= 0) { bns[fathers[i]].addChild(gn); } else { root.addChild(gn); } int j = i + 1; for (; j < fathers.length; j++) { if (depthes[j] < 0) { //基本上就 这 三种类型 //Paragraph{} /HtmlBlock{} /OrderedList{} Node nd1 = ls.get(j); GTNode2 leaf = GTNode2.buildLeaf(j + baseNo); leaf.setDepth(depthes[i] + 1); leaf.setMdNode(nd1); gn.addChild(leaf); //TIntArrayList path1 = (TIntArrayList) path.clone(); //path1.add(gn.size() - 1); processNode(nd1, leaf, j); bns[j] = leaf; } else { break; } } i = j - 1; } else { //System.out.println("===叶子节点==="); int minLevel = docTree.getMinLevel(); GTNode2 leaf = GTNode2.buildLeaf(i + baseNo); leaf.setDepth(minLevel); leaf.setMdNode(nd); root.addChild(leaf); //TIntArrayList path1 = (TIntArrayList) path.clone(); //path1.add(gn.size() - 1); processNode(nd, leaf, i); bns[i] = leaf; if (fathers[i] >= 0) { bns[fathers[i]].addChild(leaf); } else { root.addChild(leaf); } } } } // public void splitNodeList(List ls, Map params) { // //没有 章节 // // int[] depthes = new int[ls.size()]; // int[] fathers = new int[ls.size()]; // // String[] titles = new String[ls.size()]; // // GTNode root = GTNode.buildRoot(""); // GTNode[] bns = new GTNode[depthes.length]; // // GTNode ch1 = GTNode.buildNode(null, 1); // ch1.setLabel(DEFALT_ROOT_LABEL); // // root.addChild(ch1); // TIntArrayList path = new TIntArrayList(); // int i1 = root.size() - 1; // path.add(i1); // // for (int i = 0; i < depthes.length; i++) { // Node nd = ls.get(i); // // //MDElement v = processNode1(nd); // GTNode gn = GTNode.buildLeaf(i + 2); // ch1.addChild(gn); // // processNode(nd, gn, i); // Object obj = gn.getValue(); // if (obj != null) { // MDElement e = (MDElement) obj; // List ts = e.getText(); // String txt1 = null; // if (ts != null && ts.size() > 0) { // txt1 = ts.get(0); // } // titles[i] = SplitUtils.stripLast(txt1); // gn.setLabel(getSub(titles[i])); // //gn.setDepth(depthes[i]); // gn.setDepth(1); // // } // // } // } public void parse(String md, BaseParameters params) { Map paramsMap = params.toMap(); DocTree dt = MDHelper.parseToDocTree(md, paramsMap); dt.setText(md); parse(dt, paramsMap); } }