| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606 |
- package org.cnnlp.data.splitter;
- import com.vladsch.flexmark.ast.*;
- import com.vladsch.flexmark.ext.obs.comments.Comments;
- import com.vladsch.flexmark.ext.tables.TableBlock;
- import com.vladsch.flexmark.ext.yaml.front.matter.YamlFrontMatterBlock;
- import com.vladsch.flexmark.util.ast.Node;
- import com.vladsch.flexmark.util.ast.NodeVisitor;
- import com.vladsch.flexmark.util.ast.TextCollectingVisitor;
- import com.vladsch.flexmark.util.ast.VisitHandler;
- import com.vladsch.flexmark.util.collection.iteration.ReversiblePeekingIterable;
- import com.vladsch.flexmark.util.sequence.BasedSequence;
- import gnu.trove.TIntArrayList;
- import org.cnnlp.data.book.*;
- import org.cnnlp.data.md.DocTree;
- import org.cnnlp.data.md.HtmlToPlainText;
- import org.cnnlp.data.md.MDHelper;
- import org.cnnlp.data.md.MDRegxUtil;
- import org.cnnlp.data.util.BaseParameters;
- import org.cnnlp.data.util.SenUtil;
- import org.jsoup.Jsoup;
- import java.util.*;
- // Copied from org.cnnlp.data.md.DocRender2.java
- // GTNode -> GTNode2
- public class BaseMdParser {
- public static final String DEFALT_LABEL = "";
- private String DEFALT_ROOT_LABEL = "";
- // 用于解决 writeUTF(String),太长引起的exception
- public static final int MAX_STRING_LEN = 20000;
- TextCollectingVisitor textr;
- HtmlToPlainText formatter = null;
- protected MDElement faqComments;
- protected Map<String,Object> metadata;
- protected DocTree docTree;
- protected GTNode2 root; //= GTNode2.buildRoot("");
- protected GTNode2[] nodes; //= new GTNode2[depthes.length];
- public BaseMdParser() {
- init();
- }
- protected void init() {
- // stripNewlines(true) 去掉回车
- //TextContentRenderer text = TextContentRenderer.builder().stripNewlines(true).build();
- TextCollectingVisitor textCollectingVisitor = new TextCollectingVisitor();
- this.textr = textCollectingVisitor;
- metadata = new LinkedHashMap<>();
- }
- public Map<String, Object> getMetadata() {
- return metadata;
- }
- // 2025.3.15 发生错误时,用此方法
- //Accumulated range [1379874, 1379876) overlaps Transformed Range[2]: [1379874, 1379876)
- private String extractText(Node node) {
- StringBuilder text = new StringBuilder();
- NodeVisitor visitor = new NodeVisitor(
- new VisitHandler<>(Text.class, textNode -> {
- //System.out.println("Text: " + textNode.getChars());
- text.append(textNode.getChars()).append("\n");
- })
- );
- visitor.visit(node);
- return text.toString();
- }
- public String getText(Node nd) {
- //System.out.println("=======");
- //System.out.println(nd.getChars());
- String s = "";
- try {
- s = textr.collectAndGetText(nd);
- }catch(Exception e) {
- s=extractText(nd);
- }
- s = s.replaceAll("\r\n", "\n");
- return s;
- }
- protected String getSub(String text) {
- if (text == null || text.length() < 10) {
- return text;
- } else {
- return text.substring(0, 10);
- }
- }
- private List<String> splitLongString(String s) {
- int maxLineLen = MAX_STRING_LEN;
- int len = s.length();
- int its = len / maxLineLen;
- List<String> ls = new ArrayList<>();
- int nowOffset = 0;
- for (int i = 0; i < its; i++) {
- String s1 = s.substring(nowOffset, nowOffset + maxLineLen);
- nowOffset = nowOffset + maxLineLen;
- ls.add(s1);
- }
- if (len == nowOffset) {
- } else {
- String s1 = s.substring(nowOffset);
- ls.add(s1);
- }
- return ls;
- }
- public String getPath(TIntArrayList path) {
- StringBuilder sb = new StringBuilder();
- //sb.append(cid).append("/");
- if (path.size() > 0) {
- sb.append(path.getQuick(0));
- for (int i = 1; i < path.size(); i++) {
- sb.append("-").append(path.getQuick(i));
- }
- return sb.toString();
- } else {
- // 没有父节点,例如 "前言",有时不带#号
- return "";
- }
- }
- private String getPathId(int idx) {
- return String.valueOf(idx);
- }
- protected void processComments(List<String> comments, int idx) {
- if (faqComments == null) {
- faqComments = new MDElement();
- }
- comments.forEach(k -> faqComments.add(k, getPathId(idx)));
- }
- protected String processHtmlBlock(String s) {
- if (formatter == null) {
- formatter = new HtmlToPlainText();
- }
- String converted = formatter.getPlainText(Jsoup.parse(s));
- return converted;
- }
- // 增加path,主要是 Comments 需要指向路径
- // 2025.2.19 增加 HtmlCommentBlock处理
- // copy from DocRender2.java
- protected void processNode(Node nd1, GTNode gn, int idx) {
- List<String> ts2 = new ArrayList<String>();
- List<String> hs2 = new ArrayList<String>();
- //基本上就 这 三种类型
- //Paragraph{} /HtmlBlock{} /OrderedList{} /TableBlock{} /HtmlInline{}/BulletList{}
- if (nd1 instanceof HtmlBlock) {
- HtmlBlock p1 = (HtmlBlock) nd1;
- List<BasedSequence> ls1 = p1.getContentLines();
- //System.out.println(p1.getContentLines());
- //System.out.println("==");
- for (int i = 0; i < ls1.size(); i++) {
- hs2.add(ls1.get(i).toString());
- }
- String txt = processHtmlBlock(SplitUtils.getMDTxt(p1));
- ts2.add(txt);
- gn.setType(GTBookConstants.MD_HTMLBLOCK);
- } else if (nd1 instanceof TableBlock) {
- TableBlock p1 = (TableBlock) nd1;
- String md = SplitUtils.getMDTxt(p1);
- hs2.add(md);
- String txt = getText(nd1);
- ts2.add(txt);
- gn.setType(GTBookConstants.MD_TABLE);
- } else if (nd1 instanceof Comments) {
- Comments p1 = (Comments) nd1;
- String md = SplitUtils.getMDTxt(p1);
- hs2.add(md);
- String txt = getText(nd1);
- ts2.add(txt);
- gn.setType(GTBookConstants.MD_COMMENTS);
- processComments(List.of(txt), idx);
- } else if (nd1 instanceof HtmlCommentBlock) {
- HtmlCommentBlock p1 = (HtmlCommentBlock) nd1;
- String md = SplitUtils.getMDTxt(p1);
- hs2.add(md);
- String txt = getText(nd1);
- ts2.add(txt);
- gn.setType(GTBookConstants.MD_COMMENTS);
- processComments(List.of(txt), idx);
- } else if (nd1 instanceof Paragraph) {
- Paragraph p1 = (Paragraph) nd1;
- Node firstChild = p1.getFirstChild();
- if (firstChild instanceof Comments) {
- List<String> comments = new ArrayList<>();
- StringBuilder sb = new StringBuilder();
- ReversiblePeekingIterable<Node> children = p1.getChildren();
- children.forEach(n -> {
- String t1 = getText(n);
- if (n instanceof Comments) {
- comments.add(t1);
- } else {
- sb.append(t1);
- }
- });
- String htm = SplitUtils.getMDTxt(p1);
- String txt = comments.get(0) + sb.toString() + "\n";
- hs2.add(htm);
- ts2.add(txt);
- processComments(comments, idx);
- } else if (firstChild instanceof Image) {
- StringBuilder sb = new StringBuilder();
- ReversiblePeekingIterable<Node> children = p1.getChildren();
- children.forEach(n -> {
- String t1 = getText(n);
- if (n instanceof Image) {
- if (SenUtil.isMeaninglessString(t1)) {
- } else {
- sb.append(t1);
- }
- } else {
- sb.append(t1);
- }
- });
- String htm = SplitUtils.getMDTxt(p1);
- String txt = sb.toString() + "\n";
- hs2.add(htm);
- ts2.add(txt);
- } else {
- String md = SplitUtils.getMDTxt(p1);
- //[下划线]{.underline}
- if (md.length() > 12) {
- String md1 = MDRegxUtil.convertUnderline(md);
- if (md1.length() != md.length()) {
- hs2.add(md1);
- String txt = processHtmlBlock(md1);
- ts2.add(txt);
- gn.setType(GTBookConstants.MD_HTMLBLOCK);
- }
- }
- }
- } else if (nd1 instanceof OrderedList) {
- String htm = SplitUtils.getMDTxt(nd1);
- String txt = getText(nd1);
- txt = txt.replaceAll("\n\n\n", "\n\n");
- hs2.add(htm);
- ts2.add(txt);
- } else if (nd1 instanceof BulletList) {
- // 2025.3.7 增加
- String htm = SplitUtils.getMDTxt(nd1);
- String txt = getText(nd1);
- txt = txt.replaceAll("\n\n", "\n");
- hs2.add(htm);
- ts2.add(txt);
- }else if (nd1 instanceof YamlFrontMatterBlock) {
- // 第一行的元数据
- YamlFrontMatterBlock p1 = (YamlFrontMatterBlock) nd1;
- String md = SplitUtils.getMDTxt(p1);
- hs2.add(md);
- String txt = "";
- ts2.add(txt);
- gn.setType(GTBookConstants.MD_FRONTMATTER);
- }
- if (ts2.size() <= 0) {
- // 2024.1.7 这部分用于pandoc转出的md的图片杂乱信息的处理,暂时注释掉
- // String htm = getMDTxt(nd1);
- // hs2 = MDRegxUtil.splitByBrackets(htm);
- // //[!, [C:\\Users\\kevin-pc\\Desktop\\2020\\index.png], (/images/1237763635937497090/resource/image9.png), ]
- // boolean isRepl = false;
- // if (hs2.size() >= 2) {
- // isRepl = processImageNote(hs2);
- // }
- //
- // //ts2 = new ArrayList<String>();
- // String txt = null;
- // if (isRepl) {
- // String mdTxt1 = GTBookUtil.listToString(hs2, "");
- // Node nnd1 = MDHelper.PARSER.parse(mdTxt1);
- // txt = getText(nnd1);
- // } else {
- // txt = getText(nd1);
- // }
- // ts2.add(txt);
- String htm = SplitUtils.getMDTxt(nd1);
- String txt = getText(nd1);
- hs2.add(htm);
- ts2.add(txt);
- }
- //// 2020.8.26 要进行长串的处理
- String txt = ts2.get(0);
- if (txt.length() > MAX_STRING_LEN) {
- List<String> ls = splitLongString(txt);
- ts2 = ls;
- }
- boolean isTooLong = false;
- for (String s : hs2) {
- if (s.length() > MAX_STRING_LEN) {
- isTooLong = true;
- break;
- }
- }
- if (isTooLong) {
- List<String> ls = new ArrayList<>();
- for (String s : hs2) {
- if (s.length() > MAX_STRING_LEN) {
- List<String> ls1 = splitLongString(s);
- ls.addAll(ls1);
- } else {
- ls.add(s);
- }
- }
- hs2 = ls;
- }
- MDElement v = new MDElement(ts2, hs2);
- gn.setValue(v);
- }
- protected String getHeadingTitle(Node nd, List<String> hs1) {
- boolean isRemoved = false;
- if (hs1.size() > 1) {
- isRemoved = SplitUtils.processHeadingNote(hs1);
- }
- String txt1 = null;
- if (isRemoved) {
- String mdTxt1 = GTBookUtil.listToString(hs1, "");
- Node nnd1 = MDHelper.PARSER.parse(mdTxt1);
- txt1 = getText(nnd1);
- } else {
- txt1 = getText(nd);
- }
- String title = SplitUtils.stripLast(txt1);
- return title;
- }
- //没有 章节 headings 的处理
- private void processNoHeadings(Map<String, Object> params) {
- int[] depthes = docTree.getDepthes();
- //int[] fathers = docTree.getFathers();
- List<Node> ls = docTree.getSource();
- String[] titles = new String[ls.size()];
- GTNode2[] bns = new GTNode2[depthes.length];
- this.nodes = bns;
- GTNode2 ch1 = GTNode2.buildNode(null, 1);
- ch1.setLabel(DEFALT_ROOT_LABEL);
- root.addChild(ch1);
- int baseNo = 1;
- TIntArrayList path = new TIntArrayList();
- int i1 = root.size() - 1;
- path.add(i1);
- for (int i = 0; i < depthes.length; i++) {
- Node nd = ls.get(i);
- GTNode2 gn = GTNode2.buildLeaf(i + baseNo);
- ch1.addChild(gn);
- processNode(nd, gn, i);
- bns[i] = gn;
- Object obj = gn.getValue();
- if (obj != null) {
- MDElement e = (MDElement) obj;
- List<String> ts = e.getText();
- String txt1 = null;
- if (ts != null && ts.size() > 0) {
- txt1 = ts.get(0);
- }
- titles[i] = SplitUtils.stripLast(txt1);
- gn.setLabel(getSub(titles[i]));
- //gn.setDepth(depthes[i]);
- gn.setDepth(1);
- }
- }
- }
- protected String getHtml(IElement v) {
- List<String> hs = v.getHtml();
- if (hs != null && hs.size() > 0) {
- return hs.get(0);
- }
- return null;
- }
- private String getTxt(IElement v) {
- List<String> hs = v.getText();
- if (hs != null && hs.size() > 0) {
- return hs.get(0);
- }
- return null;
- }
- public List<String> getOutline(boolean isIndented){
- List<String> outline = new ArrayList<>();
- if (nodes != null){
- if (isIndented){
- String spaces = " ";
- int minDepth = docTree.getMinLevel();
- int[] depthes = docTree.getDepthes();
- for (int i = 0; i < nodes.length; i++) {
- if (nodes[i].getType() == GTBookConstants.MD_HEADING) {
- //System.out.println(nodes[i].getMdNode().getChars().toString());
- String space = "";
- if (depthes[i] > minDepth) {
- space = spaces.substring(0, depthes[i] - minDepth);
- }
- String title = space+((IElement) nodes[i].getValue()).getHtml().get(0);
- outline.add(title);
- }
- }
- }else {
- for (int i = 0; i < nodes.length; i++) {
- if (nodes[i].getType() == GTBookConstants.MD_HEADING) {
- //System.out.println(nodes[i].getMdNode().getChars().toString());
- String title = ((IElement) nodes[i].getValue()).getHtml().get(0);
- outline.add(title);
- }
- }
- }
- }
- return outline;
- }
- // 改造自 DocRender2.java的 renderToBook()
- public void parse(DocTree dt, Map<String, Object> params) {
- this.docTree = dt;
- List<Node> ls = dt.getSource();
- Map<String, List<String>> metaMap = new HashMap<String, List<String>>();
- if (ls.size() > 0 && (ls.get(0) instanceof YamlFrontMatterBlock)) {
- //YamlFrontMatterBlock metaBlock = (YamlFrontMatterBlock) ls.remove(0);
- YamlFrontMatterBlock metaBlock = (YamlFrontMatterBlock) ls.get(0);
- metaMap = SplitUtils.processMetaBlock(metaBlock);
- metaMap.forEach((k, v) -> {
- metadata.put(k,v);
- });
- }
- int[] depthes = dt.getDepthes();
- int[] fathers = dt.getFathers();
- GTNode2 root = GTNode2.buildRoot("");
- GTNode2[] bns = new GTNode2[depthes.length];
- this.root = root;
- this.nodes = bns;
- if (dt.getHeadings() <= 0) {
- // 没有章节的处理
- processNoHeadings(params);
- return;
- }
- String[] titles = new String[ls.size()];
- int baseNo = 1;
- for (int i = 0; i < depthes.length; i++) {
- Node nd = ls.get(i);
- if (depthes[i] >= 0) {
- String htm1 = SplitUtils.getMDTxt(nd);
- List<String> hs1 = MDRegxUtil.splitByBrackets(htm1);
- String title = getHeadingTitle(nd, hs1);
- titles[i] = title;
- List<String> ts1 = new ArrayList<String>();
- ts1.add(title);
- MDElement v1 = new MDElement(ts1, hs1);
- GTNode2 gn = GTNode2.buildNode(null, i + baseNo);
- gn.setLabel(titles[i]);
- gn.setDepth(depthes[i]);
- gn.setValue(v1);
- gn.setType(GTBookConstants.MD_HEADING);
- gn.setMdNode(nd);
- bns[i] = gn;
- if (fathers[i] >= 0) {
- bns[fathers[i]].addChild(gn);
- } else {
- root.addChild(gn);
- }
- int j = i + 1;
- for (; j < fathers.length; j++) {
- if (depthes[j] < 0) {
- //基本上就 这 三种类型
- //Paragraph{} /HtmlBlock{} /OrderedList{}
- Node nd1 = ls.get(j);
- GTNode2 leaf = GTNode2.buildLeaf(j + baseNo);
- leaf.setDepth(depthes[i] + 1);
- leaf.setMdNode(nd1);
- gn.addChild(leaf);
- //TIntArrayList path1 = (TIntArrayList) path.clone();
- //path1.add(gn.size() - 1);
- processNode(nd1, leaf, j);
- bns[j] = leaf;
- } else {
- break;
- }
- }
- i = j - 1;
- } else {
- //System.out.println("===叶子节点===");
- int minLevel = docTree.getMinLevel();
- GTNode2 leaf = GTNode2.buildLeaf(i + baseNo);
- leaf.setDepth(minLevel);
- leaf.setMdNode(nd);
- root.addChild(leaf);
- //TIntArrayList path1 = (TIntArrayList) path.clone();
- //path1.add(gn.size() - 1);
- processNode(nd, leaf, i);
- bns[i] = leaf;
- if (fathers[i] >= 0) {
- bns[fathers[i]].addChild(leaf);
- } else {
- root.addChild(leaf);
- }
- }
- }
- }
- // public void splitNodeList(List<Node> ls, Map<String, Object> params) {
- // //没有 章节
- //
- // int[] depthes = new int[ls.size()];
- // int[] fathers = new int[ls.size()];
- //
- // String[] titles = new String[ls.size()];
- //
- // GTNode root = GTNode.buildRoot("");
- // GTNode[] bns = new GTNode[depthes.length];
- //
- // GTNode ch1 = GTNode.buildNode(null, 1);
- // ch1.setLabel(DEFALT_ROOT_LABEL);
- //
- // root.addChild(ch1);
- // TIntArrayList path = new TIntArrayList();
- // int i1 = root.size() - 1;
- // path.add(i1);
- //
- // for (int i = 0; i < depthes.length; i++) {
- // Node nd = ls.get(i);
- //
- // //MDElement v = processNode1(nd);
- // GTNode gn = GTNode.buildLeaf(i + 2);
- // ch1.addChild(gn);
- //
- // processNode(nd, gn, i);
- // Object obj = gn.getValue();
- // if (obj != null) {
- // MDElement e = (MDElement) obj;
- // List<String> ts = e.getText();
- // String txt1 = null;
- // if (ts != null && ts.size() > 0) {
- // txt1 = ts.get(0);
- // }
- // titles[i] = SplitUtils.stripLast(txt1);
- // gn.setLabel(getSub(titles[i]));
- // //gn.setDepth(depthes[i]);
- // gn.setDepth(1);
- //
- // }
- //
- // }
- // }
- public void parse(String md, BaseParameters params) {
- Map<String, Object> paramsMap = params.toMap();
- DocTree dt = MDHelper.parseToDocTree(md, paramsMap);
- dt.setText(md);
- parse(dt, paramsMap);
- }
- }
|