BaseMdParser.java 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606
  1. package org.cnnlp.data.splitter;
  2. import com.vladsch.flexmark.ast.*;
  3. import com.vladsch.flexmark.ext.obs.comments.Comments;
  4. import com.vladsch.flexmark.ext.tables.TableBlock;
  5. import com.vladsch.flexmark.ext.yaml.front.matter.YamlFrontMatterBlock;
  6. import com.vladsch.flexmark.util.ast.Node;
  7. import com.vladsch.flexmark.util.ast.NodeVisitor;
  8. import com.vladsch.flexmark.util.ast.TextCollectingVisitor;
  9. import com.vladsch.flexmark.util.ast.VisitHandler;
  10. import com.vladsch.flexmark.util.collection.iteration.ReversiblePeekingIterable;
  11. import com.vladsch.flexmark.util.sequence.BasedSequence;
  12. import gnu.trove.TIntArrayList;
  13. import org.cnnlp.data.book.*;
  14. import org.cnnlp.data.md.DocTree;
  15. import org.cnnlp.data.md.HtmlToPlainText;
  16. import org.cnnlp.data.md.MDHelper;
  17. import org.cnnlp.data.md.MDRegxUtil;
  18. import org.cnnlp.data.util.BaseParameters;
  19. import org.cnnlp.data.util.SenUtil;
  20. import org.jsoup.Jsoup;
  21. import java.util.*;
  22. //org.cnnlp.data.md.DocRender2.java copy过来
  23. // GTNode -> GTNode2
  24. public class BaseMdParser {
  25. public static final String DEFALT_LABEL = "";
  26. private String DEFALT_ROOT_LABEL = "";
  27. // 用于解决 writeUTF(String),太长引起的exception
  28. public static final int MAX_STRING_LEN = 20000;
  29. TextCollectingVisitor textr;
  30. HtmlToPlainText formatter = null;
  31. protected MDElement faqComments;
  32. protected Map<String,Object> metadata;
  33. protected DocTree docTree;
  34. protected GTNode2 root; //= GTNode2.buildRoot("");
  35. protected GTNode2[] nodes; //= new GTNode2[depthes.length];
  /**
   * Creates a parser and immediately runs {@link #init()} to set up its
   * text collector and metadata map.
   */
  public BaseMdParser() {
    this.init();
  }
  39. protected void init() {
  40. // stripNewlines(true) 去掉回车
  41. //TextContentRenderer text = TextContentRenderer.builder().stripNewlines(true).build();
  42. TextCollectingVisitor textCollectingVisitor = new TextCollectingVisitor();
  43. this.textr = textCollectingVisitor;
  44. metadata = new LinkedHashMap<>();
  45. }
  46. public Map<String, Object> getMetadata() {
  47. return metadata;
  48. }
  49. // 2025.3.15 发生错误时,用此方法
  50. //Accumulated range [1379874, 1379876) overlaps Transformed Range[2]: [1379874, 1379876)
  51. private String extractText(Node node) {
  52. StringBuilder text = new StringBuilder();
  53. NodeVisitor visitor = new NodeVisitor(
  54. new VisitHandler<>(Text.class, textNode -> {
  55. //System.out.println("Text: " + textNode.getChars());
  56. text.append(textNode.getChars()).append("\n");
  57. })
  58. );
  59. visitor.visit(node);
  60. return text.toString();
  61. }
  62. public String getText(Node nd) {
  63. //System.out.println("=======");
  64. //System.out.println(nd.getChars());
  65. String s = "";
  66. try {
  67. s = textr.collectAndGetText(nd);
  68. }catch(Exception e) {
  69. s=extractText(nd);
  70. }
  71. s = s.replaceAll("\r\n", "\n");
  72. return s;
  73. }
  74. protected String getSub(String text) {
  75. if (text == null || text.length() < 10) {
  76. return text;
  77. } else {
  78. return text.substring(0, 10);
  79. }
  80. }
  81. private List<String> splitLongString(String s) {
  82. int maxLineLen = MAX_STRING_LEN;
  83. int len = s.length();
  84. int its = len / maxLineLen;
  85. List<String> ls = new ArrayList<>();
  86. int nowOffset = 0;
  87. for (int i = 0; i < its; i++) {
  88. String s1 = s.substring(nowOffset, nowOffset + maxLineLen);
  89. nowOffset = nowOffset + maxLineLen;
  90. ls.add(s1);
  91. }
  92. if (len == nowOffset) {
  93. } else {
  94. String s1 = s.substring(nowOffset);
  95. ls.add(s1);
  96. }
  97. return ls;
  98. }
  99. public String getPath(TIntArrayList path) {
  100. StringBuilder sb = new StringBuilder();
  101. //sb.append(cid).append("/");
  102. if (path.size() > 0) {
  103. sb.append(path.getQuick(0));
  104. for (int i = 1; i < path.size(); i++) {
  105. sb.append("-").append(path.getQuick(i));
  106. }
  107. return sb.toString();
  108. } else {
  109. // 没有父节点,例如 "前言",有时不带#号
  110. return "";
  111. }
  112. }
  113. private String getPathId(int idx) {
  114. return String.valueOf(idx);
  115. }
  116. protected void processComments(List<String> comments, int idx) {
  117. if (faqComments == null) {
  118. faqComments = new MDElement();
  119. }
  120. comments.forEach(k -> faqComments.add(k, getPathId(idx)));
  121. }
  122. protected String processHtmlBlock(String s) {
  123. if (formatter == null) {
  124. formatter = new HtmlToPlainText();
  125. }
  126. String converted = formatter.getPlainText(Jsoup.parse(s));
  127. return converted;
  128. }
  129. // 增加path,主要是 Comments 需要指向路径
  130. // 2025.2.19 增加 HtmlCommentBlock处理
  131. // copy from DocRender2.java
  132. protected void processNode(Node nd1, GTNode gn, int idx) {
  133. List<String> ts2 = new ArrayList<String>();
  134. List<String> hs2 = new ArrayList<String>();
  135. //基本上就 这 三种类型
  136. //Paragraph{} /HtmlBlock{} /OrderedList{} /TableBlock{} /HtmlInline{}/BulletList{}
  137. if (nd1 instanceof HtmlBlock) {
  138. HtmlBlock p1 = (HtmlBlock) nd1;
  139. List<BasedSequence> ls1 = p1.getContentLines();
  140. //System.out.println(p1.getContentLines());
  141. //System.out.println("==");
  142. for (int i = 0; i < ls1.size(); i++) {
  143. hs2.add(ls1.get(i).toString());
  144. }
  145. String txt = processHtmlBlock(SplitUtils.getMDTxt(p1));
  146. ts2.add(txt);
  147. gn.setType(GTBookConstants.MD_HTMLBLOCK);
  148. } else if (nd1 instanceof TableBlock) {
  149. TableBlock p1 = (TableBlock) nd1;
  150. String md = SplitUtils.getMDTxt(p1);
  151. hs2.add(md);
  152. String txt = getText(nd1);
  153. ts2.add(txt);
  154. gn.setType(GTBookConstants.MD_TABLE);
  155. } else if (nd1 instanceof Comments) {
  156. Comments p1 = (Comments) nd1;
  157. String md = SplitUtils.getMDTxt(p1);
  158. hs2.add(md);
  159. String txt = getText(nd1);
  160. ts2.add(txt);
  161. gn.setType(GTBookConstants.MD_COMMENTS);
  162. processComments(List.of(txt), idx);
  163. } else if (nd1 instanceof HtmlCommentBlock) {
  164. HtmlCommentBlock p1 = (HtmlCommentBlock) nd1;
  165. String md = SplitUtils.getMDTxt(p1);
  166. hs2.add(md);
  167. String txt = getText(nd1);
  168. ts2.add(txt);
  169. gn.setType(GTBookConstants.MD_COMMENTS);
  170. processComments(List.of(txt), idx);
  171. } else if (nd1 instanceof Paragraph) {
  172. Paragraph p1 = (Paragraph) nd1;
  173. Node firstChild = p1.getFirstChild();
  174. if (firstChild instanceof Comments) {
  175. List<String> comments = new ArrayList<>();
  176. StringBuilder sb = new StringBuilder();
  177. ReversiblePeekingIterable<Node> children = p1.getChildren();
  178. children.forEach(n -> {
  179. String t1 = getText(n);
  180. if (n instanceof Comments) {
  181. comments.add(t1);
  182. } else {
  183. sb.append(t1);
  184. }
  185. });
  186. String htm = SplitUtils.getMDTxt(p1);
  187. String txt = comments.get(0) + sb.toString() + "\n";
  188. hs2.add(htm);
  189. ts2.add(txt);
  190. processComments(comments, idx);
  191. } else if (firstChild instanceof Image) {
  192. StringBuilder sb = new StringBuilder();
  193. ReversiblePeekingIterable<Node> children = p1.getChildren();
  194. children.forEach(n -> {
  195. String t1 = getText(n);
  196. if (n instanceof Image) {
  197. if (SenUtil.isMeaninglessString(t1)) {
  198. } else {
  199. sb.append(t1);
  200. }
  201. } else {
  202. sb.append(t1);
  203. }
  204. });
  205. String htm = SplitUtils.getMDTxt(p1);
  206. String txt = sb.toString() + "\n";
  207. hs2.add(htm);
  208. ts2.add(txt);
  209. } else {
  210. String md = SplitUtils.getMDTxt(p1);
  211. //[下划线]{.underline}
  212. if (md.length() > 12) {
  213. String md1 = MDRegxUtil.convertUnderline(md);
  214. if (md1.length() != md.length()) {
  215. hs2.add(md1);
  216. String txt = processHtmlBlock(md1);
  217. ts2.add(txt);
  218. gn.setType(GTBookConstants.MD_HTMLBLOCK);
  219. }
  220. }
  221. }
  222. } else if (nd1 instanceof OrderedList) {
  223. String htm = SplitUtils.getMDTxt(nd1);
  224. String txt = getText(nd1);
  225. txt = txt.replaceAll("\n\n\n", "\n\n");
  226. hs2.add(htm);
  227. ts2.add(txt);
  228. } else if (nd1 instanceof BulletList) {
  229. // 2025.3.7 增加
  230. String htm = SplitUtils.getMDTxt(nd1);
  231. String txt = getText(nd1);
  232. txt = txt.replaceAll("\n\n", "\n");
  233. hs2.add(htm);
  234. ts2.add(txt);
  235. }else if (nd1 instanceof YamlFrontMatterBlock) {
  236. // 第一行的元数据
  237. YamlFrontMatterBlock p1 = (YamlFrontMatterBlock) nd1;
  238. String md = SplitUtils.getMDTxt(p1);
  239. hs2.add(md);
  240. String txt = "";
  241. ts2.add(txt);
  242. gn.setType(GTBookConstants.MD_FRONTMATTER);
  243. }
  244. if (ts2.size() <= 0) {
  245. // 2024.1.7 这部分用于pandoc转出的md的图片杂乱信息的处理,暂时注释掉
  246. // String htm = getMDTxt(nd1);
  247. // hs2 = MDRegxUtil.splitByBrackets(htm);
  248. // //[!, [C:\\Users\\kevin-pc\\Desktop\\2020\\index.png], (/images/1237763635937497090/resource/image9.png), ]
  249. // boolean isRepl = false;
  250. // if (hs2.size() >= 2) {
  251. // isRepl = processImageNote(hs2);
  252. // }
  253. //
  254. // //ts2 = new ArrayList<String>();
  255. // String txt = null;
  256. // if (isRepl) {
  257. // String mdTxt1 = GTBookUtil.listToString(hs2, "");
  258. // Node nnd1 = MDHelper.PARSER.parse(mdTxt1);
  259. // txt = getText(nnd1);
  260. // } else {
  261. // txt = getText(nd1);
  262. // }
  263. // ts2.add(txt);
  264. String htm = SplitUtils.getMDTxt(nd1);
  265. String txt = getText(nd1);
  266. hs2.add(htm);
  267. ts2.add(txt);
  268. }
  269. //// 2020.8.26 要进行长串的处理
  270. String txt = ts2.get(0);
  271. if (txt.length() > MAX_STRING_LEN) {
  272. List<String> ls = splitLongString(txt);
  273. ts2 = ls;
  274. }
  275. boolean isTooLong = false;
  276. for (String s : hs2) {
  277. if (s.length() > MAX_STRING_LEN) {
  278. isTooLong = true;
  279. break;
  280. }
  281. }
  282. if (isTooLong) {
  283. List<String> ls = new ArrayList<>();
  284. for (String s : hs2) {
  285. if (s.length() > MAX_STRING_LEN) {
  286. List<String> ls1 = splitLongString(s);
  287. ls.addAll(ls1);
  288. } else {
  289. ls.add(s);
  290. }
  291. }
  292. hs2 = ls;
  293. }
  294. MDElement v = new MDElement(ts2, hs2);
  295. gn.setValue(v);
  296. }
  297. protected String getHeadingTitle(Node nd, List<String> hs1) {
  298. boolean isRemoved = false;
  299. if (hs1.size() > 1) {
  300. isRemoved = SplitUtils.processHeadingNote(hs1);
  301. }
  302. String txt1 = null;
  303. if (isRemoved) {
  304. String mdTxt1 = GTBookUtil.listToString(hs1, "");
  305. Node nnd1 = MDHelper.PARSER.parse(mdTxt1);
  306. txt1 = getText(nnd1);
  307. } else {
  308. txt1 = getText(nd);
  309. }
  310. String title = SplitUtils.stripLast(txt1);
  311. return title;
  312. }
  313. //没有 章节 headings 的处理
  314. private void processNoHeadings(Map<String, Object> params) {
  315. int[] depthes = docTree.getDepthes();
  316. //int[] fathers = docTree.getFathers();
  317. List<Node> ls = docTree.getSource();
  318. String[] titles = new String[ls.size()];
  319. GTNode2[] bns = new GTNode2[depthes.length];
  320. this.nodes = bns;
  321. GTNode2 ch1 = GTNode2.buildNode(null, 1);
  322. ch1.setLabel(DEFALT_ROOT_LABEL);
  323. root.addChild(ch1);
  324. int baseNo = 1;
  325. TIntArrayList path = new TIntArrayList();
  326. int i1 = root.size() - 1;
  327. path.add(i1);
  328. for (int i = 0; i < depthes.length; i++) {
  329. Node nd = ls.get(i);
  330. GTNode2 gn = GTNode2.buildLeaf(i + baseNo);
  331. ch1.addChild(gn);
  332. processNode(nd, gn, i);
  333. bns[i] = gn;
  334. Object obj = gn.getValue();
  335. if (obj != null) {
  336. MDElement e = (MDElement) obj;
  337. List<String> ts = e.getText();
  338. String txt1 = null;
  339. if (ts != null && ts.size() > 0) {
  340. txt1 = ts.get(0);
  341. }
  342. titles[i] = SplitUtils.stripLast(txt1);
  343. gn.setLabel(getSub(titles[i]));
  344. //gn.setDepth(depthes[i]);
  345. gn.setDepth(1);
  346. }
  347. }
  348. }
  349. protected String getHtml(IElement v) {
  350. List<String> hs = v.getHtml();
  351. if (hs != null && hs.size() > 0) {
  352. return hs.get(0);
  353. }
  354. return null;
  355. }
  356. private String getTxt(IElement v) {
  357. List<String> hs = v.getText();
  358. if (hs != null && hs.size() > 0) {
  359. return hs.get(0);
  360. }
  361. return null;
  362. }
  363. public List<String> getOutline(boolean isIndented){
  364. List<String> outline = new ArrayList<>();
  365. if (nodes != null){
  366. if (isIndented){
  367. String spaces = " ";
  368. int minDepth = docTree.getMinLevel();
  369. int[] depthes = docTree.getDepthes();
  370. for (int i = 0; i < nodes.length; i++) {
  371. if (nodes[i].getType() == GTBookConstants.MD_HEADING) {
  372. //System.out.println(nodes[i].getMdNode().getChars().toString());
  373. String space = "";
  374. if (depthes[i] > minDepth) {
  375. space = spaces.substring(0, depthes[i] - minDepth);
  376. }
  377. String title = space+((IElement) nodes[i].getValue()).getHtml().get(0);
  378. outline.add(title);
  379. }
  380. }
  381. }else {
  382. for (int i = 0; i < nodes.length; i++) {
  383. if (nodes[i].getType() == GTBookConstants.MD_HEADING) {
  384. //System.out.println(nodes[i].getMdNode().getChars().toString());
  385. String title = ((IElement) nodes[i].getValue()).getHtml().get(0);
  386. outline.add(title);
  387. }
  388. }
  389. }
  390. }
  391. return outline;
  392. }
  393. // 改造自 DocRender2.java的 renderToBook()
  394. public void parse(DocTree dt, Map<String, Object> params) {
  395. this.docTree = dt;
  396. List<Node> ls = dt.getSource();
  397. Map<String, List<String>> metaMap = new HashMap<String, List<String>>();
  398. if (ls.size() > 0 && (ls.get(0) instanceof YamlFrontMatterBlock)) {
  399. //YamlFrontMatterBlock metaBlock = (YamlFrontMatterBlock) ls.remove(0);
  400. YamlFrontMatterBlock metaBlock = (YamlFrontMatterBlock) ls.get(0);
  401. metaMap = SplitUtils.processMetaBlock(metaBlock);
  402. metaMap.forEach((k, v) -> {
  403. metadata.put(k,v);
  404. });
  405. }
  406. int[] depthes = dt.getDepthes();
  407. int[] fathers = dt.getFathers();
  408. GTNode2 root = GTNode2.buildRoot("");
  409. GTNode2[] bns = new GTNode2[depthes.length];
  410. this.root = root;
  411. this.nodes = bns;
  412. if (dt.getHeadings() <= 0) {
  413. // 没有章节的处理
  414. processNoHeadings(params);
  415. return;
  416. }
  417. String[] titles = new String[ls.size()];
  418. int baseNo = 1;
  419. for (int i = 0; i < depthes.length; i++) {
  420. Node nd = ls.get(i);
  421. if (depthes[i] >= 0) {
  422. String htm1 = SplitUtils.getMDTxt(nd);
  423. List<String> hs1 = MDRegxUtil.splitByBrackets(htm1);
  424. String title = getHeadingTitle(nd, hs1);
  425. titles[i] = title;
  426. List<String> ts1 = new ArrayList<String>();
  427. ts1.add(title);
  428. MDElement v1 = new MDElement(ts1, hs1);
  429. GTNode2 gn = GTNode2.buildNode(null, i + baseNo);
  430. gn.setLabel(titles[i]);
  431. gn.setDepth(depthes[i]);
  432. gn.setValue(v1);
  433. gn.setType(GTBookConstants.MD_HEADING);
  434. gn.setMdNode(nd);
  435. bns[i] = gn;
  436. if (fathers[i] >= 0) {
  437. bns[fathers[i]].addChild(gn);
  438. } else {
  439. root.addChild(gn);
  440. }
  441. int j = i + 1;
  442. for (; j < fathers.length; j++) {
  443. if (depthes[j] < 0) {
  444. //基本上就 这 三种类型
  445. //Paragraph{} /HtmlBlock{} /OrderedList{}
  446. Node nd1 = ls.get(j);
  447. GTNode2 leaf = GTNode2.buildLeaf(j + baseNo);
  448. leaf.setDepth(depthes[i] + 1);
  449. leaf.setMdNode(nd1);
  450. gn.addChild(leaf);
  451. //TIntArrayList path1 = (TIntArrayList) path.clone();
  452. //path1.add(gn.size() - 1);
  453. processNode(nd1, leaf, j);
  454. bns[j] = leaf;
  455. } else {
  456. break;
  457. }
  458. }
  459. i = j - 1;
  460. } else {
  461. //System.out.println("===叶子节点===");
  462. int minLevel = docTree.getMinLevel();
  463. GTNode2 leaf = GTNode2.buildLeaf(i + baseNo);
  464. leaf.setDepth(minLevel);
  465. leaf.setMdNode(nd);
  466. root.addChild(leaf);
  467. //TIntArrayList path1 = (TIntArrayList) path.clone();
  468. //path1.add(gn.size() - 1);
  469. processNode(nd, leaf, i);
  470. bns[i] = leaf;
  471. if (fathers[i] >= 0) {
  472. bns[fathers[i]].addChild(leaf);
  473. } else {
  474. root.addChild(leaf);
  475. }
  476. }
  477. }
  478. }
  479. // public void splitNodeList(List<Node> ls, Map<String, Object> params) {
  480. // //没有 章节
  481. //
  482. // int[] depthes = new int[ls.size()];
  483. // int[] fathers = new int[ls.size()];
  484. //
  485. // String[] titles = new String[ls.size()];
  486. //
  487. // GTNode root = GTNode.buildRoot("");
  488. // GTNode[] bns = new GTNode[depthes.length];
  489. //
  490. // GTNode ch1 = GTNode.buildNode(null, 1);
  491. // ch1.setLabel(DEFALT_ROOT_LABEL);
  492. //
  493. // root.addChild(ch1);
  494. // TIntArrayList path = new TIntArrayList();
  495. // int i1 = root.size() - 1;
  496. // path.add(i1);
  497. //
  498. // for (int i = 0; i < depthes.length; i++) {
  499. // Node nd = ls.get(i);
  500. //
  501. // //MDElement v = processNode1(nd);
  502. // GTNode gn = GTNode.buildLeaf(i + 2);
  503. // ch1.addChild(gn);
  504. //
  505. // processNode(nd, gn, i);
  506. // Object obj = gn.getValue();
  507. // if (obj != null) {
  508. // MDElement e = (MDElement) obj;
  509. // List<String> ts = e.getText();
  510. // String txt1 = null;
  511. // if (ts != null && ts.size() > 0) {
  512. // txt1 = ts.get(0);
  513. // }
  514. // titles[i] = SplitUtils.stripLast(txt1);
  515. // gn.setLabel(getSub(titles[i]));
  516. // //gn.setDepth(depthes[i]);
  517. // gn.setDepth(1);
  518. //
  519. // }
  520. //
  521. // }
  522. // }
  523. public void parse(String md, BaseParameters params) {
  524. Map<String, Object> paramsMap = params.toMap();
  525. DocTree dt = MDHelper.parseToDocTree(md, paramsMap);
  526. dt.setText(md);
  527. parse(dt, paramsMap);
  528. }
  529. }