|
|
@@ -1,4 +1,4 @@
|
|
|
-package com.giantan.mds;
|
|
|
+package org.cnnlp.data.md;
|
|
|
|
|
|
import com.vladsch.flexmark.ast.Text;
|
|
|
import com.vladsch.flexmark.util.ast.Node;
|
|
|
@@ -7,23 +7,23 @@ import com.vladsch.flexmark.util.ast.TextCollectingVisitor;
|
|
|
import com.vladsch.flexmark.util.ast.VisitHandler;
|
|
|
import gnu.trove.TIntArrayList;
|
|
|
import org.cnnlp.data.book.GTBookUtil;
|
|
|
-import org.cnnlp.data.md.DocTree;
|
|
|
-import org.cnnlp.data.md.MDHelper;
|
|
|
-import org.cnnlp.data.md.MDRegxUtil;
|
|
|
+import org.cnnlp.data.document.GDocConstants;
|
|
|
import org.cnnlp.data.splitter.SplitUtils;
|
|
|
import org.cnnlp.data.util.BaseParameters;
|
|
|
|
|
|
-import java.io.IOException;
|
|
|
-import java.nio.file.Files;
|
|
|
-import java.nio.file.Path;
|
|
|
import java.util.*;
|
|
|
|
|
|
public class MdSearcher {
|
|
|
|
|
|
+ public final static String KEY_MATCH = "match";
|
|
|
+ public final static String KEY_HEADING = "headings";
|
|
|
TextCollectingVisitor textr;
|
|
|
|
|
|
DocTree dt;
|
|
|
|
|
|
+ transient String[] txts;
|
|
|
+ transient List<Line> flatList;
|
|
|
+
|
|
|
public MdSearcher() {
|
|
|
TextCollectingVisitor textCollectingVisitor = new TextCollectingVisitor();
|
|
|
this.textr = textCollectingVisitor;
|
|
|
@@ -34,10 +34,15 @@ public class MdSearcher {
|
|
|
this.dt = dt;
|
|
|
}
|
|
|
|
|
|
+ public String getText() {
|
|
|
+ return dt.getText();
|
|
|
+ }
|
|
|
+
|
|
|
public static int[] match(String text, String str) {
|
|
|
//text = text.replace("\r\n", "\n");
|
|
|
str = str.replace("\r\n", "\n");
|
|
|
String[] ss = str.split("\n");
|
|
|
+
|
|
|
int count = 0;
|
|
|
|
|
|
int startOffset = 0;
|
|
|
@@ -115,7 +120,7 @@ public class MdSearcher {
|
|
|
// }
|
|
|
// }
|
|
|
///2025.6.15 纯粹是因为md处理过程中,space的不同引起匹配不上 ,特殊处理一下
|
|
|
- if (!target[i].startsWith(" ")){
|
|
|
+ if (!target[i].startsWith(" ")) {
|
|
|
for (int j = nowP; j < text.length(); j++) {
|
|
|
char c = text.charAt(j);
|
|
|
if (c != '\n' && c != '\r' && c != ' ') {
|
|
|
@@ -133,7 +138,7 @@ public class MdSearcher {
|
|
|
isMatched = false;
|
|
|
break;
|
|
|
}
|
|
|
- }else {
|
|
|
+ } else {
|
|
|
isMatched = false;
|
|
|
break;
|
|
|
}
|
|
|
@@ -194,7 +199,7 @@ public class MdSearcher {
|
|
|
return title;
|
|
|
}
|
|
|
|
|
|
- protected String getTttle(Node nd){
|
|
|
+ protected String getTttle(Node nd) {
|
|
|
String htm1 = SplitUtils.getMDTxt(nd);
|
|
|
List<String> hs1 = MDRegxUtil.splitByBrackets(htm1);
|
|
|
|
|
|
@@ -260,7 +265,7 @@ public class MdSearcher {
|
|
|
int[] fathers = dt.getFathers();
|
|
|
int[] depthes = dt.getDepthes();
|
|
|
int nIdx1 = nodeR[0];
|
|
|
- int nIdx2 = fathers.length-1;
|
|
|
+ int nIdx2 = fathers.length - 1;
|
|
|
int fIdx = fathers[nIdx1];
|
|
|
//int eIdx = fIdx;
|
|
|
if (fIdx >= 0) {
|
|
|
@@ -283,7 +288,7 @@ public class MdSearcher {
|
|
|
return rets;
|
|
|
}
|
|
|
|
|
|
- public String getMdSource(int[] nodeR){
|
|
|
+ public String getMdSource(int[] nodeR) {
|
|
|
if (nodeR == null) return null;
|
|
|
List<Node> nodes = dt.getSource();
|
|
|
int startOffset = nodes.get(nodeR[0]).getStartOffset();
|
|
|
@@ -307,7 +312,7 @@ public class MdSearcher {
|
|
|
return s2;
|
|
|
}
|
|
|
|
|
|
- public Map<String,Object> searchAndHeadings(String str) {
|
|
|
+ public Map<String, Object> searchAndHeadings(String str) {
|
|
|
str = trimLeadingNewlines(str);
|
|
|
int[] mr = match(dt.getText(), str);
|
|
|
if (mr == null) return null;
|
|
|
@@ -320,70 +325,70 @@ public class MdSearcher {
|
|
|
String s2 = getMdSource(fatherRegion);
|
|
|
//System.out.println(s2);
|
|
|
List<String> headings = getHeadings(fatherRegion[0]);
|
|
|
- Map<String, Object> ret = Map.of("match", s2, "headings", headings);
|
|
|
+ Map<String, Object> ret = Map.of(KEY_MATCH, s2, KEY_HEADING, headings);
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
|
- protected List<String> getHeadings(int nodeIdx){
|
|
|
+ protected List<String> getHeadings(int nodeIdx) {
|
|
|
int[] fathers = dt.getFathers();
|
|
|
List<Node> nodes = dt.getSource();
|
|
|
ArrayList<String> ls = new ArrayList<>();
|
|
|
int idx = nodeIdx;
|
|
|
int[] depthes = dt.getDepthes();
|
|
|
- if (depthes[idx]>=0){
|
|
|
+ if (depthes[idx] >= 0) {
|
|
|
String t = getTttle(nodes.get(idx));
|
|
|
- ls.add(0,t);
|
|
|
+ ls.add(0, t);
|
|
|
}
|
|
|
- while (idx >=0 && fathers[idx]>=0){
|
|
|
+ while (idx >= 0 && fathers[idx] >= 0) {
|
|
|
String t = getTttle(nodes.get(fathers[idx]));
|
|
|
- ls.add(0,t);
|
|
|
+ ls.add(0, t);
|
|
|
idx = fathers[idx];
|
|
|
}
|
|
|
return ls;
|
|
|
}
|
|
|
|
|
|
- public void outTitles(){
|
|
|
+ public void outTitles() {
|
|
|
int[] depthes = dt.getDepthes();
|
|
|
List<Node> nodes = dt.getSource();
|
|
|
for (int i = 0; i < depthes.length; i++) {
|
|
|
- if (depthes[i]>= 0){
|
|
|
+ if (depthes[i] >= 0) {
|
|
|
String t = getTttle(nodes.get(i));
|
|
|
- System.out.println("title="+t);
|
|
|
+ System.out.println("title=" + t);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
|
|
|
- private int matchTitle(int idx, String[] titles,String[] headings){
|
|
|
+ private int matchTitle(int idx, String[] titles, String[] headings) {
|
|
|
int[] father2 = dt.getFather(idx);
|
|
|
boolean ok = true;
|
|
|
- if (headings.length > father2.length){
|
|
|
+ if (headings.length > father2.length) {
|
|
|
return -1;
|
|
|
}
|
|
|
for (int i = 0; i < headings.length; i++) {
|
|
|
- if (!headings[i].equals(titles[father2[headings.length-i-1]])){
|
|
|
+ if (!headings[i].equals(titles[father2[headings.length - i - 1]])) {
|
|
|
ok = false;
|
|
|
break;
|
|
|
}
|
|
|
}
|
|
|
- if (ok){
|
|
|
+ if (ok) {
|
|
|
return father2[0];
|
|
|
}
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
// 取出 该标题下的所有内容
|
|
|
- private int[] getNodeContentRegion(int fIdx){
|
|
|
+ private int[] getNodeContentRegion(int fIdx) {
|
|
|
int[] rets = new int[2];
|
|
|
int[] fathers = dt.getFathers();
|
|
|
int[] depthes = dt.getDepthes();
|
|
|
|
|
|
int nIdx1 = fIdx;
|
|
|
- int nIdx2 = fathers.length-1;
|
|
|
+ int nIdx2 = fathers.length - 1;
|
|
|
if (fIdx >= 0) {
|
|
|
for (int i = fIdx + 1; i < fathers.length; i++) {
|
|
|
if (depthes[i] == depthes[fIdx]) {
|
|
|
- break;
|
|
|
+ break;
|
|
|
} else {
|
|
|
nIdx2 = i;
|
|
|
}
|
|
|
@@ -394,63 +399,71 @@ public class MdSearcher {
|
|
|
return rets;
|
|
|
}
|
|
|
|
|
|
- protected Map<String,Object> doHeadingMatch(int anchor,String[] titles ,String[] headings){
|
|
|
- Map<String,Object> rets = new HashMap<>();
|
|
|
- if (headings.length == 1){
|
|
|
+ protected Map<String, Object> doHeadingMatch(int anchor, String[] titles, String[] headings) {
|
|
|
+ Map<String, Object> rets = new HashMap<>();
|
|
|
+ if (headings.length == 1) {
|
|
|
int[] nodeContentRegion = getNodeContentRegion(anchor);
|
|
|
String s2 = getMdSource(nodeContentRegion);
|
|
|
//System.out.println(s2);
|
|
|
- rets.put("match",s2);
|
|
|
- }else {
|
|
|
- String[] h2 = Arrays.copyOf(headings,headings.length-1);
|
|
|
- int m2 = matchTitle(anchor, titles,h2);
|
|
|
+ rets.put(KEY_MATCH, s2);
|
|
|
+ } else {
|
|
|
+ String[] h2 = Arrays.copyOf(headings, headings.length - 1);
|
|
|
+ int m2 = matchTitle(anchor, titles, h2);
|
|
|
//System.out.println("m2="+m2);
|
|
|
- if (m2 <0){
|
|
|
- return null;
|
|
|
- }else{
|
|
|
+ if (m2 < 0) {
|
|
|
+ return null;
|
|
|
+ } else {
|
|
|
int[] nodeContentRegion = getNodeContentRegion(anchor);
|
|
|
String s2 = getMdSource(nodeContentRegion);
|
|
|
//System.out.println(s2);
|
|
|
- rets.put("match",s2);
|
|
|
+ rets.put(KEY_MATCH, s2);
|
|
|
}
|
|
|
}
|
|
|
return rets;
|
|
|
}
|
|
|
|
|
|
- public Map<String,Object> searchByHeadings(String[] headings){
|
|
|
- Map<String,Object> rets = null;
|
|
|
+ public Map<String, Object> searchByHeadings(String[] headings) {
|
|
|
+ Map<String, Object> rets = null;
|
|
|
+
|
|
|
+ // 如果 headings 是 "/" 表示 获取全文
|
|
|
+ if (headings.length == 1 && headings[0].equals("/")) {
|
|
|
+ String s = dt.getText();
|
|
|
+ rets = new HashMap<>();
|
|
|
+ rets.put(KEY_MATCH, s);
|
|
|
+ return rets;
|
|
|
+ }
|
|
|
|
|
|
int[] depthes = dt.getDepthes();
|
|
|
String[] titles = new String[depthes.length];
|
|
|
|
|
|
List<Node> nodes = dt.getSource();
|
|
|
for (int i = 0; i < depthes.length; i++) {
|
|
|
- if (depthes[i]>= 0){
|
|
|
+ if (depthes[i] >= 0) {
|
|
|
String t = getTttle(nodes.get(i));
|
|
|
//System.out.println("title="+t);
|
|
|
titles[i] = t;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- String lastHeading = headings[headings.length-1];
|
|
|
+ String lastHeading = headings[headings.length - 1];
|
|
|
TIntArrayList mi = new TIntArrayList();
|
|
|
for (int i = 0; i < titles.length; i++) {
|
|
|
- if (titles[i]!= null && titles[i].equals(lastHeading)){
|
|
|
+ if (titles[i] != null && titles[i].equals(lastHeading)) {
|
|
|
mi.add(i);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
//String[] h2 = Arrays.copyOf(headings,headings.length-1);
|
|
|
|
|
|
- if (mi.size() == 0){
|
|
|
+ if (mi.size() == 0) {
|
|
|
return null;
|
|
|
- }else if (mi.size() == 1){
|
|
|
- rets = doHeadingMatch(mi.getQuick(0),titles,headings);
|
|
|
+ } else if (mi.size() == 1) {
|
|
|
+ rets = doHeadingMatch(mi.getQuick(0), titles, headings);
|
|
|
// if (headings.length == 1){
|
|
|
// int[] nodeContentRegion = getNodeContentRegion(mi.getQuick(0));
|
|
|
// String s2 = getMdSource(nodeContentRegion);
|
|
|
// //System.out.println(s2);
|
|
|
-// rets.put("match",s2);
|
|
|
+// rets.put(KEY_MATCH,s2);
|
|
|
// }else {
|
|
|
// int m2 = matchTitle(mi.getQuick(0), titles,h2);
|
|
|
// System.out.println("m2="+m2);
|
|
|
@@ -460,20 +473,208 @@ public class MdSearcher {
|
|
|
// int[] nodeContentRegion = getNodeContentRegion(mi.getQuick(0));
|
|
|
// String s2 = getMdSource(nodeContentRegion);
|
|
|
// //System.out.println(s2);
|
|
|
-// rets.put("match",s2);
|
|
|
+// rets.put(KEY_MATCH,s2);
|
|
|
// }
|
|
|
// }
|
|
|
|
|
|
- }else {
|
|
|
+ } else {
|
|
|
for (int i = 0; i < mi.size(); i++) {
|
|
|
int nowL = mi.getQuick(i);
|
|
|
- int[] father2 = dt.getFather(nowL);
|
|
|
+ //int[] father2 = dt.getFather(nowL);
|
|
|
+ Map<String, Object> rets1 = doHeadingMatch(nowL, titles, headings);
|
|
|
+ if (rets1 != null && rets1.get(KEY_MATCH) != null) {
|
|
|
+ rets = rets1;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return rets;
|
|
|
+ }
|
|
|
+
|
|
|
+ private static String[] strip(String[] ss) {
|
|
|
+ List<String> ls = new ArrayList<>();
|
|
|
+ for (int i = 0; i < ss.length; i++) {
|
|
|
+ if (!ss[i].isEmpty()) {
|
|
|
+ ls.add(ss[i]);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return ls.toArray(new String[ls.size()]);
|
|
|
+ }
|
|
|
+
|
|
|
+ private static boolean isEqual(String s1, String s2) {
|
|
|
+ if (s2.startsWith(s1)) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 在 String[] ss2 中找到 ss1 的连续子序列(忽略 ss2 中的空字符串 ""),并返回其在 ss2 中的 起始位置 和 终止位置(索引)。
|
|
|
+ public static int[] findSubArrayRange(String[] ss1, String[] ss2, int offset) {
|
|
|
+ ss1 = strip(ss1);
|
|
|
+ for (int i = offset; i < ss2.length; i++) {
|
|
|
+ int idx1 = 0;
|
|
|
+ int idx2 = i;
|
|
|
+ int start = -1;
|
|
|
+ int end = -1;
|
|
|
+
|
|
|
+ while (idx2 < ss2.length && idx1 < ss1.length) {
|
|
|
+ if (ss2[idx2].isEmpty()) {
|
|
|
+ idx2++; // 忽略空串
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+// if (!ss2[idx2].equals(ss1[idx1])) {
|
|
|
+// break; // 匹配失败,从下一个 i 开始
|
|
|
+// }
|
|
|
+
|
|
|
+ if (!isEqual(ss1[idx1], ss2[idx2])) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
|
|
|
+ if (start == -1) {
|
|
|
+ start = idx2;
|
|
|
+ }
|
|
|
+ end = idx2;
|
|
|
+ idx1++;
|
|
|
+ idx2++;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (idx1 == ss1.length) {
|
|
|
+ return new int[]{start, end}; // 返回实际在 ss2 中的起止索引
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ return null; // 没有匹配
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
/**
 * One non-empty text line together with the index of the array entry it was
 * split from. Instances are never modified after construction.
 */
class Line {
    /** The line's text. */
    final String text;
    /** Index of the entry in the original array that this line came from. */
    final int ss2Index;

    public Line(String text, int ss2Index) {
        this.text = text;
        this.ss2Index = ss2Index;
    }
}
|
|
|
+
|
|
|
+ public int[] findSubArrayRangeWithLineSplit(String[] ss1, String[] ss2, int offset) {
|
|
|
+ // 构造逻辑行列表:每一行来自 ss2[i] 的第 j 段
|
|
|
+
|
|
|
+ ss1 = strip(ss1);
|
|
|
+
|
|
|
+ // 尝试在 flatList 中找到 ss1 匹配子序列
|
|
|
+ for (int i = offset; i <= flatList.size() - ss1.length; i++) {
|
|
|
+ boolean matched = true;
|
|
|
+ for (int j = 0; j < ss1.length; j++) {
|
|
|
+ if (!flatList.get(i + j).text.equals(ss1[j])) {
|
|
|
+ matched = false;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (matched) {
|
|
|
+ int start = flatList.get(i).ss2Index;
|
|
|
+ int end = flatList.get(i + ss1.length - 1).ss2Index;
|
|
|
+ return new int[]{start, end};
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return null; // 未找到
|
|
|
+ }
|
|
|
+
|
|
|
+ // 找出 txt 在md文件中的位置
|
|
|
+ public Map<String, Object> searchByPlainTxt(String txt, int offsetMd) {
|
|
|
+ Map<String, Object> rets = new HashMap<>();
|
|
|
+ List<Node> nodes = dt.getSource();
|
|
|
+ if (txts == null) {
|
|
|
+ txts = new String[nodes.size()];
|
|
|
+ for (int i = 0; i < nodes.size(); i++) {
|
|
|
+ txts[i] = getText(nodes.get(i));
|
|
|
+ //System.out.println(txts[i]);
|
|
|
+ }
|
|
|
+
|
|
|
+ List<Line> fl = new ArrayList<>();
|
|
|
+ for (int i = 0; i < txts.length; i++) {
|
|
|
+ String s = txts[i];
|
|
|
+ if (s.isEmpty()) continue;
|
|
|
+
|
|
|
+ String[] lines = s.split("\n");
|
|
|
+ for (String line : lines) {
|
|
|
+ if (!line.isEmpty()) {
|
|
|
+ fl.add(new Line(line, i));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ flatList = fl;
|
|
|
+ }
|
|
|
+
|
|
|
+ int idx = 0;
|
|
|
+ for (int i = 0; i < txts.length; i++) {
|
|
|
+ Node node = nodes.get(i);
|
|
|
+ int endOffset = node.getEndOffset();
|
|
|
+ int startOffset = node.getStartOffset();
|
|
|
+ if (offsetMd < endOffset) {
|
|
|
+ idx = i;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ for (int i = idx; i < txts.length; i++) {
|
|
|
+ if (txts[i].length() > txt.length()) {
|
|
|
+ int i1 = txts[i].indexOf(txt);
|
|
|
+ if (i1 > 0) {
|
|
|
+ rets.put(GDocConstants.START_OFFSET, nodes.get(i).getStartOffset());
|
|
|
+ rets.put(GDocConstants.END_OFFSET, nodes.get(i).getEndOffset());
|
|
|
+ rets.put("inLine", i1);
|
|
|
+ return rets;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ int fIdx = 0;
|
|
|
+ for (int i = 0; i < flatList.size(); i++) {
|
|
|
+ if (flatList.get(i).ss2Index == idx) {
|
|
|
+ fIdx = i;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ String[] ss = txt.split("\n");
|
|
|
+ //int[] range = findSubArrayRange(ss, txts, idx);
|
|
|
+ int[] range = findSubArrayRangeWithLineSplit(ss, txts, fIdx);
|
|
|
+
|
|
|
+
|
|
|
+ if (range != null) {
|
|
|
+ int startOffset = nodes.get(range[0]).getStartOffset();
|
|
|
+ int endOffset = nodes.get(range[1]).getEndOffset();
|
|
|
+ rets.put(GDocConstants.START_OFFSET, startOffset);
|
|
|
+ rets.put(GDocConstants.END_OFFSET, endOffset);
|
|
|
+ }
|
|
|
+
|
|
|
+// if (txt.length() <= txts[idx].length() && offsetMd + txt.length() <= nodes.get(idx).getEndOffset()) {
|
|
|
+// if (txts[idx].indexOf(txt) >= 0) {
|
|
|
+// rets.put(GDocConstants.START_OFFSET, nodes.get(idx).getStartOffset());
|
|
|
+// rets.put(GDocConstants.END_OFFSET, nodes.get(idx).getEndOffset());
|
|
|
+// }
|
|
|
+// } else {
|
|
|
+// String[] ss = txt.split("\n");
|
|
|
+// //int[] range = findSubArrayRange(ss, txts, idx);
|
|
|
+// int[] range = findSubArrayRangeWithLineSplit(ss, txts, idx);
|
|
|
+//
|
|
|
+//
|
|
|
+// if (range != null) {
|
|
|
+// int startOffset = nodes.get(range[0]).getStartOffset();
|
|
|
+// int endOffset = nodes.get(range[1]).getEndOffset();
|
|
|
+// rets.put(GDocConstants.START_OFFSET, startOffset);
|
|
|
+// rets.put(GDocConstants.END_OFFSET, endOffset);
|
|
|
+// }
|
|
|
+// }
|
|
|
return rets;
|
|
|
}
|
|
|
+
|
|
|
+
|
|
|
// public static void main(String[] args) throws IOException {
|
|
|
// String f = "D:\\data\\乙烯\\乙烯1.md";
|
|
|
// String content = Files.readString(Path.of(f));
|