Sfoglia il codice sorgente

增加了MdChunking,GChatClient

dwp 8 mesi fa
parent
commit
7552639c9d
61 ha cambiato i file con 2754 aggiunte e 236 eliminazioni
  1. 1 0
      gfs/src/main/java/com/giantan/gfs/storer/impl/S3Storer.java
  2. 1 1
      gtbook/src/main/java/org/cnnlp/data/book/GTNode.java
  3. 24 6
      gtbook/src/main/java/org/cnnlp/data/document/GDocConstants.java
  4. 253 52
      gtbook/src/main/java/org/cnnlp/data/md/MdSearcher.java
  5. 1 0
      gtbook/src/main/java/org/cnnlp/data/splitter/BaseMdParser.java
  6. 3 1
      gtbook/src/main/java/org/cnnlp/data/splitter/FaqMdSplitter.java
  7. 24 0
      gtbook/src/main/java/org/cnnlp/data/splitter/IMdChunking.java
  8. 457 0
      gtbook/src/main/java/org/cnnlp/data/splitter/MdChunking.java
  9. 35 6
      gtbook/src/main/java/org/cnnlp/data/splitter/SimpleMdSplitter.java
  10. 38 2
      gtbook/src/main/java/org/cnnlp/data/splitter/SplitUtils.java
  11. 84 0
      gtbook/src/main/java/org/cnnlp/data/splitter/SubSequence.java
  12. 198 0
      gtbook/src/main/java/org/cnnlp/data/splitter/TaggedMdSplitter.java
  13. 3 2
      gtbook/src/main/java/org/cnnlp/data/splitter/model/ChunkMdParser.java
  14. 2 1
      gtbook/src/main/java/org/cnnlp/data/splitter/model/ChunkSample.java
  15. 1 1
      gtbook/src/main/java/org/cnnlp/data/splitter/model/MdChunkModel.java
  16. 2 3
      gtbook/src/main/java/org/cnnlp/data/splitter/model/MdChunkTrainer.java
  17. 1 1
      gtbook/src/main/java/org/cnnlp/data/splitter/model/TextUnitMlModel.java
  18. 1 1
      gtbook/src/main/java/org/cnnlp/data/splitter/model/TextUtitPatternModel.java
  19. 3 3
      gtbook/src/test/java/org/cnnlp/data/splitter/MdChunkSplitter1.java
  20. 37 0
      gtbook/src/test/java/org/cnnlp/data/splitter/MdChunkingTest.java
  21. 1 0
      gtbook/src/test/java/org/cnnlp/data/splitter/SplitTest.java
  22. 4 4
      gtbook/src/test/java/org/cnnlp/data/splitter/SplitTools.java
  23. 34 0
      gtbook/src/test/java/org/cnnlp/data/splitter/SplitUtilsTest.java
  24. 28 0
      gtbook/src/test/java/org/cnnlp/data/splitter/TaggedMdSplitterTest.java
  25. 1 1
      gtbook/src/test/java/org/cnnlp/data/splitter/model/MdChunkTrainerTest.java
  26. 34 1
      server/pom.xml
  27. 7 3
      server/src/main/java/com/giantan/data/mds/MdsApplication.java
  28. 186 0
      server/src/main/java/com/giantan/data/mds/bot/ExtractPrompts.java
  29. 71 0
      server/src/main/java/com/giantan/data/mds/bot/GChatClient.java
  30. 88 0
      server/src/main/java/com/giantan/data/mds/bot/JingluoPrompts.java
  31. 136 0
      server/src/main/java/com/giantan/data/mds/chunk/DynamicChunkRepository.java
  32. 2 2
      server/src/main/java/com/giantan/data/mds/chunk/MdChunkController.java
  33. 2 2
      server/src/main/java/com/giantan/data/mds/chunk/MdChunkRepository.java
  34. 63 0
      server/src/main/java/com/giantan/data/mds/config/ChatClientConfig.java
  35. 10 1
      server/src/main/java/com/giantan/data/mds/config/TaskConfiguration.java
  36. 65 0
      server/src/main/java/com/giantan/data/mds/controller/ChatController.java
  37. 5 18
      server/src/main/java/com/giantan/data/mds/controller/DownloadController.java
  38. 72 23
      server/src/main/java/com/giantan/data/mds/controller/TaskController.java
  39. 2 0
      server/src/main/java/com/giantan/data/mds/repository/MdDynamicChunkRepository.java
  40. 5 1
      server/src/main/java/com/giantan/data/mds/service/FileProcessingService.java
  41. 1 0
      server/src/main/java/com/giantan/data/mds/service/IDynamicDocService.java
  42. 15 0
      server/src/main/java/com/giantan/data/mds/service/IMdChunksService.java
  43. 15 0
      server/src/main/java/com/giantan/data/mds/service/IMdFilesService.java
  44. 1 2
      server/src/main/java/com/giantan/data/mds/service/MdCache.java
  45. 71 0
      server/src/main/java/com/giantan/data/mds/service/MdChunksService.java
  46. 1 1
      server/src/main/java/com/giantan/data/mds/service/MdDocsService.java
  47. 57 0
      server/src/main/java/com/giantan/data/mds/service/MdFilesService.java
  48. 39 0
      server/src/main/java/com/giantan/data/mds/task/impl/BaseTaskHandler.java
  49. 189 14
      server/src/main/java/com/giantan/data/mds/task/impl/SliceTaskHandler.java
  50. 16 6
      server/src/main/java/com/giantan/data/tasks/ITaskManager.java
  51. 19 2
      server/src/main/java/com/giantan/data/tasks/TaskContext.java
  52. 3 1
      server/src/main/java/com/giantan/data/tasks/TaskEvent.java
  53. 1 1
      server/src/main/java/com/giantan/data/tasks/TaskEventListener.java
  54. 75 18
      server/src/main/java/com/giantan/data/tasks/TaskManager.java
  55. 35 35
      server/src/main/java/com/giantan/data/tasks/controller/TaskController.java
  56. 1 0
      server/src/main/java/com/giantan/data/tasks/repository/DynamicTaskRepository.java
  57. 42 7
      server/src/main/java/com/giantan/data/tasks/repository/PersistentTaskManager.java
  58. 1 0
      server/src/main/java/com/giantan/data/tasks/repository/TaskStatusHistory.java
  59. 16 1
      server/src/main/resources/application.yml
  60. 16 0
      server/src/test/java/com/giantan/data/mds/MdsApplicationTests.java
  61. 155 12
      tools/src/test/java/com/giantan/mds/MdSearcherTest.java

+ 1 - 0
gfs/src/main/java/com/giantan/gfs/storer/impl/S3Storer.java

@@ -537,6 +537,7 @@ public class S3Storer extends AbstractStorer {
                     .region(Region.US_EAST_1)
                     .credentialsProvider(credentialsProvider)
                     .endpointOverride(URI.create(endpoint))
+                    .forcePathStyle(true)  // 2025.7.3
                     .build();
         } catch (Exception e) {
             log.error(e.getMessage());

+ 1 - 1
gtbook/src/main/java/org/cnnlp/data/book/GTNode.java

@@ -68,7 +68,7 @@ public class GTNode implements INode,Externalizable{
 	}
 
 	// 指的文本块类型  paragraph/table/html/image 
-	public int getType() {
+	public int  getType() {
 		return type;
 	}
 

+ 24 - 6
gtbook/src/main/java/org/cnnlp/data/document/GDocConstants.java

@@ -2,10 +2,28 @@ package org.cnnlp.data.document;
 
 public class GDocConstants {
 
-    public static final String CURRENT_SECTION_TOC ="currentSectionToc";
-    public static final String RAW_CONTENT = "rawContent";
-    public static final String FILE_NAME = "fileName";
-    public static final String FROM_IDX = "fromIdx";
-    public static final String TO_IDX = "toIdx";
-    public static final String FIRST_HEADING = "heading";
+    //    public static final String CURRENT_SECTION_TOC ="currentSectionToc";
+//    public static final String RAW_CONTENT = "rawContent";
+//    public static final String FILE_NAME = "fileName";
+//    public static final String FROM_IDX = "fromIdx";
+//    public static final String TO_IDX = "toIdx";
+//    public static final String FIRST_HEADING = "heading";
+//
+//    // 在原始的text中,char的起始和终止pos
+//    public static final String START_OFFSET = "startOffset";
+//    public static final String END_OFFSET = "endOffset";
+//
+//    public static final String PARAGRAPH_CHUNK_IDX = "paragraphChunkIdx";
+    public static final String CURRENT_SECTION_TOC = "_currentSectionToc";
+    public static final String RAW_CONTENT = "_rawContent";
+    public static final String FILE_NAME = "_fileName";
+    public static final String FROM_IDX = "_fromIdx";
+    public static final String TO_IDX = "_toIdx";
+    public static final String FIRST_HEADING = "_heading";
+
+    // 在原始的text中,char的起始和终止pos
+    public static final String START_OFFSET = "_startOffset";
+    public static final String END_OFFSET = "_endOffset";
+
+    public static final String PARAGRAPH_CHUNK_IDX = "_paragraphChunkIdx";
 }

+ 253 - 52
tools/src/main/java/com/giantan/mds/MdSearcher.java → gtbook/src/main/java/org/cnnlp/data/md/MdSearcher.java

@@ -1,4 +1,4 @@
-package com.giantan.mds;
+package org.cnnlp.data.md;
 
 import com.vladsch.flexmark.ast.Text;
 import com.vladsch.flexmark.util.ast.Node;
@@ -7,23 +7,23 @@ import com.vladsch.flexmark.util.ast.TextCollectingVisitor;
 import com.vladsch.flexmark.util.ast.VisitHandler;
 import gnu.trove.TIntArrayList;
 import org.cnnlp.data.book.GTBookUtil;
-import org.cnnlp.data.md.DocTree;
-import org.cnnlp.data.md.MDHelper;
-import org.cnnlp.data.md.MDRegxUtil;
+import org.cnnlp.data.document.GDocConstants;
 import org.cnnlp.data.splitter.SplitUtils;
 import org.cnnlp.data.util.BaseParameters;
 
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
 import java.util.*;
 
 public class MdSearcher {
 
+    public final static String KEY_MATCH = "match";
+    public final static String KEY_HEADING = "headings";
     TextCollectingVisitor textr;
 
     DocTree dt;
 
+    transient String[] txts;
+    transient List<Line> flatList;
+
     public MdSearcher() {
         TextCollectingVisitor textCollectingVisitor = new TextCollectingVisitor();
         this.textr = textCollectingVisitor;
@@ -34,10 +34,15 @@ public class MdSearcher {
         this.dt = dt;
     }
 
+    public String getText() {
+        return dt.getText();
+    }
+
     public static int[] match(String text, String str) {
         //text = text.replace("\r\n", "\n");
         str = str.replace("\r\n", "\n");
         String[] ss = str.split("\n");
+
         int count = 0;
 
         int startOffset = 0;
@@ -115,7 +120,7 @@ public class MdSearcher {
 //                    }
 //                }
                 ///2025.6.15 纯粹是因为md处理过程中,space的不同引起匹配不上 ,特殊处理一下
-                if (!target[i].startsWith(" ")){
+                if (!target[i].startsWith(" ")) {
                     for (int j = nowP; j < text.length(); j++) {
                         char c = text.charAt(j);
                         if (c != '\n' && c != '\r' && c != ' ') {
@@ -133,7 +138,7 @@ public class MdSearcher {
                         isMatched = false;
                         break;
                     }
-                }else {
+                } else {
                     isMatched = false;
                     break;
                 }
@@ -194,7 +199,7 @@ public class MdSearcher {
         return title;
     }
 
-    protected String getTttle(Node nd){
+    protected String getTttle(Node nd) {
         String htm1 = SplitUtils.getMDTxt(nd);
         List<String> hs1 = MDRegxUtil.splitByBrackets(htm1);
 
@@ -260,7 +265,7 @@ public class MdSearcher {
         int[] fathers = dt.getFathers();
         int[] depthes = dt.getDepthes();
         int nIdx1 = nodeR[0];
-        int nIdx2 = fathers.length-1;
+        int nIdx2 = fathers.length - 1;
         int fIdx = fathers[nIdx1];
         //int eIdx = fIdx;
         if (fIdx >= 0) {
@@ -283,7 +288,7 @@ public class MdSearcher {
         return rets;
     }
 
-    public String getMdSource(int[] nodeR){
+    public String getMdSource(int[] nodeR) {
         if (nodeR == null) return null;
         List<Node> nodes = dt.getSource();
         int startOffset = nodes.get(nodeR[0]).getStartOffset();
@@ -307,7 +312,7 @@ public class MdSearcher {
         return s2;
     }
 
-    public Map<String,Object> searchAndHeadings(String str) {
+    public Map<String, Object> searchAndHeadings(String str) {
         str = trimLeadingNewlines(str);
         int[] mr = match(dt.getText(), str);
         if (mr == null) return null;
@@ -320,70 +325,70 @@ public class MdSearcher {
         String s2 = getMdSource(fatherRegion);
         //System.out.println(s2);
         List<String> headings = getHeadings(fatherRegion[0]);
-        Map<String, Object> ret = Map.of("match", s2, "headings", headings);
+        Map<String, Object> ret = Map.of(KEY_MATCH, s2, KEY_HEADING, headings);
         return ret;
     }
 
-    protected List<String> getHeadings(int nodeIdx){
+    protected List<String> getHeadings(int nodeIdx) {
         int[] fathers = dt.getFathers();
         List<Node> nodes = dt.getSource();
         ArrayList<String> ls = new ArrayList<>();
         int idx = nodeIdx;
         int[] depthes = dt.getDepthes();
-        if (depthes[idx]>=0){
+        if (depthes[idx] >= 0) {
             String t = getTttle(nodes.get(idx));
-            ls.add(0,t);
+            ls.add(0, t);
         }
-        while (idx >=0 && fathers[idx]>=0){
+        while (idx >= 0 && fathers[idx] >= 0) {
             String t = getTttle(nodes.get(fathers[idx]));
-            ls.add(0,t);
+            ls.add(0, t);
             idx = fathers[idx];
         }
         return ls;
     }
 
-    public void outTitles(){
+    public void outTitles() {
         int[] depthes = dt.getDepthes();
         List<Node> nodes = dt.getSource();
         for (int i = 0; i < depthes.length; i++) {
-            if (depthes[i]>= 0){
+            if (depthes[i] >= 0) {
                 String t = getTttle(nodes.get(i));
-                System.out.println("title="+t);
+                System.out.println("title=" + t);
             }
         }
     }
 
 
-    private int matchTitle(int idx, String[] titles,String[] headings){
+    private int matchTitle(int idx, String[] titles, String[] headings) {
         int[] father2 = dt.getFather(idx);
         boolean ok = true;
-        if (headings.length > father2.length){
+        if (headings.length > father2.length) {
             return -1;
         }
         for (int i = 0; i < headings.length; i++) {
-            if (!headings[i].equals(titles[father2[headings.length-i-1]])){
+            if (!headings[i].equals(titles[father2[headings.length - i - 1]])) {
                 ok = false;
                 break;
             }
         }
-        if (ok){
+        if (ok) {
             return father2[0];
         }
         return -1;
     }
 
     // 取出 该标题下的所有内容
-    private int[] getNodeContentRegion(int fIdx){
+    private int[] getNodeContentRegion(int fIdx) {
         int[] rets = new int[2];
         int[] fathers = dt.getFathers();
         int[] depthes = dt.getDepthes();
 
         int nIdx1 = fIdx;
-        int nIdx2 = fathers.length-1;
+        int nIdx2 = fathers.length - 1;
         if (fIdx >= 0) {
             for (int i = fIdx + 1; i < fathers.length; i++) {
                 if (depthes[i] == depthes[fIdx]) {
-                   break;
+                    break;
                 } else {
                     nIdx2 = i;
                 }
@@ -394,63 +399,71 @@ public class MdSearcher {
         return rets;
     }
 
-    protected Map<String,Object> doHeadingMatch(int anchor,String[] titles ,String[] headings){
-        Map<String,Object> rets = new HashMap<>();
-        if (headings.length == 1){
+    protected Map<String, Object> doHeadingMatch(int anchor, String[] titles, String[] headings) {
+        Map<String, Object> rets = new HashMap<>();
+        if (headings.length == 1) {
             int[] nodeContentRegion = getNodeContentRegion(anchor);
             String s2 = getMdSource(nodeContentRegion);
             //System.out.println(s2);
-            rets.put("match",s2);
-        }else {
-            String[] h2 = Arrays.copyOf(headings,headings.length-1);
-            int m2 = matchTitle(anchor, titles,h2);
+            rets.put(KEY_MATCH, s2);
+        } else {
+            String[] h2 = Arrays.copyOf(headings, headings.length - 1);
+            int m2 = matchTitle(anchor, titles, h2);
             //System.out.println("m2="+m2);
-            if (m2 <0){
-                return  null;
-            }else{
+            if (m2 < 0) {
+                return null;
+            } else {
                 int[] nodeContentRegion = getNodeContentRegion(anchor);
                 String s2 = getMdSource(nodeContentRegion);
                 //System.out.println(s2);
-                rets.put("match",s2);
+                rets.put(KEY_MATCH, s2);
             }
         }
         return rets;
     }
 
-    public Map<String,Object> searchByHeadings(String[] headings){
-        Map<String,Object> rets = null;
+    public Map<String, Object> searchByHeadings(String[] headings) {
+        Map<String, Object> rets = null;
+
+        // 如果 headings 是 "/" 表示 获取全文
+        if (headings.length == 1 && headings[0].equals("/")) {
+            String s = dt.getText();
+            rets = new HashMap<>();
+            rets.put(KEY_MATCH, s);
+            return rets;
+        }
 
         int[] depthes = dt.getDepthes();
         String[] titles = new String[depthes.length];
 
         List<Node> nodes = dt.getSource();
         for (int i = 0; i < depthes.length; i++) {
-            if (depthes[i]>= 0){
+            if (depthes[i] >= 0) {
                 String t = getTttle(nodes.get(i));
                 //System.out.println("title="+t);
                 titles[i] = t;
             }
         }
 
-        String lastHeading = headings[headings.length-1];
+        String lastHeading = headings[headings.length - 1];
         TIntArrayList mi = new TIntArrayList();
         for (int i = 0; i < titles.length; i++) {
-            if (titles[i]!= null && titles[i].equals(lastHeading)){
+            if (titles[i] != null && titles[i].equals(lastHeading)) {
                 mi.add(i);
             }
         }
 
         //String[] h2 = Arrays.copyOf(headings,headings.length-1);
 
-        if (mi.size() == 0){
+        if (mi.size() == 0) {
             return null;
-        }else if (mi.size() == 1){
-            rets = doHeadingMatch(mi.getQuick(0),titles,headings);
+        } else if (mi.size() == 1) {
+            rets = doHeadingMatch(mi.getQuick(0), titles, headings);
 //            if (headings.length == 1){
 //                int[] nodeContentRegion = getNodeContentRegion(mi.getQuick(0));
 //                String s2 = getMdSource(nodeContentRegion);
 //                //System.out.println(s2);
-//                rets.put("match",s2);
+//                rets.put(KEY_MATCH,s2);
 //            }else {
 //                int m2 = matchTitle(mi.getQuick(0), titles,h2);
 //                System.out.println("m2="+m2);
@@ -460,20 +473,208 @@ public class MdSearcher {
 //                    int[] nodeContentRegion = getNodeContentRegion(mi.getQuick(0));
 //                    String s2 = getMdSource(nodeContentRegion);
 //                    //System.out.println(s2);
-//                    rets.put("match",s2);
+//                    rets.put(KEY_MATCH,s2);
 //                }
 //            }
 
-        }else {
+        } else {
             for (int i = 0; i < mi.size(); i++) {
                 int nowL = mi.getQuick(i);
-                int[] father2 = dt.getFather(nowL);
+                //int[] father2 = dt.getFather(nowL);
+                Map<String, Object> rets1 = doHeadingMatch(nowL, titles, headings);
+                if (rets1 != null && rets1.get(KEY_MATCH) != null) {
+                    rets = rets1;
+                    break;
+                }
+            }
+        }
+
+        return rets;
+    }
+
+    private static String[] strip(String[] ss) {
+        List<String> ls = new ArrayList<>();
+        for (int i = 0; i < ss.length; i++) {
+            if (!ss[i].isEmpty()) {
+                ls.add(ss[i]);
+            }
+        }
+        return ls.toArray(new String[ls.size()]);
+    }
+
+    private static boolean isEqual(String s1, String s2) {
+        if (s2.startsWith(s1)) {
+            return true;
+        }
+        return false;
+    }
+
+    // 在 String[] ss2 中找到 ss1 的连续子序列(忽略 ss2 中的空字符串 ""),并返回其在 ss2 中的 起始位置 和 终止位置(索引)。
+    public static int[] findSubArrayRange(String[] ss1, String[] ss2, int offset) {
+        ss1 = strip(ss1);
+        for (int i = offset; i < ss2.length; i++) {
+            int idx1 = 0;
+            int idx2 = i;
+            int start = -1;
+            int end = -1;
+
+            while (idx2 < ss2.length && idx1 < ss1.length) {
+                if (ss2[idx2].isEmpty()) {
+                    idx2++; // 忽略空串
+                    continue;
+                }
+
+//                if (!ss2[idx2].equals(ss1[idx1])) {
+//                    break; // 匹配失败,从下一个 i 开始
+//                }
+
+                if (!isEqual(ss1[idx1], ss2[idx2])) {
+                    break;
+                }
 
+                if (start == -1) {
+                    start = idx2;
+                }
+                end = idx2;
+                idx1++;
+                idx2++;
+            }
+
+            if (idx1 == ss1.length) {
+                return new int[]{start, end}; // 返回实际在 ss2 中的起止索引
             }
         }
 
+        return null; // 没有匹配
+    }
+
+
+    class Line {
+        String text;
+        int ss2Index; // 原始在ss2中的索引
+
+        public Line(String text, int ss2Index) {
+            this.text = text;
+            this.ss2Index = ss2Index;
+        }
+    }
+
+    public int[] findSubArrayRangeWithLineSplit(String[] ss1, String[] ss2, int offset) {
+        // 构造逻辑行列表:每一行来自 ss2[i] 的第 j 段
+
+        ss1 = strip(ss1);
+
+        // 尝试在 flatList 中找到 ss1 匹配子序列
+        for (int i = offset; i <= flatList.size() - ss1.length; i++) {
+            boolean matched = true;
+            for (int j = 0; j < ss1.length; j++) {
+                if (!flatList.get(i + j).text.equals(ss1[j])) {
+                    matched = false;
+                    break;
+                }
+            }
+
+            if (matched) {
+                int start = flatList.get(i).ss2Index;
+                int end = flatList.get(i + ss1.length - 1).ss2Index;
+                return new int[]{start, end};
+            }
+        }
+
+        return null; // 未找到
+    }
+
+    // 找出 txt 在md文件中的位置
+    public Map<String, Object> searchByPlainTxt(String txt, int offsetMd) {
+        Map<String, Object> rets = new HashMap<>();
+        List<Node> nodes = dt.getSource();
+        if (txts == null) {
+            txts = new String[nodes.size()];
+            for (int i = 0; i < nodes.size(); i++) {
+                txts[i] = getText(nodes.get(i));
+                //System.out.println(txts[i]);
+            }
+
+            List<Line> fl = new ArrayList<>();
+            for (int i = 0; i < txts.length; i++) {
+                String s = txts[i];
+                if (s.isEmpty()) continue;
+
+                String[] lines = s.split("\n");
+                for (String line : lines) {
+                    if (!line.isEmpty()) {
+                        fl.add(new Line(line, i));
+                    }
+                }
+            }
+            flatList = fl;
+        }
+
+        int idx = 0;
+        for (int i = 0; i < txts.length; i++) {
+            Node node = nodes.get(i);
+            int endOffset = node.getEndOffset();
+            int startOffset = node.getStartOffset();
+            if (offsetMd < endOffset) {
+                idx = i;
+                break;
+            }
+        }
+
+        for (int i = idx; i < txts.length; i++) {
+            if (txts[i].length() > txt.length()) {
+                int i1 = txts[i].indexOf(txt);
+                if (i1 > 0) {
+                    rets.put(GDocConstants.START_OFFSET, nodes.get(i).getStartOffset());
+                    rets.put(GDocConstants.END_OFFSET, nodes.get(i).getEndOffset());
+                    rets.put("inLine", i1);
+                    return rets;
+                }
+            }
+        }
+
+        int fIdx = 0;
+        for (int i = 0; i < flatList.size(); i++) {
+            if (flatList.get(i).ss2Index == idx) {
+                fIdx = i;
+                break;
+            }
+        }
+
+        String[] ss = txt.split("\n");
+        //int[] range = findSubArrayRange(ss, txts, idx);
+        int[] range = findSubArrayRangeWithLineSplit(ss, txts, fIdx);
+
+
+        if (range != null) {
+            int startOffset = nodes.get(range[0]).getStartOffset();
+            int endOffset = nodes.get(range[1]).getEndOffset();
+            rets.put(GDocConstants.START_OFFSET, startOffset);
+            rets.put(GDocConstants.END_OFFSET, endOffset);
+        }
+
+//        if (txt.length() <= txts[idx].length() && offsetMd + txt.length() <= nodes.get(idx).getEndOffset()) {
+//            if (txts[idx].indexOf(txt) >= 0) {
+//                rets.put(GDocConstants.START_OFFSET, nodes.get(idx).getStartOffset());
+//                rets.put(GDocConstants.END_OFFSET, nodes.get(idx).getEndOffset());
+//            }
+//        } else {
+//            String[] ss = txt.split("\n");
+//            //int[] range = findSubArrayRange(ss, txts, idx);
+//            int[] range = findSubArrayRangeWithLineSplit(ss, txts, idx);
+//
+//
+//            if (range != null) {
+//                int startOffset = nodes.get(range[0]).getStartOffset();
+//                int endOffset = nodes.get(range[1]).getEndOffset();
+//                rets.put(GDocConstants.START_OFFSET, startOffset);
+//                rets.put(GDocConstants.END_OFFSET, endOffset);
+//            }
+//        }
         return rets;
     }
+
+
 //    public static void main(String[] args) throws IOException {
 //        String f = "D:\\data\\乙烯\\乙烯1.md";
 //        String content = Files.readString(Path.of(f));

+ 1 - 0
gtbook/src/main/java/org/cnnlp/data/splitter/BaseMdParser.java

@@ -599,6 +599,7 @@ public class BaseMdParser {
     public void parse(String md, BaseParameters params) {
         Map<String, Object> paramsMap = params.toMap();
         DocTree dt = MDHelper.parseToDocTree(md, paramsMap);
+        dt.setText(md);
         parse(dt, paramsMap);
     }
 

+ 3 - 1
gtbook/src/main/java/org/cnnlp/data/splitter/FaqMdSplitter.java

@@ -3,6 +3,7 @@ package org.cnnlp.data.splitter;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import org.cnnlp.data.book.GTBookConstants;
 import org.cnnlp.data.book.MDElement;
+import org.cnnlp.data.document.GDocConstants;
 import org.cnnlp.data.document.GDocument;
 import org.cnnlp.data.util.BaseParameters;
 
@@ -48,7 +49,8 @@ public class FaqMdSplitter extends BaseMdParser implements ISplitter {
                 String s = text.get(0);
                 builder.text(s);
 
-                builder.metadata("raw_content", value.getHtml().get(0));
+                //builder.metadata("raw_content", value.getHtml().get(0));
+                builder.metadata(GDocConstants.RAW_CONTENT, value.getHtml().get(0));
                 if (i >= 1 && nodes[i - 1].getType() == GTBookConstants.MD_COMMENTS) {
                     MDElement comments = (MDElement) nodes[i - 1].getValue();
                     List<String> html = comments.getHtml();

+ 24 - 0
gtbook/src/main/java/org/cnnlp/data/splitter/IMdChunking.java

@@ -0,0 +1,24 @@
+package org.cnnlp.data.splitter;
+
+import org.cnnlp.data.document.GDocument;
+import org.cnnlp.data.util.BaseParameters;
+
+import java.io.IOException;
+import java.util.List;
+
+public interface IMdChunking {
+    // simple/faq/tagged
+    //public static final String MD_TYPE = "mdType";
+
+    //输出的是 章节
+    List<GDocument> splitSimple(String text, BaseParameters params) throws IOException;
+
+    //输出的是 章节
+    List<GDocument> splitFaq(String text, BaseParameters params) throws IOException;
+
+    //输出的是 章节
+    List<GDocument> split(String text, BaseParameters params) throws IOException;
+
+    // 输出的是 chunk
+    List<GDocument> chunking(String text, BaseParameters params) throws IOException;
+}

+ 457 - 0
gtbook/src/main/java/org/cnnlp/data/splitter/MdChunking.java

@@ -0,0 +1,457 @@
+package org.cnnlp.data.splitter;
+
+//import org.cnnlp.data.book.GTBookUtil;
+
+import org.cnnlp.data.document.GDocConstants;
+import org.cnnlp.data.document.GDocument;
+import org.cnnlp.data.md.MdSearcher;
+import org.cnnlp.data.util.BaseParameters;
+import org.cnnlp.data.util.SenUtil;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class MdChunking implements IMdChunking {
+
+    // simple/faq/tagged
+    public static final String MD_TYPE = "mdType";
+    public static final String MD_TYPE_SIMPLE = "simple";
+    public static final String MD_TYPE_FAQ = "faq";
+    public static final String MD_TYPE_TAGGED = "tagged";
+
+    private static final String CHUNK_PARAGRAPH_IDX = GDocConstants.PARAGRAPH_CHUNK_IDX;
+
+    protected int defaultChunkSize = 512;
+    protected int defaultChunkOverlap = 64;
+    protected int defaultChunkTitles = 0;
+
+    protected boolean defaultHeadOverlap = false;
+
+    protected String separator = "\n";
+
+    public MdChunking() {
+
+    }
+
+//    private String getMdType(Map<String, Object> params) {
+//        String type = "simple";
+//        if (params != null) {
+//            Object o = params.get(MD_TYPE);
+//            if (o != null) {
+//                type = o.toString();
+//            }
+//        }
+//        return type;
+//    }
+
+    private String getMdType(BaseParameters params) {
+        String type = MD_TYPE_SIMPLE;
+        if (params != null) {
+            Object o = params.get(MD_TYPE);
+            if (o != null) {
+                type = o.toString();
+            }
+        }
+        return type;
+    }
+
+    @Override
+    public List<GDocument> splitSimple(String text, BaseParameters params) throws IOException {
+        //System.out.println("Process " + path.toString());
+        //String json = path.toString() + ".json";
+        SimpleMdSplitter splitter = new SimpleMdSplitter();
+        splitter.setCommentProcessor(KvCommentProcessor.build());
+
+        //BaseParameters params = BaseParameters.defaultParams();
+//        Path path = Paths.get(md);
+        //String baseName = SplitUtils.getFileBaseName(path);
+        //System.out.println("baseName=" + baseName);
+        List<GDocument> docs = splitter.split(text, params);
+        //SplitUtils.toJsonFile(new File(json), docs);
+        return docs;
+    }
+
+    @Override
+    public List<GDocument> splitFaq(String text, BaseParameters params) throws IOException {
+        //System.out.println("Process " + path.toString());
+        //String json = path.toString() + ".json";
+        FaqMdSplitter splitter = new FaqMdSplitter();
+        //BaseParameters params = BaseParameters.defaultParams();
+        List<GDocument> docs = splitter.split(text, params);
+        //SplitUtils.toJsonFile(new File(json), docs);
+        return docs;
+    }
+
+
+    @Override
+    public List<GDocument> split(String text, BaseParameters params) throws IOException {
+        String mdType = getMdType(params);
+        List<GDocument> chunks = null;
+        if (mdType.equals(MD_TYPE_FAQ)) {
+            chunks = splitFaq(text, params);
+        } else {
+            chunks = splitSimple(text, params);
+            // 1、chunk 2、
+            //System.out.println("Content: " + content);
+        }
+        return chunks;
+    }
+
+    @Override
+    public List<GDocument> chunking(String text, BaseParameters params) throws IOException {
+        String mdType = getMdType(params);
+        // 已标注的,直接按标注切分
+        if (mdType.equals(MD_TYPE_TAGGED)) {
+            TaggedMdSplitter splitter = new TaggedMdSplitter();
+            List<GDocument> docs = splitter.split(text, params);
+            return docs;
+        }
+        // 第一步,拆分成章节
+        List<GDocument> docs = split(text, params);
+        List<GDocument> chunks = new ArrayList<>();
+
+        int chunkSize = params.getIntParameter(ChunkConstants.CHUNK_SIZE, defaultChunkSize);
+        int chunkOverlap = params.getIntParameter(ChunkConstants.CHUNK_OVERLAP, defaultChunkOverlap);
+        int chunkTitles = params.getIntParameter(ChunkConstants.CHUNK_TITLES, defaultChunkTitles);
+
+        boolean headOverlap = params.getBooleanParameter(ChunkConstants.HEAD_OVERLAP, defaultHeadOverlap);
+
+        for (GDocument doc : docs) {
+            Map<String, Object> metadata = doc.getMetadata();
+            int gOffset = 0;
+            Object os = metadata.get(GDocConstants.START_OFFSET);
+            if (os != null) {
+                gOffset = (Integer) os;
+            }
+
+            String text1 = doc.getText();
+
+            if (text1.length() > chunkSize) {
+                //String[] ss = text1.split("\n");
+                //  第一步,章节拆分成chunk
+
+                List<SubSequence> ls1 = splitParagraph(text1, chunkSize, chunkOverlap);
+
+                // 找出 text纯文本 对应 md文本中的位置
+                String raw = null;
+                Object o = doc.getMetadata().get(GDocConstants.RAW_CONTENT);
+                if (o != null) {
+                    raw = (String) o;
+
+                    MdSearcher searcher1 = new MdSearcher();
+                    searcher1.load(raw, BaseParameters.defaultParams());
+                    int lastOffset = 0;
+                    int chunkIdx = 0;
+                    for (SubSequence s1 : ls1) {
+                        String text2 = s1.getText();
+                        Map<String, Object> metadata2 = new HashMap<>(metadata);
+                        int startOffet = (Integer) metadata2.get(GDocConstants.START_OFFSET);
+                        int endOffset = (Integer) metadata2.get(GDocConstants.END_OFFSET);
+
+                        Map<String, Object> r1 = searcher1.searchByPlainTxt(text2, lastOffset);
+                        //System.out.println(r1);
+                        if (r1 != null) {
+                            Object o1 = r1.get(GDocConstants.START_OFFSET);
+                            if (o1 != null) {
+                                Object o2 = r1.get(GDocConstants.END_OFFSET);
+                                startOffet = (Integer) o1;
+                                endOffset = (Integer) o2;
+                                String s = raw.substring(startOffet, endOffset);
+                                //System.out.println(text2);
+                                metadata2.put(GDocConstants.RAW_CONTENT, s);
+
+                                lastOffset = startOffet;
+                            }
+                        }
+                        chunkIdx++;
+
+                        // 构造chunk
+                        startOffet = startOffet + gOffset;
+                        endOffset = endOffset + gOffset;
+                        metadata2.put(GDocConstants.START_OFFSET, startOffet);
+                        metadata2.put(GDocConstants.END_OFFSET, endOffset);
+                        metadata2.put(GDocConstants.PARAGRAPH_CHUNK_IDX, chunkIdx);
+
+                        GDocument chunk1 = GDocument.builder().id(SplitUtils.ulid()).text(text2).metadata(metadata2).build();
+                        chunks.add(chunk1);
+                    }
+
+                } else {
+                    int idx = 1;
+
+                    for (SubSequence s1 : ls1) {
+                        // build GDocument
+                        Map<String, Object> metadata2 = new HashMap<>(metadata);
+                        s1.incrementalOffset(gOffset);
+                        metadata2.put(GDocConstants.START_OFFSET, s1.getStartOffset());
+                        metadata2.put(GDocConstants.END_OFFSET, s1.getEndOffset());
+                        metadata2.put(GDocConstants.PARAGRAPH_CHUNK_IDX, idx);
+                        GDocument chunk1 = GDocument.builder().id(SplitUtils.ulid()).text(s1.getText()).metadata(metadata2).build();
+                        chunks.add(chunk1);
+                        idx++;
+                    }
+                }
+            } else {
+                chunks.add(doc);
+            }
+        }
+        return chunks;
+    }
+
+
+    public List<SubSequence> splitParagraph(String text, int chunkSize, int chunkOverlap) {
+        // 先将段落按”\n“拆分
+        List<SubSequence> ls = SplitUtils.splitToIdxSubStr(text);
+        List<SubSequence> rets = new ArrayList<>();
+
+        int counterSize = 0;
+        StringBuilder sb = new StringBuilder();
+
+        int startIdx = 0;
+        for (int i = 0; i < ls.size(); i++) {
+            SubSequence line1 = ls.get(i);
+            String line = line1.getText();
+            if (line.length() > chunkSize) {
+                if (counterSize > 0) {
+                    //buildDocument
+                    //rets.add(sb.toString());
+                    int startOffset = ls.get(startIdx).getStartOffset();
+                    int endOffset = ls.get(i - 1).getEndOffset();
+                    SubSequence sub1 = SubSequence.build(text.substring(startOffset, endOffset), startOffset, endOffset);
+                    rets.add(sub1);
+
+                    counterSize = 0;
+                    startIdx = i;
+                    sb.setLength(0);
+                }
+
+                //TODO: 增加 offset
+                int offset = line1.getStartOffset();
+                List<SubSequence> ss = splitLine(line, chunkSize, chunkSize / 3, chunkOverlap);
+                for (int j = 0; j < ss.size(); j++) {
+                    //buildDocument
+                    SubSequence sub1 = ss.get(j);
+                    sub1.incrementalOffset(offset);
+                    rets.add(sub1);
+                }
+                counterSize = 0;
+                startIdx = i + 1;
+                sb.setLength(0);
+            } else {
+                if (counterSize + line.length() > chunkSize) {
+                    //buildDocument
+                    //rets.add(sb.toString());
+                    int startOffset = ls.get(startIdx).getStartOffset();
+                    int endOffset = ls.get(i - 1).getEndOffset();
+                    //if (endOffset-startOffset > chunkSize) {
+                    //    System.out.println(counterSize);
+                    //}
+                    SubSequence sub1 = SubSequence.build(text.substring(startOffset, endOffset), startOffset, endOffset);
+                    rets.add(sub1);
+
+                    //counterSize = 0;
+                    startIdx = i;
+                    sb.setLength(0);
+                    counterSize = line.length() + 1;
+                    sb.append(line).append(separator);
+                } else {
+                    counterSize += line.length() + 1;
+                    sb.append(line).append(separator);
+                }
+            }
+//            if (sb.length() > 0) {
+//                rets.add(sb.toString());
+//            }
+        }
+
+        if (sb.length() > 0) {
+            int startOffset = ls.get(startIdx).getStartOffset();
+            int endOffset = text.length();
+            SubSequence sub1 = SubSequence.build(text.substring(startOffset), startOffset, endOffset);
+            rets.add(sub1);
+            //rets.add(sb.toString());
+        }
+
+        return rets;
+    }
+
+    public static List<SubSequence> splitLine(String text, int maxLen, int range, int maxOverlap) {
+        List<SubSequence> ls = new ArrayList<>();
+        int num = text.length() / maxLen;
+        int lastPos = 0;
+        int i = 0;
+
+        num = num * 2;
+        while (i < num) {
+            //int nowpos = SenUtil.getSplitPos(text, maxLen*(i+1), 256);
+            int nowpos = SenUtil.getSplitPos(text, lastPos + maxLen, range);
+
+            if (nowpos > lastPos) {
+                String t1 = text.substring(lastPos, nowpos);
+                //ds = te.getOccurence(t1, ds,occurence);
+                SubSequence sub = SubSequence.build(t1, lastPos, nowpos);
+                ls.add(sub);
+
+                lastPos = nowpos;
+
+                if (lastPos >= text.length()) {
+                    break;
+                }
+
+                int overlap1 = SenUtil.getSplitPosFromLeft(text, lastPos, maxOverlap);
+                if (overlap1 < lastPos) {
+                    lastPos = overlap1;
+                }
+            }
+
+            i++;
+        }
+        return ls;
+    }
+
+
+//    public List<String> splitText(String[] ls, int chunkSize, int chunkOverlap) {
+//
+//        List<String> rets = new ArrayList<>();
+//
+//        int counterSize = 0;
+//        StringBuilder sb = new StringBuilder();
+//
+//        for (int i = 0; i < ls.length; i++) {
+//            String line = ls[i];
+//            if (line.length() > chunkSize) {
+//                if (counterSize > 0) {
+//                    //buildDocument
+//                    rets.add(sb.toString());
+//
+//                    counterSize = 0;
+//                    sb.setLength(0);
+//                }
+//                String[] ss = split(line, chunkSize, chunkSize / 3, chunkOverlap);
+//                for (int j = 0; j < ss.length; j++) {
+//                    //buildDocument
+//                    rets.add(ss[j]);
+//                }
+//                counterSize = 0;
+//                sb.setLength(0);
+//            } else {
+//                if (counterSize + line.length() > chunkSize) {
+//                    //buildDocument
+//                    rets.add(sb.toString());
+//
+//                    counterSize = 0;
+//                    sb.setLength(0);
+//                } else {
+//                    counterSize += line.length() + 1;
+//                    sb.append(line).append(separator);
+//                }
+//            }
+//            if (sb.length() > 0) {
+//                rets.add(sb.toString());
+//            }
+//        }
+//
+//        return rets;
+//    }
+
+//    public static String[] splitLine(String text,int maxLen,int range,int maxOverlap){
+//        List<String> ls = new ArrayList<>();
+//        int num = text.length()/maxLen;
+//        int lastPos = 0;
+//        int i=0;
+//
+//        num = num*2;
+//        while(i<num){
+//            //int nowpos = SenUtil.getSplitPos(text, maxLen*(i+1), 256);
+//            int nowpos = SenUtil.getSplitPos(text, lastPos+maxLen, range);
+//
+//            if (nowpos >lastPos){
+//                String t1 = text.substring(lastPos,nowpos);
+//                lastPos = nowpos;
+//                //ds = te.getOccurence(t1, ds,occurence);
+//                ls.add(t1);
+//                if (lastPos >= text.length()){
+//                    break;
+//                }
+//
+//                int overlap1 = SenUtil.getSplitPosFromLeft(text, lastPos, maxOverlap);
+//                if (overlap1 < lastPos){
+//                    lastPos = overlap1;
+//                }
+//            }
+//
+//            i++;
+//        }
+//        return ls.toArray(new String[0]);
+//    }
+
+
+//    public List<Document> splitting(String md, BaseParameters params) {
+//
+//        List<Document> docs = new ArrayList<>();
+//
+//        //Map<String, Object> paramsMap = params.toMap();
+//        parse(md, params);
+//        root.countingTextSize();
+//
+//        int totalSize = root.countingTextSize();
+//        chunkSize = params.getIntParameter(ChunkConstants.CHUNK_SIZE, defaultChunkSize);
+//        chunkOverlap = params.getIntParameter(ChunkConstants.CHUNK_OVERLAP, defaultChunkOverlap);
+//        chunkTitles = params.getIntParameter(ChunkConstants.CHUNK_TITLES, defaultChunkTitles);
+//
+//        boolean headOverlap = params.getBooleanParameter(ChunkConstants.HEAD_OVERLAP, defaultHeadOverlap);
+//
+//        if (totalSize <= chunkSize) {
+//            String content = GTBookUtil.listToString(root.getText(), "\n");
+//            Document doc = Document.builder().text(content).id(SplitUtils.ulid()).build();
+//            return List.of(doc);
+//        }
+//
+//        //System.out.println("size=" + root.getBlockSize());
+//        int[] depthes = docTree.getDepthes();
+//        int[] fathers = docTree.getFathers();
+//
+//        //List<String> outline = getOutline(true);
+//        //System.out.println(outline);
+//
+//        int docLen = depthes.length;
+//        int i = 0;
+//        while (i < docLen) {
+//            GTNode2 nd = nodes[i];
+//            //System.out.println(i + " id=" + nd.getId() + " " + nodes[i].getCharSize() + " = " + nodes[i].getBlockSize() + " level=" + depthes[i]);
+//            if (nd.getBlockSize() > chunkSize) {
+//                List<Document> docs1 = splitting(i);
+//                docs.addAll(docs1);
+//            } else {
+//                int lastId = nd.getLastId();
+//                int nowChunkSize = nd.getBlockSize();
+//                while (lastId < docLen) {
+//                    GTNode2 nd1 = nodes[lastId];
+//                    nowChunkSize = nowChunkSize + nd1.getBlockSize();
+//                    if (nowChunkSize > chunkSize) {
+//                        break;
+//                    }
+//                    lastId = nd1.getLastId();
+//                }
+//                i = lastId;
+//
+//            }
+//            i++;
+//            System.out.println("lastId=" + i);
+//        }
+//
+//        return docs;
+//    }
+
+
+    public static BaseParameters chunkParameters(int chunkSize, int chunkOverlap) {
+        BaseParameters params = new BaseParameters();
+        params.put(ChunkConstants.CHUNK_SIZE, chunkSize);
+        params.put(ChunkConstants.CHUNK_OVERLAP, chunkOverlap);
+        return params;
+    }
+
+}

+ 35 - 6
gtbook/src/main/java/org/cnnlp/data/splitter/SimpleMdSplitter.java

@@ -1,5 +1,6 @@
 package org.cnnlp.data.splitter;
 
+import com.vladsch.flexmark.util.ast.Node;
 import org.cnnlp.data.book.GTBookConstants;
 import org.cnnlp.data.book.IElement;
 import org.cnnlp.data.document.GDocConstants;
@@ -108,6 +109,7 @@ public class SimpleMdSplitter extends BaseMdParser implements ISplitter {
         return metadata;
     }
 
+    @Override
     public List<GDocument> split(String md, BaseParameters params) {
         this.params = params;
         List<GDocument> docs = new ArrayList<>();
@@ -140,18 +142,23 @@ public class SimpleMdSplitter extends BaseMdParser implements ISplitter {
 
                 if (textLen > MIN_PARAGRAPH_LEN) {
                     List<String> fatherLabels = getFatherLabels(nowJ);
-                    List<String> htmls = getHtmls(nowJ, nextJ);
+                    //List<String> htmls = getHtmls(nowJ, nextJ);
                     List<String> texts = getTexts(nowJ, nextJ);
 
                     GDocument.Builder builder = GDocument.builder().id(idGen.generateId()).text(SplitUtils.listToString(texts));
                     builder.metadata(GDocConstants.CURRENT_SECTION_TOC, fatherLabels);
-                    builder.metadata(GDocConstants.RAW_CONTENT, SplitUtils.listToString(htmls));
+                    //builder.metadata(GDocConstants.RAW_CONTENT, SplitUtils.listToString(htmls));
                     builder.metadata(GDocConstants.FIRST_HEADING, nd.getLabel());
 
+                    getCharOffset(builder,nowJ, nextJ);
                     Map<String, Object> metadata = getMetadata(nowJ, nextJ);
                     metadata.forEach((k, v) -> builder.metadata(k, v));
 
-                    docs.add(builder.build());
+                    GDocument doc = builder.build();
+                    docs.add(doc);
+
+                    //Object o = doc.getMetadata().get(GDocConstants.RAW_CONTENT);
+                    //System.out.println(o);
                     nowJ = nextJ;
                 }
                 i = nextJ;
@@ -162,6 +169,25 @@ public class SimpleMdSplitter extends BaseMdParser implements ISplitter {
         return docs;
     }
 
    /**
     * Records the character span of nodes [startIndex, endIndex) within the
     * original markdown source — plus the raw markdown slice itself — as
     * metadata (RAW_CONTENT, START_OFFSET, END_OFFSET) on the builder.
     *
     * The span ends where the node at endIndex begins, or at the end of the
     * whole source when endIndex is past the last node.
     */
    protected void getCharOffset(GDocument.Builder builder,int startIndex, int endIndex){
        // Full markdown source; all offsets below index into this string.
        String text = docTree.getText();
        Node mdNode = nodes[startIndex].getMdNode();
        int startOffset = mdNode.getStartOffset();
        int endOffset = mdNode.getEndOffset();
        if (endIndex < nodes.length) {
            // End the span where the next section's first node begins, so the
            // slice also covers trailing content of the current section.
            mdNode = nodes[endIndex].getMdNode();
            endOffset = mdNode.getStartOffset();
        }else {
            // Last section: extend to the end of the source.
            endOffset = text.length();
        }

        String s = text.substring(startOffset, endOffset);
        //System.out.println("s="+s);
        builder.metadata(GDocConstants.RAW_CONTENT,s);
        builder.metadata(GDocConstants.START_OFFSET, startOffset);
        builder.metadata(GDocConstants.END_OFFSET, endOffset);
    }
+
     private int getNodeTextLength(int index) {
         int len = ((IElement) nodes[index].getValue()).getText().get(0).length();
         return len;
@@ -178,17 +204,19 @@ public class SimpleMdSplitter extends BaseMdParser implements ISplitter {
     protected List<String> getHtmls(int startIndex, int endIndex) {
         List<String> htmls = new ArrayList<>();
         for (int i = startIndex; i < endIndex; i++) {
-            List<String> html = ((IElement) nodes[i].getValue()).getHtml();
+            IElement value = (IElement) nodes[i].getValue();
+            List<String> html = value.getHtml();
             htmls.addAll(html);
         }
+
         return htmls;
     }
 
     protected List<String> getTexts(int startIndex, int endIndex) {
         List<String> texts = new ArrayList<>();
         for (int i = startIndex; i < endIndex; i++) {
-            List<String> html = ((IElement) nodes[i].getValue()).getText();
-            texts.addAll(html);
+            List<String> txt = ((IElement) nodes[i].getValue()).getText();
+            texts.addAll(txt);
         }
         return texts;
     }
@@ -216,6 +244,7 @@ public class SimpleMdSplitter extends BaseMdParser implements ISplitter {
 
     }
 
+    @Override
     public List<GDocument> split(Path path, BaseParameters params) throws IOException {
         preProcess(path, params);
         String md = Files.readString(path);

+ 38 - 2
gtbook/src/main/java/org/cnnlp/data/splitter/SplitUtils.java

@@ -15,8 +15,6 @@ import java.util.*;
 
 public class SplitUtils {
 
-
-
     public static List<Node> getChildren(Node parent) {
         List<Node> nds = new ArrayList<Node>();
         if (parent == null) {
@@ -144,6 +142,13 @@ public class SplitUtils {
         return rets;
     }
 
+    public static String getBaseName(String filename){
+        String nameWithExt = new File(filename).getName();
+        // 去掉扩展名
+        int dotIndex = nameWithExt.lastIndexOf('.');
+        String baseName = (dotIndex == -1) ? nameWithExt : nameWithExt.substring(0, dotIndex);
+        return baseName;
+    }
 
     public static String getFileBaseName(Path path){
         String fileName = path.getFileName().toString();
@@ -162,5 +167,36 @@ public class SplitUtils {
         return ulid.toLowerCase();
     }
 
+//    public record IdxSubStr(String text,int startOffset,int endOffset) {
+//
+//    }
+
+
+    public static List<SubSequence> splitToIdxSubStr(String text) {
+        List<SubSequence> result = new ArrayList<>();
+        int len = text.length();
+        int lineStart = 0;
+
+        int lineEnd = 0;
+
+        for (int i = 0; i < len; i++) {
+            char ch = text.charAt(i);
+            if (ch == '\n') {
+                lineEnd = i+1;
+                String line = text.substring(lineStart, lineEnd);
+                result.add(new SubSequence(line, lineStart, lineEnd));
+                lineStart = i + 1;
+            }
+        }
+
+        // Add last line if it doesn't end with \n
+        if (lineStart < len) {
+            lineEnd = len;
+            result.add(new SubSequence(text.substring(lineStart), lineStart, lineEnd));
+        }
+
+        return result;
+    }
+
 
 }

+ 84 - 0
gtbook/src/main/java/org/cnnlp/data/splitter/SubSequence.java

@@ -0,0 +1,84 @@
+package org.cnnlp.data.splitter;
+
+import org.jetbrains.annotations.NotNull;
+
+import java.io.Serializable;
+import java.util.Objects;
+
/**
 * A substring together with its half-open [startOffset, endOffset) character
 * span inside the source text it was taken from.
 *
 * <p>Mutable bean with a no-arg constructor and setters so it can be
 * (de)serialized by JSON mappers as well as Java serialization.
 */
public class SubSequence implements Serializable {

    // Fix: Serializable without an explicit serialVersionUID breaks
    // deserialization across recompiles; pin it.
    private static final long serialVersionUID = 1L;

    String text;
    int startOffset;
    int endOffset;

    public SubSequence() {
    }

    public SubSequence(String text, int startOffset, int endOffset) {
        this.text = text;
        this.startOffset = startOffset;
        this.endOffset = endOffset;
    }

    public String getText() {
        return text;
    }

    public void setText(String text) {
        this.text = text;
    }

    public int getStartOffset() {
        return startOffset;
    }

    public void setStartOffset(int startOffset) {
        this.startOffset = startOffset;
    }

    public int getEndOffset() {
        return endOffset;
    }

    public void setEndOffset(int endOffset) {
        this.endOffset = endOffset;
    }

    /**
     * Shifts both offsets by {@code offset}; used to translate a span from
     * local (e.g. line- or section-relative) to document coordinates.
     */
    public void incrementalOffset(int offset) {
        startOffset = startOffset + offset;
        endOffset = endOffset + offset;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        SubSequence that = (SubSequence) o;
        return startOffset == that.startOffset && endOffset == that.endOffset && Objects.equals(text, that.text);
    }

    @Override
    public int hashCode() {
        return Objects.hash(text, startOffset, endOffset);
    }

    @Override
    public String toString() {
        return "SubSequence{" +
                "text='" + text + '\'' +
                ", startOffset=" + startOffset +
                ", endOffset=" + endOffset +
                '}';
    }

    /** Static factory mirroring the three-argument constructor. */
    public static SubSequence build(String text, int startOffset, int endOffset) {
        return new SubSequence(text, startOffset, endOffset);
    }
}

+ 198 - 0
gtbook/src/main/java/org/cnnlp/data/splitter/TaggedMdSplitter.java

@@ -0,0 +1,198 @@
+package org.cnnlp.data.splitter;
+
+import com.vladsch.flexmark.ast.Paragraph;
+import com.vladsch.flexmark.ext.obs.comments.Comments;
+import com.vladsch.flexmark.util.ast.Node;
+import com.vladsch.flexmark.util.sequence.BasedSequence;
+import org.cnnlp.data.book.GTBookConstants;
+import org.cnnlp.data.book.MDElement;
+import org.cnnlp.data.document.GDocConstants;
+import org.cnnlp.data.document.GDocument;
+import org.cnnlp.data.util.BaseParameters;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.io.UncheckedIOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/* sample.md   (以 %%--%% 来分块)
+---
+ k1:v1
+---
+
+# Title
+
+Some intro text.
+
+%%--%%
+
+<!--
+k2:v2
+-->
+## Section 1
+
+Content of section 1.
+
+%%--%%
+
+## Section 2
+
+More content.
+ */
+
+public class TaggedMdSplitter extends SimpleMdSplitter {
+
+    //public static final String DEFAULT_SPLITTER = "%%--%%";
+    public static final String DEFAULT_SPLITTER = "--";
+    protected String splitter = DEFAULT_SPLITTER;
+
+    public TaggedMdSplitter() {
+        super();
+        setCommentProcessor(KvCommentProcessor.build());
+    }
+
+    public String getSplitter() {
+        return splitter;
+    }
+
+    public void setSplitter(String splitter) {
+        this.splitter = splitter;
+    }
+
+
+    public static List<SubSequence> splitMarkdownWithOffsets(String markdown) {
+        List<SubSequence> result = new ArrayList<>();
+        int offset = 0;
+        int chunkStart = 0;
+        StringBuilder current = new StringBuilder();
+
+        try (BufferedReader reader = new BufferedReader(new StringReader(markdown))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                int lineLen = line.length() + 1; // +1 for '\n'
+                if (line.strip().equals("%%--%%")) {
+                    int end = offset - 1; // exclude the newline
+                    if (!current.isEmpty()) {
+                        result.add(new SubSequence(current.toString().strip(), chunkStart, end));
+                    }
+                    current.setLength(0);
+                    chunkStart = offset + lineLen;
+                } else {
+                    current.append(line).append("\n");
+                }
+                offset += lineLen;
+            }
+
+            if (!current.isEmpty()) {
+                result.add(new SubSequence(current.toString().strip(), chunkStart, offset));
+            }
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+
+        return result;
+    }
+
+
+//    @Override
+//    public List<GDocument> split(String md, BaseParameters params) {
+//        this.params = params;
+//        List<GDocument> docs = new ArrayList<>();
+//        //parse(md, params);
+//        List<SubSequence> ss = splitMarkdownWithOffsets(md);
+//        if (ss.size() <= 0) {
+//            return docs;
+//        }
+//
+//        SubSequence s0 = ss.get(0);
+//
+//
+//        for (SubSequence s : ss) {
+//            String text = s.getText();
+//
+//        }
+//        return docs;
+//    }
+//
+
+    protected boolean isChunkSplitter(Node node) {
+        if (node instanceof Paragraph) {
+            Paragraph p1 = (Paragraph) node;
+            Node firstChild = p1.getFirstChild();
+            if (firstChild instanceof Comments) {
+                BasedSequence text = ((Comments) firstChild).getText();
+                String s = text.toString();
+                //System.out.println(s);
+                if (s.equals(splitter)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    @Override
+    public List<GDocument> split(String md, BaseParameters params) {
+        this.params = params;
+        List<GDocument> docs = new ArrayList<>();
+        parse(md, params);
+        root.countingTextSize();
+
+        int totalSize = root.countingTextSize();
+        int[] depthes = docTree.getDepthes();
+        int[] fathers = docTree.getFathers();
+
+        //List<String> outline = getOutline(true);
+        //System.out.println(outline);
+        int docLen = depthes.length;
+        int i = 0;
+        int nowJ = 0;
+
+        while (i < docLen) {
+            GTNode2 nd = nodes[i];
+
+            int nextJ = docLen;
+            for (int j = i + 1; j < docLen; j++) {
+                if (isChunkSplitter(nodes[j].getMdNode())) {
+                    // 将chunk分隔符的value 置""
+                    nodes[j].setValue(new MDElement());
+                    nextJ = j;
+                    break;
+                }
+            }
+            List<String> fatherLabels = null;
+            String heading = null;
+            if (nextJ < docLen) {
+                fatherLabels = getFatherLabels(nextJ);
+            } else {
+                fatherLabels = getFatherLabels(nextJ - 1);
+            }
+
+            if (nodes[nextJ - 1].getType() == GTBookConstants.MD_HEADING) {
+                heading = nodes[nextJ - 1].getLabel();
+            }
+
+            //List<String> htmls = getHtmls(nowJ, nextJ);
+            List<String> texts = getTexts(nowJ, nextJ);
+
+            GDocument.Builder builder = GDocument.builder().id(idGen.generateId()).text(SplitUtils.listToString(texts));
+            builder.metadata(GDocConstants.CURRENT_SECTION_TOC, fatherLabels);
+            if (heading != null) {
+                builder.metadata(GDocConstants.FIRST_HEADING, nd.getLabel());
+            }
+            getCharOffset(builder, nowJ, nextJ);
+            Map<String, Object> metadata = getMetadata(nowJ, nextJ);
+            metadata.forEach((k, v) -> builder.metadata(k, v));
+
+            GDocument doc = builder.build();
+            docs.add(doc);
+
+            nowJ = nextJ+1;
+            i = nextJ;
+        }
+        return docs;
+    }
+}

+ 3 - 2
gtbook/src/main/java/org/cnnlp/data/splitter/ChunkMdParser.java → gtbook/src/main/java/org/cnnlp/data/splitter/model/ChunkMdParser.java

@@ -1,8 +1,9 @@
-package org.cnnlp.data.splitter;
+package org.cnnlp.data.splitter.model;
 
 import org.cnnlp.data.book.IElement;
+import org.cnnlp.data.splitter.BaseMdParser;
 
-import static org.cnnlp.data.splitter.ChunkSample.ChunkEvents;
+import static org.cnnlp.data.splitter.model.ChunkSample.ChunkEvents;
 
 import java.util.ArrayList;
 import java.util.List;

+ 2 - 1
gtbook/src/main/java/org/cnnlp/data/splitter/ChunkSample.java → gtbook/src/main/java/org/cnnlp/data/splitter/model/ChunkSample.java

@@ -1,7 +1,8 @@
-package org.cnnlp.data.splitter;
+package org.cnnlp.data.splitter.model;
 
 import opennlp.tools.ml.model.Event;
 import org.cnnlp.data.book.IElement;
+import org.cnnlp.data.splitter.GTNode2;
 
 import java.io.Serializable;
 import java.util.*;

+ 1 - 1
gtbook/src/main/java/org/cnnlp/data/splitter/MdChunkModel.java → gtbook/src/main/java/org/cnnlp/data/splitter/model/MdChunkModel.java

@@ -1,4 +1,4 @@
-package org.cnnlp.data.splitter;
+package org.cnnlp.data.splitter.model;
 
 import java.io.IOException;
 import java.nio.file.Paths;

+ 2 - 3
gtbook/src/main/java/org/cnnlp/data/splitter/MdChunkTrainer.java → gtbook/src/main/java/org/cnnlp/data/splitter/model/MdChunkTrainer.java

@@ -1,6 +1,5 @@
-package org.cnnlp.data.splitter;
+package org.cnnlp.data.splitter.model;
 
-import com.vladsch.flexmark.util.ast.Node;
 import opennlp.tools.ml.AbstractTrainer;
 import opennlp.tools.ml.model.DataIndexer;
 import opennlp.tools.ml.model.Event;
@@ -16,7 +15,7 @@ import org.cnnlp.data.util.BaseParameters;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import static org.cnnlp.data.splitter.ChunkSample.ChunkEvents;
+import static org.cnnlp.data.splitter.model.ChunkSample.ChunkEvents;
 
 import java.io.IOException;
 import java.nio.file.DirectoryStream;

+ 1 - 1
gtbook/src/main/java/org/cnnlp/data/splitter/TextUnitMlModel.java → gtbook/src/main/java/org/cnnlp/data/splitter/model/TextUnitMlModel.java

@@ -1,4 +1,4 @@
-package org.cnnlp.data.splitter;
+package org.cnnlp.data.splitter.model;
 
 import opennlp.tools.svm.OneClassModel;
 import org.cnnlp.data.util.ISeriableObject;

+ 1 - 1
gtbook/src/main/java/org/cnnlp/data/splitter/TextUtitPatternModel.java → gtbook/src/main/java/org/cnnlp/data/splitter/model/TextUtitPatternModel.java

@@ -1,4 +1,4 @@
-package org.cnnlp.data.splitter;
+package org.cnnlp.data.splitter.model;
 
 public class TextUtitPatternModel {
 

+ 3 - 3
gtbook/src/main/java/org/cnnlp/data/splitter/MdChunkSplitter.java → gtbook/src/test/java/org/cnnlp/data/splitter/MdChunkSplitter1.java

@@ -17,7 +17,7 @@ import java.util.List;
 import java.util.Map;
 
 @Slf4j
-public class MdChunkSplitter extends BaseMdParser implements DocumentTransformer {
+public class MdChunkSplitter1 extends BaseMdParser implements DocumentTransformer {
 
 
     protected int defaultChunkSize = 512;
@@ -41,7 +41,7 @@ public class MdChunkSplitter extends BaseMdParser implements DocumentTransformer
 
 
 
-    public MdChunkSplitter() {
+    public MdChunkSplitter1() {
     }
 
     public void split(Path path, BaseParameters params) throws IOException {
@@ -258,7 +258,7 @@ public class MdChunkSplitter extends BaseMdParser implements DocumentTransformer
 
     public static void main(String[] args) throws IOException {
         String md = "D:\\testdata\\md\\官网银行卡知识.1.md";
-        MdChunkSplitter splitter = new MdChunkSplitter();
+        MdChunkSplitter1 splitter = new MdChunkSplitter1();
         BaseParameters params = BaseParameters.defaultParams();
         splitter.split(Paths.get(md), params);
 

+ 37 - 0
gtbook/src/test/java/org/cnnlp/data/splitter/MdChunkingTest.java

@@ -0,0 +1,37 @@
+package org.cnnlp.data.splitter;
+
+import org.cnnlp.data.document.GDocConstants;
+import org.cnnlp.data.document.GDocument;
+import org.cnnlp.data.util.BaseParameters;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.Map;
+
+public class MdChunkingTest {
+    public static void main(String[] args) throws IOException {
+        MdChunking chunking = new MdChunking();
+        String p = "D:\\data\\乙烯\\规章制度\\中国石油四川石化有限责任公司原油采购管理办法.md";
+        String s = Files.readString(Path.of(p), Charset.forName("utf-8"));
+        //chunking.splitSimple(s, BaseParameters.defaultParams());
+        p = "D:\\data\\乙烯\\target\\广西操规与操作卡\\20万吨聚丙烯联合装置\\20万吨聚丙烯联合装置操作规程.md";
+        List<GDocument> chunks = chunking.chunking(s, BaseParameters.defaultParams());
+        for (GDocument doc : chunks) {
+            //System.out.println(doc);
+            Map<String, Object> metadata = doc.getMetadata();
+            int startOffet = (Integer) metadata.get(GDocConstants.START_OFFSET);
+            int endOffset = (Integer) metadata.get(GDocConstants.END_OFFSET);
+            String raw1 = metadata.get(GDocConstants.RAW_CONTENT).toString();
+            String raw2 = s.substring(startOffet, endOffset);
+            if (!raw1.equals(raw2)) {
+                //System.out.println(raw1 + " != " + raw2);
+                System.out.println(raw1);
+                System.out.println("======");
+            }
+        }
+    }
+}

+ 1 - 0
gtbook/src/test/java/org/cnnlp/data/splitter/SplitTest.java

@@ -4,6 +4,7 @@ import com.vladsch.flexmark.util.ast.Node;
 import org.cnnlp.data.book.GTBookUtil;
 import org.cnnlp.data.md.DocTree;
 import org.cnnlp.data.md.MDHelper;
+import org.cnnlp.data.splitter.model.ChunkMdParser;
 
 import java.io.IOException;
 import java.nio.charset.Charset;

+ 4 - 4
gtbook/src/test/java/org/cnnlp/data/splitter/SplitTools.java

@@ -84,11 +84,11 @@ public class SplitTools {
 //        String p = "D:\\data\\乙烯\\target";
 //        tools.splitSimpleDir(Paths.get(p));
 
-//        String p = "D:\\data\\乙烯\\规章制度\\中国石油四川石化有限责任公司原油采购管理办法.md";
-//        tools.splitSimple(Paths.get(p));
+        String p = "D:\\data\\乙烯\\规章制度\\中国石油四川石化有限责任公司原油采购管理办法.md";
+        tools.splitSimple(Paths.get(p));
 
-        String dir = "D:\\data\\乙烯\\0624\\安全技术说明书完善\\";
-        tools.splitSimpleDir(Paths.get(dir));
+//        String dir = "D:\\data\\乙烯\\0624\\安全技术说明书完善\\";
+//        tools.splitSimpleDir(Paths.get(dir));
 
 //        String p = "D:\\data\\乙烯\\2\\";
 //        Stream<Path> fl = Files.list(Path.of(p));

+ 34 - 0
gtbook/src/test/java/org/cnnlp/data/splitter/SplitUtilsTest.java

@@ -0,0 +1,34 @@
+package org.cnnlp.data.splitter;
+
+import java.util.List;
+
+public class SplitUtilsTest {
+
+    public static void main(String[] args) {
+        String s ="结焦机理:烃类裂解过程中除生成各种烃类产物外,同时有少量炭生成。这种炭是数百个碳原子稠和形成的炭,其中尚含少量氢。通常,将这种炭称之为焦,管式裂解炉中裂解生成的焦结聚于管壁的过程,称为结焦。\n" +
+                "\n" +
+                "生碳反应须通过生成乙炔的中间阶段,温度在900~1173℃以上,而生焦反应是通过生成芳烃以致稠环芳烃的中间阶段,反应温度在500~600℃以上就能开始。通常加入水蒸汽作为稀释剂,以降低烃的分压,达到减少结焦的目的。\n" +
+                "\n" +
+                "无论裂解原料是单一烃还是混和烃,裂解气的组成都比较复杂。\n" +
+                "\n" +
+                "裂解反应为自由基反应,其大致过程如下(以乙烷裂解为例):\n" +
+                "\n" +
+                "第一步乙烷裂解为两个甲基自由基:\n" +
+                "\n" +
+                "C2H6 → CH30•+CH30•\n" +
+                "\n" +
+                "一个甲基与另一乙烷反应生成甲烷和乙基自由基:\n" +
+                "\n" +
+                "CH30•+ C2H6 → CH4+C2H50•\n" +
+                "\n" +
+                "乙基自由基分解成乙烯和氢自由基:\n" +
+                "\n" +
+                "C2H50• → C2H4+H0•\n" +
+                "\n" +
+                "氢自由基再与乙烷反应生成氢(分子)和乙基自由基:";
+        List<SubSequence> ss = SplitUtils.splitToIdxSubStr(s);
+        for (SubSequence ssi : ss) {
+            System.out.println(ssi);
+        }
+    }
+}

+ 28 - 0
gtbook/src/test/java/org/cnnlp/data/splitter/TaggedMdSplitterTest.java

@@ -0,0 +1,28 @@
+package org.cnnlp.data.splitter;
+
+import org.cnnlp.data.document.GDocument;
+import org.cnnlp.data.util.BaseParameters;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+public class TaggedMdSplitterTest {
+
+    public static void main(String[] args) throws IOException {
+        String p = "D:\\mytest\\md\\demo1.md";
+        String s = Files.readString(Path.of(p), Charset.forName("utf-8"));
+        TaggedMdSplitter tms = new TaggedMdSplitter();
+        //List<SubSequence> subs = tms.splitMarkdownWithOffsets(s);
+        //subs.forEach(System.out::println);
+        List<GDocument> ss = tms.split(s, BaseParameters.defaultParams());
+        for(GDocument gd : ss){
+            System.out.println(gd);
+            System.out.println("------");
+        }
+        System.out.println(ss.size());
+
+    }
+}

+ 1 - 1
gtbook/src/test/java/org/cnnlp/data/splitter/MdChunkTrainerTest.java → gtbook/src/test/java/org/cnnlp/data/splitter/model/MdChunkTrainerTest.java

@@ -1,4 +1,4 @@
-package org.cnnlp.data.splitter;
+package org.cnnlp.data.splitter.model;
 
 import opennlp.tools.ml.model.DataIndexer;
 import opennlp.tools.ml.model.Event;

+ 34 - 1
server/pom.xml

@@ -9,7 +9,7 @@
         <version>1.0.0</version>
     </parent>
 
-    <version>1.2.0</version>
+    <version>1.2.1</version>
     <artifactId>mdserver</artifactId>
 
     <properties>
@@ -17,6 +17,7 @@
         <maven.compiler.target>17</maven.compiler.target>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
         <spring.boot.version>3.5.0</spring.boot.version>
+        <spring-ai.version>1.0.1</spring-ai.version>
     </properties>
 
 
@@ -39,6 +40,26 @@
             <version>${spring.boot.version}</version>
         </dependency>
 
+        <dependency>
+            <groupId>org.springframework.ai</groupId>
+            <artifactId>spring-ai-advisors-vector-store</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>org.springframework.ai</groupId>
+            <artifactId>spring-ai-starter-model-chat-memory</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>org.springframework.ai</groupId>
+            <artifactId>spring-ai-starter-model-openai</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>org.springframework.ai</groupId>
+            <artifactId>spring-ai-starter-model-deepseek</artifactId>
+        </dependency>
+
 
         <dependency>
             <groupId>org.postgresql</groupId>
@@ -171,6 +192,18 @@
 
     </dependencies>
 
+    <dependencyManagement>
+        <dependencies>
+            <dependency>
+                <groupId>org.springframework.ai</groupId>
+                <artifactId>spring-ai-bom</artifactId>
+                <version>${spring-ai.version}</version>
+                <type>pom</type>
+                <scope>import</scope>
+            </dependency>
+        </dependencies>
+    </dependencyManagement>
+
     <build>
         <plugins>
             <plugin>

+ 7 - 3
server/src/main/java/com/giantan/data/mds/MdsApplication.java

@@ -1,6 +1,7 @@
 package com.giantan.data.mds;
 
 
+import com.giantan.data.mds.task.impl.BaseTaskHandler;
 import org.springframework.boot.SpringApplication;
 import org.springframework.boot.autoconfigure.SpringBootApplication;
 import org.springframework.context.annotation.ComponentScan;
@@ -10,9 +11,12 @@ import org.springframework.scheduling.annotation.EnableAsync;
 @EnableAsync
 //@ComponentScan(basePackages = "com.giantan")
 public class MdsApplication {
+    private static final org.slf4j.Logger log
+            = org.slf4j.LoggerFactory.getLogger(MdsApplication.class);
 
-	public static void main(String[] args) {
-		SpringApplication.run(MdsApplication.class, args);
-	}
+    public static void main(String[] args) {
+        SpringApplication.run(MdsApplication.class, args);
+        log.info("Mds server started. Version 1.2.1.1");
+    }
 
 }

+ 186 - 0
server/src/main/java/com/giantan/data/mds/bot/ExtractPrompts.java

@@ -0,0 +1,186 @@
+package com.giantan.data.mds.bot;
+
+import org.springframework.ai.chat.prompt.PromptTemplate;
+import org.springframework.ai.template.st.StTemplateRenderer;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class ExtractPrompts {
+
+//    CHUNK_SIZE = 1024
+//    EMBED_DIMENSION = 1024
+//
+//            # MONGO_URI = "mongodb://mongoadmin:123456@localhost:27017/"
+//            # MILVUS_URI = "http://localhost:19530"
+
+    String TITLE_NODE_TEMPLATE = """
+            <context>
+            {context_str}.
+            </context>
+            
+            Give a title that summarizes all of the unique entities, titles or themes found in the context. The language of the title must be consistent with the language inside <context>.
+            
+            Generate the Title in Chinese.
+            Title: """;
+
+
+    String TITLE_COMBINE_TEMPLATE = """
+            <context>
+            {context_str}.
+            </context>
+            
+            Based on the above candidate titles and content, what is the comprehensive title for this document? 
+            Generate the Title in Chinese.
+            Title: """;
+
+    String KEYWORD_EXTRACT_TEMPLATE = """
+            <context>
+            {context_str}.
+            </context>
+            
+            Give {keywords} unique keywords for this document. Format as comma separated. 
+            Generate the Keywords in Chinese.
+            Keywords: """;
+
+    String QUESTION_GEN_TMPL = """
+            <context>
+            {context_str}.
+            </context>
+            
+            Given the contextual information, generate {num_questions} questions this context can provide specific answers to which are unlikely to be found elsewhere.
+            
+            Higher-level summaries of surrounding context may be provided as well. Try using these summaries to generate better questions that this context can answer.
+            
+            Generate the questions in Chinese.
+            Questions: """;
+
+
+    static String KEYWORD_QUESTION_GEN_TMPL_EN = """
+            <metadata>
+            {metadata_str}
+            </metadata>
+            
+            <context>
+            {context_str}
+            </context>
+            
+            Given the contextual information (<context>) and its supplementary metadata (<metadata>), perform the following:
+            
+            ## Step 1 — Generate {num_questions} unique, context-specific questions in Chinese:
+            - Use <context> as the primary source of information for all questions.
+            - Incorporate metadata **only** to improve precision or remove ambiguity, e.g., specifying the organization, location, or date **when it clearly relates to the context**.
+            - Do not create standalone questions based solely on metadata (e.g., "发布日期是什么?" is invalid if not discussed in context).
+            - Ensure questions are concise, specific, and answerable from the context.
+            
+            ## Step 2 — Extract {num_keywords} keywords or short key phrases in Chinese:
+            - Select the most relevant and representative terms from <context>.
+            - Metadata terms are only included if they directly relate to the main topic in <context>.
+            - Keep keywords concise (1–5 characters/words), avoid duplicates, and prioritize domain-specific terminology.
+            
+            Output the result in JSON format:
+            
+            {
+              "questions": [
+                "问题1",
+                "问题2",
+                ...
+              ],
+              "keywords": [
+                "关键词1",
+                "关键词2",
+                ...
+              ]
+            }
+            """;
+
+    static String KEYWORD_QUESTION_GEN_TMPL_ZH = """
+            <metadata>
+            {metadata_str}
+            </metadata>
+            
+            <context>
+            {context_str}
+            </context>
+            
+            基于以上提供的正文内容(<context>)及其补充元数据(<metadata>),请完成以下任务:
+            
+            ## 步骤 1 — 生成 {num_questions} 个独特且与上下文高度相关的问题(中文):
+            - 以 <context> 为主要信息来源来生成所有问题。
+            - 元数据(metadata)仅用于提升问题的准确性或减少歧义,但必须与正文内容直接相关。
+            - 不要生成仅依赖元数据的独立问题。
+            - 问题应简洁、具体,且可以从正文中直接得到答案。
+            
+            ## 步骤 2 — 抽取 {num_keywords} 个中文关键词或简短关键短语:
+            - 从 <context> 中选取最相关、最具代表性的术语。
+            - 仅在元数据中的信息与正文主题直接相关时,才可将其纳入关键词。
+            - 关键词应简洁(1–5 个字/词),避免重复,并优先选择领域专有术语。
+            
+            {output_format}
+            """;
+
+    static String KEYWORD_QUESTION_GEN_TMPL_OUTPUT_FORMAT_ZH = """
+            最终输出 JSON 格式:
+            {
+              "questions": [
+                "问题1",
+                "问题2",
+                ...
+              ],
+              "keywords": [
+                "关键词1",
+                "关键词2",
+                ...
+              ]
+            }
+            """;
+
+    //    //"适量"
+    //    //"若干"
+    //    //"足够覆盖主要信息的数量"
+    //    //"由模型根据上下文自动决定数量"
+    //    //"生成所有上下文能支持的、独特且合理的问题"
+    static String KEYWORD_DEFAULT = "若干";
+
+    static String QUESTION_DEFAULT = "若干";
+
+    public static String toMetadataStr(Map<String, Object> metadata) {
+        if (metadata == null || metadata.size() <= 0) return "";
+        StringBuilder sb = new StringBuilder();
+
+        metadata.forEach((k, v) -> {
+            sb.append(k).append(": ").append(v).append("\n");
+        });
+        if (sb.length() > 0) sb.deleteCharAt(sb.length() - 1);
+        return sb.toString();
+    }
+
+    public static String getKeywordAndQuestionPrompt(String text, Map<String, Object> metadata) {
+        return getKeywordAndQuestionPrompt(text, metadata, QUESTION_DEFAULT, KEYWORD_DEFAULT, KEYWORD_QUESTION_GEN_TMPL_OUTPUT_FORMAT_ZH);
+    }
+
+    public static String getKeywordAndQuestionPrompt(String text, Map<String, Object> metadata, String num_questions, String num_keywords, String output_format) {
+        PromptTemplate promptTemplate = PromptTemplate.builder()
+                .renderer(StTemplateRenderer.builder().startDelimiterToken('{').endDelimiterToken('}').build())
+                .template(KEYWORD_QUESTION_GEN_TMPL_ZH)
+                .build();
+
+        //String prompt = promptTemplate.render(Map.of("ill_name", question, "sample_json", sampleText2));
+        Map<String, Object> params = new HashMap<>();
+        params.put("context_str", text);
+        params.put("metadata_str", toMetadataStr(metadata));
+        if (num_questions != null) {
+            params.put("num_questions", num_questions);
+        }
+        if (num_keywords != null) {
+            params.put("num_keywords", num_keywords);
+        }
+        if (output_format != null) {
+            params.put("output_format", output_format);
+        }
+
+        String prompt = promptTemplate.render(params);
+
+        return prompt;
+    }
+}

+ 71 - 0
server/src/main/java/com/giantan/data/mds/bot/GChatClient.java

@@ -0,0 +1,71 @@
+package com.giantan.data.mds.bot;
+
+import org.springframework.ai.chat.client.ChatClient;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.core.io.DefaultResourceLoader;
+import org.springframework.core.io.Resource;
+import org.springframework.core.io.ResourceLoader;
+import org.springframework.stereotype.Service;
+
+@Service
+public class GChatClient {
+
+    // 字节跳动 34s
+    private static final String DEEPSEEK_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3"; //https://ark.cn-beijing.volces.com/api/v3
+    private static final String DEFAULT_DEEPSEEK_MODEL = "deepseek-v3-241226";//"deepseek-v3";
+    private static final String DEEPSEEK_API_KEY = "87e85db9-1ffc-49fa-a703-87bd6dd94b11";
+
+    private final ResourceLoader resourceLoader = new DefaultResourceLoader();
+    private Resource systemResource;
+
+//    private DeepSeekChatModel chatModel;
+//
+//
+//    public Deepseek2() {
+//        var deepSeekApi = DeepSeekApi.builder().baseUrl(DEEPSEEK_BASE_URL).apiKey(DEEPSEEK_API_KEY).build();
+//
+//        DeepSeekChatOptions options = DeepSeekChatOptions.builder().model(DEFAULT_DEEPSEEK_MODEL).maxTokens(4000).temperature(1.0).build();
+//        var chatModel = new DeepSeekChatModel(deepSeekApi, options, ToolCallingManager.builder().build(), RetryTemplate.defaultInstance(), ObservationRegistry.create());
+//        this.chatModel = chatModel;
+//    }
+//
+//    public DeepSeekChatModel getChatModel() {
+//        return chatModel;
+//    }
+//
+//    public String ask(String question) {
+//        //SystemPromptTemplate st = new SystemPromptTemplate();
+//
+//        ChatResponse response = chatModel.call(new Prompt(question));
+//        String text = response.getResult().getOutput().getText();
+//        return text;
+//
+//    }
+
+    @Autowired
+    ChatClient deepSeekChatClient;
+
+    @Autowired
+    ChatClient openAiChatClient;
+
+    public String ask(String question) {
+        //SystemPromptTemplate st = new SystemPromptTemplate();
+        ChatClient.CallResponseSpec ret = deepSeekChatClient.prompt(question).call();
+        return ret.content();
+    }
+
+    public String askOpenai(String question) {
+        ChatClient.CallResponseSpec ret = openAiChatClient.prompt(question).call();
+        return ret.content();
+    }
+
+    public String askOpenaiForAijiu(String question) {
+        //SystemPromptTemplate st = new SystemPromptTemplate();
+        String prompt = JingluoPrompts.aijiuPropmt(question);
+        //ChatClient.CallResponseSpec ret = openAiChatClient.prompt(prompt).call();
+        String ret = ask(prompt);
+        return ret;
+    }
+
+
+}

+ 88 - 0
server/src/main/java/com/giantan/data/mds/bot/JingluoPrompts.java

@@ -0,0 +1,88 @@
+package com.giantan.data.mds.bot;
+
+import org.springframework.ai.chat.prompt.PromptTemplate;
+import org.springframework.ai.template.st.StTemplateRenderer;
+
+import java.util.Map;
+
+public class JingluoPrompts {
+
+    public static String aijiuPropmt(String question) {
+        //SystemPromptTemplate st = new SystemPromptTemplate();
+
+        String sampleText = """
+                {
+                  "病因": [
+                    "情绪波动导致肝气郁结,影响脾胃功能",
+                    "饮食不节,过食生冷食物,损伤脾阳",
+                    "外邪侵袭,导致机体阳气不足",
+                    "环境因素,如寒湿气候"
+                  ],
+                  "治疗原理": {
+                    "调和阴阳": "通过艾灸调理体内阴阳平衡,温通寒邪",
+                    "温阳散寒": "利用艾灸的温热效应,温暖下焦,改善脾胃功能",
+                    "理气解郁": "疏通经络,缓解因情绪导致的气滞"
+                  },
+                  "贴敷穴位": [
+                    {
+                      "穴位": "关元",
+                      "位置": "肚脐下3寸"
+                    },
+                    {
+                      "穴位": "气海",
+                      "位置": "肚脐下1.5寸"
+                    },
+                    {
+                      "穴位": "足三里",
+                      "位置": "膝盖下缘3寸"
+                    },
+                    {
+                      "穴位": "中脘",
+                      "位置": "肚脐上4寸"
+                    }
+                  ],
+                  "注意事项": {
+                    "贴敷频率": "每周2-3次,视症状而定",
+                    "时间": "每次1-3小时",
+                    "适应人群": "适合体质虚寒者,孕妇及重病患者需谨慎",
+                    "贴后注意保暖": "避免受寒,保持腹部温暖"
+                  }
+                }
+                """;
+
+        String sampleText2 = """
+                {
+                  "病因": [
+                    ...
+                  ],
+                  "治疗原理": {
+                    ...
+                  },
+                  "贴敷穴位": [
+                    {
+                      "穴位": ...,
+                      "位置": ...
+                    },
+                    {
+                      "穴位": ...,
+                      "位置": ...
+                    },
+                    ...
+                  ],
+                  "注意事项": ...
+                }
+                """;
+        PromptTemplate promptTemplate = PromptTemplate.builder()
+                .renderer(StTemplateRenderer.builder().startDelimiterToken('<').endDelimiterToken('>').build())
+                .template("""
+            你作为中医经络专家,能够从 中医病因、治疗原理、贴敷穴位、注意事项等方面给出<ill_name>的艾灸贴治疗方法吗,以json格式输出,参考样式如: <sample_json>
+            """)
+                .build();
+
+        String prompt = promptTemplate.render(Map.of("ill_name", question, "sample_json", sampleText2));
+        //System.out.println("prompt===\n"+prompt);
+        //System.out.println("====");
+
+        return prompt;
+    }
+}

+ 136 - 0
server/src/main/java/com/giantan/data/mds/chunk/DynamicChunkRepository.java

@@ -1,10 +1,18 @@
 package com.giantan.data.mds.chunk;
 
+import com.fasterxml.jackson.core.type.TypeReference;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.giantan.ai.util.id.IdGenerator;
 import com.giantan.ai.util.id.UuidGenerator;
 import org.springframework.jdbc.core.JdbcTemplate;
 
+import java.sql.Array;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
 public class DynamicChunkRepository {
     private final ObjectMapper mapper = new ObjectMapper();
 
@@ -78,4 +86,132 @@ public class DynamicChunkRepository {
         String sql = String.format("DELETE FROM %s ", tableName(collId));
         jdbc.update(sql);
     }
+
+
+    public long deleteByMdId(String collId, Integer mdId) {
+        //String sql = "DELETE FROM %s WHERE md_id = ?";
+        String sql = String.format("DELETE FROM %s WHERE md_id = ?", tableName(collId));
+        return jdbc.update(sql, mdId);
+    }
+
+    public Integer save(String collId, MdChunk chunk) {
+        String sql1 = """
+                    INSERT INTO %s (
+                        md_id, chunk_index, content, plain_text, embedding, chunk_type,
+                        paragraph_start, paragraph_end, offset_start, offset_end, section_path,
+                        keywords, metadata, extra
+                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?::jsonb, ?::jsonb)
+                """;
+        String sql = String.format(sql1, tableName(collId));
+
+        int updated = jdbc.update(sql,
+                chunk.getMdId(),
+                chunk.getChunkIndex(),
+                chunk.getContent(),
+                chunk.getPlainText(),
+                chunk.getEmbedding(),
+                chunk.getChunkType(),
+                chunk.getParagraphStart(),
+                chunk.getParagraphEnd(),
+                chunk.getOffsetStart(),
+                chunk.getOffsetEnd(),
+                chunk.getSectionPath(),
+                toSqlArray(chunk.getKeywords()),
+                toJson(chunk.getMetadata()),
+                toJson(chunk.getExtra())
+        );
+        return updated;
+    }
+
+    public List<Integer> saveAll(String collId, List<MdChunk> chunks) {
+        String sql1 = """
+                    INSERT INTO %s (
+                        md_id, chunk_index, content, plain_text, embedding, chunk_type,
+                        paragraph_start, paragraph_end, offset_start, offset_end, section_path,
+                        keywords, metadata, extra
+                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?::jsonb, ?::jsonb)
+                """;
+        String sql = String.format(sql1, tableName(collId));
+        List<Integer> rets = new ArrayList<>();
+
+        int batchSize = 50;
+        for (int i = 0; i < chunks.size(); i += batchSize) {
+            int end = Math.min(i + batchSize, chunks.size());
+            List<MdChunk> batch = chunks.subList(i, end);
+
+            int[][] batched = jdbc.batchUpdate(sql, batch, batch.size(), (ps, chunk) -> {
+                ps.setObject(1, chunk.getMdId());
+                ps.setObject(2, chunk.getChunkIndex());
+                ps.setObject(3, chunk.getContent());
+                ps.setObject(4, chunk.getPlainText());
+                ps.setObject(5, chunk.getEmbedding());
+                ps.setObject(6, chunk.getChunkType());
+                ps.setObject(7, chunk.getParagraphStart());
+                ps.setObject(8, chunk.getParagraphEnd());
+                ps.setObject(9, chunk.getOffsetStart());
+                ps.setObject(10, chunk.getOffsetEnd());
+                ps.setObject(11, chunk.getSectionPath());
+                ps.setArray(12, toSqlArray(chunk.getKeywords()));
+                ps.setObject(13, toJson(chunk.getMetadata()));
+                ps.setObject(14, toJson(chunk.getExtra()));
+            });
+            for (int j = 0; j < batched.length; j++) {
+                for (int k = 0; k < batched[j].length; k++) {
+                    rets.add(batched[j][k]);
+                }
+            }
+        }
+        return rets;
+    }
+
+    private Array toSqlArray(List<String> ls) {
+        if (ls == null) {
+            return null;
+        }
+        try {
+            // 使用 JdbcTemplate 连接的 DataSource 创建 Array
+            return jdbc.getDataSource().getConnection().createArrayOf("TEXT", ls.toArray(new String[0]));
+        } catch (SQLException e) {
+            throw new RuntimeException("Error creating SQL Array", e);
+        }
+    }
+
+    private MdChunk mapRow(ResultSet rs, int rowNum) throws SQLException {
+        MdChunk c = new MdChunk();
+        c.setId(rs.getLong("id"));
+        c.setMdId(rs.getInt("md_id"));
+        c.setChunkIndex(rs.getInt("chunk_index"));
+        c.setContent(rs.getString("content"));
+        c.setPlainText(rs.getString("plain_text"));
+        c.setEmbedding(rs.getString("embedding"));
+        c.setChunkType(rs.getString("chunk_type"));
+        c.setParagraphStart(rs.getInt("paragraph_start"));
+        c.setParagraphEnd(rs.getInt("paragraph_end"));
+        c.setOffsetStart(rs.getInt("offset_start"));
+        c.setOffsetEnd(rs.getInt("offset_end"));
+        c.setSectionPath(rs.getString("section_path"));
+        c.setKeywords(SqlArrayUtils.fromStringArray(rs.getArray("keywords")));
+        c.setMetadata(fromJson(rs.getString("metadata")));
+        c.setExtra(fromJson(rs.getString("extra")));
+        c.setCreatedAt(rs.getTimestamp("created_at").toInstant());
+        return c;
+    }
+
+    private String toJson(Map<String, Object> map) {
+        try {
+            return map == null ? null : mapper.writeValueAsString(map);
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private Map<String, Object> fromJson(String json) {
+        try {
+            return json == null ? null : mapper.readValue(json, new TypeReference<>() {
+            });
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+
 }

+ 2 - 2
server/src/main/java/com/giantan/data/mds/chunk/MdChunkController.java

@@ -5,8 +5,8 @@ import org.springframework.web.bind.annotation.*;
 
 import java.util.List;
 
-@RestController
-@RequestMapping("/api/md-chunks")
+//@RestController
+//@RequestMapping("/api/md-chunks")
 public class MdChunkController {
     private final MdChunkRepository repository;
 

+ 2 - 2
server/src/main/java/com/giantan/data/mds/chunk/MdChunkRepository.java

@@ -11,7 +11,7 @@ import java.sql.SQLException;
 import java.util.List;
 import java.util.Map;
 
-@Repository
+//@Repository
 public class MdChunkRepository {
 
     private final JdbcTemplate jdbcTemplate;
@@ -30,7 +30,7 @@ public class MdChunkRepository {
             ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?::jsonb, ?::jsonb)
         """;
 
-        jdbcTemplate.update(sql,
+        int updated = jdbcTemplate.update(sql,
                 chunk.getMdId(),
                 chunk.getChunkIndex(),
                 chunk.getContent(),

+ 63 - 0
server/src/main/java/com/giantan/data/mds/config/ChatClientConfig.java

@@ -0,0 +1,63 @@
+package com.giantan.data.mds.config;
+
+import io.micrometer.observation.ObservationRegistry;
+import org.springframework.ai.chat.client.ChatClient;
+import org.springframework.ai.deepseek.DeepSeekChatModel;
+import org.springframework.ai.deepseek.DeepSeekChatOptions;
+import org.springframework.ai.deepseek.api.DeepSeekApi;
+import org.springframework.ai.model.tool.ToolCallingManager;
+import org.springframework.ai.openai.OpenAiChatModel;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.retry.support.RetryTemplate;
+
+@Configuration
+public class ChatClientConfig {
+
+    // 字节跳动 34s
+//    private static final String DEEPSEEK_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3"; //https://ark.cn-beijing.volces.com/api/v3
+//    private static final String DEFAULT_DEEPSEEK_MODEL = "deepseek-v3-241226";//"deepseek-v3";
+//    private static final String DEEPSEEK_API_KEY = "87e85db9-1ffc-49fa-a703-87bd6dd94b11";
+
+    @Value("${spring.ai.deepseek.base-url}")
+    private String url;
+
+    @Value("${spring.ai.deepseek.api-key}")
+    private String apiKey;
+
+    @Value("${spring.ai.deepseek.chat.model}")
+    private String chatModel;
+
+
+//    @Autowired
+//    private DeepSeekChatModel deepSeekChatModel;
+
+    @Autowired
+    private OpenAiChatModel openAiChatModel;
+
+    @Bean
+    public DeepSeekChatModel deepSeekChatModel() {
+       var deepSeekApi = DeepSeekApi.builder().baseUrl(url).apiKey(apiKey).build();
+
+        DeepSeekChatOptions options = DeepSeekChatOptions.builder().model(chatModel)
+                .maxTokens(4000).temperature(1.0).build();
+        var chatModel = new DeepSeekChatModel(deepSeekApi, options, ToolCallingManager.builder().build(), RetryTemplate.defaultInstance(), ObservationRegistry.create());
+        return chatModel;
+    }
+
+    @Bean
+    public ChatClient deepSeekChatClient() {
+        ChatClient client = ChatClient.builder(deepSeekChatModel()).build();
+        return client;
+    }
+
+    @Bean
+    public ChatClient openAiChatClient() {
+        ChatClient client = ChatClient.builder(openAiChatModel).build();
+        return client;
+    }
+
+
+}

+ 10 - 1
server/src/main/java/com/giantan/data/mds/config/TaskConfiguration.java

@@ -1,5 +1,7 @@
 package com.giantan.data.mds.config;
 
+import com.giantan.data.mds.service.IMdChunksService;
+import com.giantan.data.mds.service.IMdFilesService;
 import com.giantan.data.tasks.TaskEventListener;
 import com.giantan.data.tasks.ITaskHandler;
 import com.giantan.data.tasks.TaskHandlerRegistry;
@@ -8,6 +10,7 @@ import com.giantan.data.mds.task.impl.KeywordsTaskHandler;
 import com.giantan.data.mds.task.impl.SliceTaskHandler;
 import com.google.common.eventbus.AsyncEventBus;
 import com.google.common.eventbus.EventBus;
+import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.context.annotation.Bean;
 import org.springframework.context.annotation.Configuration;
 import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
@@ -23,6 +26,12 @@ class TaskConfiguration {
 //        return new EventBus();
 //    }
 
+    @Autowired
+    IMdFilesService mdFilesService;
+
+    @Autowired
+    IMdChunksService mdChunksService;
+
     @Bean
     public Executor taskExecutor() {
         //return Executors.newFixedThreadPool(10);
@@ -58,7 +67,7 @@ class TaskConfiguration {
 
     @Bean
     public SliceTaskHandler sliceTaskHandler() {
-        return new SliceTaskHandler();
+        return new SliceTaskHandler(mdFilesService,mdChunksService);
     }
 
     @Bean

+ 65 - 0
server/src/main/java/com/giantan/data/mds/controller/ChatController.java

@@ -0,0 +1,65 @@
package com.giantan.data.mds.controller;


import com.giantan.data.kvs.constant.KvConstants;
import com.giantan.data.mds.bot.GChatClient;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

import java.lang.invoke.MethodHandles;
import java.util.Map;

/**
 * Question/answer endpoints backed by {@link GChatClient}.
 */
@RestController
@RequestMapping(KvConstants.API_PREFIX + "/bot")
public class ChatController {

    private static final org.slf4j.Logger log =
            org.slf4j.LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /** Wrapper around the configured chat model(s). */
    @Autowired
    GChatClient deepseek2;

    /**
     * Answers a free-form question with the default chat model.
     *
     * @return a single-entry map {"generation": answer}
     */
    @GetMapping("/ask")
    public Map<String, String> generate(@RequestParam(value = "question", defaultValue = "Tell me a joke") String question) {
        // Parameterized logging: never pass user input as the message
        // template — a "{}" in the question would corrupt the log line.
        log.info("ask: {}", question);
        String ret = deepseek2.ask(question);
        return Map.of("generation", ret);
    }

    /**
     * Domain-specific ("aijiu") variant answered via the OpenAI-backed
     * client; returns the raw answer text.
     */
    @GetMapping("/ask2")
    public String ask2(@RequestParam(value = "question", defaultValue = "Tell me a joke") String question) {
        log.info("ask2: {}", question);
        return deepseek2.askOpenaiForAijiu(question);
    }

}

+ 5 - 18
server/src/main/java/com/giantan/data/mds/controller/DownloadController.java

@@ -1,9 +1,8 @@
 package com.giantan.data.mds.controller;
 
-import com.giantan.data.kvs.kvstore.GBaseKeyValue;
+
 import com.giantan.data.kvs.constant.KvConstants;
-import com.giantan.data.mds.service.IMdDocsService;
-import com.giantan.gfs.service.impl.S3GkbService;
+import com.giantan.data.mds.service.IMdFilesService;
 import jakarta.servlet.http.HttpServletResponse;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.web.bind.annotation.*;
@@ -17,30 +16,18 @@ import java.net.URLConnection;
 public class DownloadController {
 
     @Autowired
-    S3GkbService gkbStorer;
-
-    //    @Autowired
-//    ICollectionService collectionService;
-    @Autowired
-    IMdDocsService mdDocsService;
+    IMdFilesService mdFilesService;
 
-    private String getObjectPath(String coll, String gid) throws Throwable {
-        GBaseKeyValue r = mdDocsService.findByMdid(coll, gid);
-        return r.getName();
-    }
 
     @GetMapping("/mds/{gid}/download")
     public void downloadFile(@PathVariable String collId, @PathVariable String gid, @RequestParam String filename, HttpServletResponse response) throws Throwable {
 
-        String repository = collId;
-        String fromObject = getObjectPath(collId, gid);
+        //String repository = collId;//String fromObject = getObjectPath(collId, gid);
 
         //如果你希望支持 预览(如 Markdown 在线预览) 而不是下载,可以把 Content-Disposition 改成:
         //response.setHeader("Content-Disposition", "inline; filename=\"" + filename + "\"");
 
-        try (InputStream stream = gkbStorer.download(repository, fromObject)) {
-//                     minioClient.getObject(
-//                GetObjectArgs.builder().bucket(bucket).object(filename).build())) {
+        try (InputStream stream = mdFilesService.download(collId, gid)) {
 
             String contentType = URLConnection.guessContentTypeFromName(filename);
             if (contentType == null) contentType = "application/octet-stream";

+ 72 - 23
server/src/main/java/com/giantan/data/mds/controller/TaskController.java

@@ -12,10 +12,7 @@ import com.giantan.data.tasks.TaskManager;
 import com.giantan.data.tasks.TaskType;
 import com.giantan.data.tasks.TaskStatus;
 
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 
 @RestController
 @RequestMapping(KvConstants.API_PREFIX + "/collections/{collId}/tasks")
@@ -24,37 +21,89 @@ public class TaskController {
     @Autowired
     private TaskManager manager;
 
+    /*
+    {
+	"type": "SLICE",
+	"objectIds": [2],  //或者 fromId/toId
+	"mdType": "simple",  // faq/tagged
+	"chunkMetadata": {
+		"k1": "v1"
+	    },
+	"chunkSize": 512,
+    "chunkOverlap": 64
+    }
+     */
+
     @PostMapping("/submit")
-    public String start(@RequestBody Map<String, Object> payload) {
-        //   @RequestParam(required = false, defaultValue = "3") int maxRetries,
-        //   @RequestParam(required = false, defaultValue = "1000") long retryDelay
+    public Map submit(@PathVariable String collId, @RequestBody Map<String, Object> payload) {
+        String t = (String) payload.remove("type");
+
+        TaskType type = TaskType.valueOf(t);
+
+        List<Object> objects = null;
+
+        if (payload.containsKey("objectIds")) {
+            objects = (List<Object>) payload.remove("objectIds");
+        } else if (payload.containsKey("fromId") && payload.containsKey("toId")) {
+            int from = (int) payload.remove("fromId");
+            int to = (int) payload.remove("toId");
+            objects = new ArrayList<>();
+            for (int i = from; i <= to; i++) {
+                objects.add(i);
+            }
+        } else {
+            throw new IllegalArgumentException("必须提供 objectIds 或 fromId/toId");
+        }
+
+        //Map<String, Object> params = (Map<String, Object>) payload.getOrDefault("params", new HashMap<>());
+        Map<String, Object> params = new HashMap<>(payload);
 
-        TaskType type = TaskType.valueOf((String) payload.get("type"));
-        List<Object> objects = (List<Object>) payload.get("objectIds");
+        String ret = manager.submit(collId, type, objects, params);
+        return Map.of("taskId", ret);
+    }
 
-        Map<String, Object> params = (Map<String, Object>) payload.getOrDefault("params", new HashMap<>());
-        return manager.submit(type, objects, params);
+    @PostMapping("/{id}/cancel")
+    public Map cancel(@PathVariable String collId, @PathVariable String id) {
+        boolean ok = manager.cancel(collId, id);
+        return Map.of("canceled", ok);
     }
 
-    @PostMapping("/cancel/{id}")
-    public void cancel(@PathVariable String id) {
-        manager.cancel(id);
+    @DeleteMapping("/{id}")
+    public Map delete(@PathVariable String collId, @PathVariable String id) {
+        boolean ok = manager.delete(collId, id);
+        return Map.of("deleted", ok);
     }
 
-    @GetMapping("/status/{id}")
-    public Map<String, TaskStatus> status(@PathVariable String id) {
-        TaskContext ctx = manager.getTask(id);
+    @GetMapping("/{id}/status")
+    public Map<String, TaskStatus> status(@PathVariable String collId, @PathVariable String id) {
+        TaskContext ctx = manager.getTask(collId, id);
         return ctx != null ? ctx.getObjectStatus() : Collections.emptyMap();
     }
 
     @GetMapping("/{id}")
-    public TaskContext getTask(@PathVariable String id) {
-        return manager.getTask(id);
+    public TaskContext getTask(@PathVariable String collId, @PathVariable String id) {
+        return manager.getTask(collId, id);
+    }
+
+    @DeleteMapping("/cleanup")
+    public Map cleanup(@PathVariable String collId) {
+        int r = manager.cleanupNow(collId);
+        //return ResponseEntity.ok("Task cleanup triggered.");
+        return Map.of("deleted", r);
     }
 
-    @PostMapping("/cleanup")
-    public ResponseEntity<String> cleanup() {
-        manager.cleanupNow();
-        return ResponseEntity.ok("Task cleanup triggered.");
+    @GetMapping
+    public Collection<TaskContext> listAllTasks(@PathVariable String collId) {
+        return manager.allTasks(collId);
+//                .stream()
+//                .filter(t -> collId.equals(t.getCollection()))
+//                .collect(Collectors.toList());
     }
+
+    @GetMapping("/status/{status}")
+    public Collection<TaskContext> listTasksByStatus(@PathVariable String collId, @PathVariable String status) {
+        //TaskStatus statusEnum = TaskStatus.valueOf(status.toUpperCase());
+        return manager.findByStatus(collId, status);
+    }
+
 }

+ 2 - 0
server/src/main/java/com/giantan/data/mds/repository/MdDynamicChunkRepository.java

@@ -20,4 +20,6 @@ public class MdDynamicChunkRepository extends DynamicChunkRepository {
     public void init() {
         setSchema("mddb", "chunks_");
     }
+
+
 }

+ 5 - 1
server/src/main/java/com/giantan/data/mds/service/FileProcessingService.java

@@ -1,5 +1,6 @@
 package com.giantan.data.mds.service;
 
+import com.giantan.data.tasks.TaskManager;
 import com.giantan.gfs.service.impl.S3GkbService;
 import com.giantan.gfs.storer.util.FileUtil;
 import com.giantan.gfs.storer.util.J7Zip;
@@ -27,11 +28,13 @@ public class FileProcessingService {
     @Autowired
     private TaskStatusManager taskStatusManager;
 
+//    @Autowired
+//    TaskManager taskManager;
+
     @Autowired
     S3GkbService gkbStorer;
 
     @Autowired
-
     MdCollectionsService mdCollectionsService;
 
     //collectionService.createEntry(collId, data)
@@ -77,6 +80,7 @@ public class FileProcessingService {
             //e.printStackTrace();
             TaskStatus status = new TaskStatus(coll, taskId, "失败", e.getMessage(), System.currentTimeMillis(), System.currentTimeMillis());
             taskStatusManager.markFailed(taskId, status);
+
             log.error("任务 {} 失败", taskId, e);
             //log.error("Error occurred: " + e);
             throw e;

+ 1 - 0
server/src/main/java/com/giantan/data/mds/service/IDynamicDocService.java

@@ -5,6 +5,7 @@ import com.giantan.data.kvs.kvstore.GBaseKeyValue;
 import java.util.List;
 import java.util.Map;
 
+// Not used anywhere (没有使用)
 public interface IDynamicDocService {
 
     GBaseKeyValue createEntry(String collId, Map<String, Object> data) throws Throwable;

+ 15 - 0
server/src/main/java/com/giantan/data/mds/service/IMdChunksService.java

@@ -0,0 +1,15 @@
package com.giantan.data.mds.service;

import com.giantan.data.mds.chunk.MdChunk;

import java.util.List;

/**
 * Persistence operations for the chunks produced by slicing a markdown file.
 */
public interface IMdChunksService {

    /**
     * Removes all chunks belonging to the given markdown file.
     *
     * @return number of deleted rows
     */
    long deleteByMdId(String collId, Integer mdId);

    /** Persists a batch of chunks; returns their generated ids. */
    List<Integer> saveAll(String collId, List<MdChunk> chunks) throws Throwable;

    /** Persists a single chunk; returns its generated id. */
    Integer save(String coll, MdChunk chunk) throws Throwable;
}

+ 15 - 0
server/src/main/java/com/giantan/data/mds/service/IMdFilesService.java

@@ -0,0 +1,15 @@
package com.giantan.data.mds.service;

import com.giantan.data.kvs.kvstore.GBaseKeyValue;

import java.io.InputStream;

/**
 * Read access to the markdown source files stored behind a collection.
 */
public interface IMdFilesService {

    /**
     * Opens the stored object for reading. The caller owns the stream and
     * must close it.
     */
    InputStream download(String repository, String fullName) throws Throwable;

    /** Returns the full markdown text of the given file. */
    String getMdFileContent(String coll, String mdId) throws Throwable;

    /** Returns the stored metadata record for the given markdown id. */
    GBaseKeyValue findByMdid(String coll, String mdId) throws Throwable;
}

+ 1 - 2
server/src/main/java/com/giantan/data/mds/service/MdCache.java

@@ -1,8 +1,7 @@
 package com.giantan.data.mds.service;
 
 
-import com.giantan.mds.MdSearcher;
-import org.cnnlp.data.md.DocTree;
+import org.cnnlp.data.md.MdSearcher;
 import org.springframework.stereotype.Component;
 import com.github.benmanes.caffeine.cache.Cache;
 import com.github.benmanes.caffeine.cache.Caffeine;

+ 71 - 0
server/src/main/java/com/giantan/data/mds/service/MdChunksService.java

@@ -0,0 +1,71 @@
package com.giantan.data.mds.service;

import com.giantan.data.mds.chunk.MdChunk;
import com.giantan.data.mds.repository.MdDynamicChunkRepository;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.util.List;

/**
 * Default {@link IMdChunksService}: resolves the collection name to its
 * numeric id and delegates to the per-collection dynamic chunk repository
 * (which addresses tables by that id).
 */
@Service
public class MdChunksService implements IMdChunksService {

    @Autowired
    MdCollectionsService mdCollectionsService;

    @Autowired
    MdDynamicChunkRepository mdDynamicChunkRepository;

    /** {@inheritDoc} */
    @Override
    public long deleteByMdId(String coll, Integer mdId) {
        int collId = mdCollectionsService.getCollectionId(coll);
        return mdDynamicChunkRepository.deleteByMdId(Integer.toString(collId), mdId);
    }

    /** {@inheritDoc} */
    @Override
    public List<Integer> saveAll(String coll, List<MdChunk> chunks) throws Throwable {
        int collId = mdCollectionsService.getCollectionId(coll);
        return mdDynamicChunkRepository.saveAll(Integer.toString(collId), chunks);
    }

    /** {@inheritDoc} */
    @Override
    public Integer save(String coll, MdChunk chunk) throws Throwable {
        int collId = mdCollectionsService.getCollectionId(coll);
        return mdDynamicChunkRepository.save(Integer.toString(collId), chunk);
    }

}

+ 1 - 1
server/src/main/java/com/giantan/data/mds/service/MdDocsService.java

@@ -4,7 +4,7 @@ import com.giantan.data.kvs.kvstore.GBaseKeyValue;
 import com.giantan.data.mds.repository.MdDynamicRepository;
 import com.giantan.gfs.service.impl.S3GkbService;
 import com.giantan.gfs.storer.util.FileUtil;
-import com.giantan.mds.MdSearcher;
+import org.cnnlp.data.md.MdSearcher;
 import org.cnnlp.data.util.BaseParameters;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.scheduling.annotation.Async;

+ 57 - 0
server/src/main/java/com/giantan/data/mds/service/MdFilesService.java

@@ -0,0 +1,57 @@
package com.giantan.data.mds.service;

import com.giantan.data.kvs.kvstore.GBaseKeyValue;
import com.giantan.gfs.service.impl.S3GkbService;
import org.apache.commons.io.IOUtils;
import org.cnnlp.data.md.MdSearcher;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.InputStream;
import java.nio.charset.StandardCharsets;

/**
 * Default {@link IMdFilesService}: resolves a markdown id to its stored
 * object path and streams the content from S3-compatible storage, reading
 * through the MdSearcher cache for full-text requests.
 */
@Service
public class MdFilesService implements IMdFilesService {

    @Autowired
    S3GkbService gkbStorer;

    @Autowired
    IMdDocsService mdDocsService;

    @Autowired
    MdCache mdCache;

    /** Looks up the storage object name recorded for the given markdown id. */
    private String getObjectPath(String coll, String gid) throws Throwable {
        GBaseKeyValue r = mdDocsService.findByMdid(coll, gid);
        return r.getName();
    }

    @Override
    public InputStream download(String coll, String mdId) throws Throwable {
        // The collection name doubles as the storage repository/bucket name.
        String fromObject = getObjectPath(coll, mdId);
        return gkbStorer.download(coll, fromObject);
    }

    @Override
    public String getMdFileContent(String coll, String mdId) throws Throwable {
        String key = coll + ":" + mdId;
        MdSearcher searcher = mdCache.get(key);
        if (searcher != null) {
            // Cache hit: the searcher already holds the parsed text.
            return searcher.getText();
        }
        try (InputStream stream = download(coll, mdId)) {
            // Charset constant instead of the deprecated String-name overload.
            return IOUtils.toString(stream, StandardCharsets.UTF_8);
        }
    }

    @Override
    public GBaseKeyValue findByMdid(String coll, String mdId) throws Throwable {
        return mdDocsService.findByMdid(coll, mdId);
    }

}

+ 39 - 0
server/src/main/java/com/giantan/data/mds/task/impl/BaseTaskHandler.java

@@ -0,0 +1,39 @@
+package com.giantan.data.mds.task.impl;
+
+import com.giantan.data.tasks.ITaskHandler;
+import com.giantan.data.tasks.TaskContext;
+import com.giantan.data.tasks.TaskStatus;
+import com.giantan.data.tasks.TaskType;
+
+public abstract class BaseTaskHandler implements ITaskHandler {
+
+    private static final org.slf4j.Logger log
+            = org.slf4j.LoggerFactory.getLogger(BaseTaskHandler.class);
+
+    public void handle(TaskContext context) {
+        boolean isCanceled = false;
+        log.info(getType() + " task: {} started", context.getTaskId());
+        for (Object objectId : context.getObjectIds()) {
+            if (context.isCancelled()) {
+                context.setStatus(TaskStatus.CANCELLED);
+                log.info(getType() + " task: {} (objectId: {}) cancelled", context.getTaskId(), objectId);
+                isCanceled = true;
+                break;
+            }
+            try {
+                doing(context, objectId);
+                context.logSuccess(objectId.toString());
+            } catch (Exception e) {
+                context.logFailure(objectId.toString(), e.getMessage());
+                log.error(getType() + " task: {} (objectId: {}) error: {}", context.getTaskId(), objectId, e.getMessage());
+            }
+        }
+        if (!isCanceled) {
+            log.info(getType() + " task: {} finished", context.getTaskId());
+        }
+    }
+
+    public abstract TaskType getType();
+
+    public abstract void doing(TaskContext context, Object objectId);
+}

+ 189 - 14
server/src/main/java/com/giantan/data/mds/task/impl/SliceTaskHandler.java

@@ -1,27 +1,202 @@
 package com.giantan.data.mds.task.impl;
 
+import com.giantan.data.kvs.kvstore.GBaseKeyValue;
+import com.giantan.data.mds.chunk.MdChunk;
+import com.giantan.data.mds.service.IMdChunksService;
+import com.giantan.data.mds.service.IMdFilesService;
 import com.giantan.data.tasks.TaskContext;
-import com.giantan.data.tasks.ITaskHandler;
 import com.giantan.data.tasks.TaskType;
+import org.cnnlp.data.document.GDocConstants;
+import org.cnnlp.data.document.GDocument;
+import org.cnnlp.data.splitter.*;
+import org.cnnlp.data.util.BaseParameters;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.*;
+
 
 //@Component
-public class SliceTaskHandler implements ITaskHandler {
-    @Override
-    public void handle(TaskContext context) {
-        for (Object objectId : context.getObjectIds()) {
-            if (context.isCancelled()) break;
-            try {
-                System.out.println("Slicing object: " + objectId);
-                Thread.sleep(300); // simulate
-                context.logSuccess(objectId.toString());
-            } catch (Exception e) {
-                context.logFailure(objectId.toString(), e.getMessage());
-            }
-        }
+public class SliceTaskHandler extends BaseTaskHandler {
+
+    private static final org.slf4j.Logger log
+            = org.slf4j.LoggerFactory.getLogger(SliceTaskHandler.class);
+
+    private static final String MD_TYPE = "mdType";
+    private static final String CHUNK_METADATA = "chunkMetadata";
+    private static final String FILE_NAME = "_fileName";
+
+    IMdFilesService mdFilesService;
+    IMdChunksService mdChunksService;
+
+    public SliceTaskHandler(IMdFilesService mdFilesService, IMdChunksService mdChunksService) {
+        this.mdFilesService = mdFilesService;
+        this.mdChunksService = mdChunksService;
     }
 
+//    @Override
+//    public void handle(TaskContext context) {
+//        for (Object objectId : context.getObjectIds()) {
+//            if (context.isCancelled()) {
+//                context.setStatus(TaskStatus.CANCELLED);
+//                log.info(getType()+" task: {}(objectId: {}) cancelled", context.getTaskId(),objectId);
+//                break;
+//            }
+//            try {
+//                System.out.println("Slicing object: " + objectId);
+//                Thread.sleep(15000); // simulate
+//                context.logSuccess(objectId.toString());
+//            } catch (Exception e) {
+//                context.logFailure(objectId.toString(), e.getMessage());
+//                log.error(getType()+" task: {}(objectId: {}) error: ", context.getTaskId(),objectId,e.getMessage());
+//            }
+//        }
+//    }
+
+
     @Override
     public TaskType getType() {
         return TaskType.SLICE;
     }
+
+    @Override
+    public void doing(TaskContext context, Object objectId) {
+
+        //System.out.println("Slicing object: " + objectId);
+        try {
+            String coll = context.getCollection();
+            Map<String, Object> params = context.getParams();
+
+            String mdId = objectId.toString();
+            GBaseKeyValue mdMeta = mdFilesService.findByMdid(coll, mdId);
+            if (mdMeta != null) {
+                String text = mdFilesService.getMdFileContent(coll, mdId);
+                String mdType = getMdType(params);
+                String name = mdMeta.getName();
+                String baseName = SplitUtils.getBaseName(name);
+
+                List<GDocument> chunks = splitToChunks(text, mdType, params);
+
+                if (chunks != null) {
+                    // 删除 gid 的chunks
+                    // 存 chunks
+                    //System.out.println(chunks.get(0));
+                    Map<String, Object> chunkMetadata = new HashMap<>();
+                    Object mo = params.get(CHUNK_METADATA);
+                    if (mo != null && mo instanceof Map) {
+                        chunkMetadata.putAll((Map) mo);
+                    }
+                    mdChunksService.deleteByMdId(coll, toInt(mdId));
+                    List<MdChunk> mdChunks = new ArrayList<MdChunk>();
+                    for (int i = 0; i < chunks.size(); i++) {
+                        MdChunk chunk = toChunk(toInt(mdId), chunks.get(i), i, baseName, chunkMetadata);
+                        mdChunks.add(chunk);
+                    }
+
+                    List<Integer> rets = mdChunksService.saveAll(coll, mdChunks);
+                    //System.out.println("saved="+rets.size());
+                }
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        } catch (Throwable e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private String getMdType(Map<String, Object> params) {
+        String type = MdChunking.MD_TYPE_SIMPLE;
+        if (params != null) {
+            Object o = params.get(MD_TYPE);
+            if (o != null) {
+                type = o.toString();
+            }
+        }
+        return type;
+    }
+
+
+//    public List<GDocument> splitSimple(String text) throws IOException {
+//        SimpleMdSplitter splitter = new SimpleMdSplitter();
+//        splitter.setCommentProcessor(KvCommentProcessor.build());
+//        BaseParameters params = BaseParameters.defaultParams();
+//        //Path path = Paths.get(md);
+//        //String baseName = SplitUtils.getFileBaseName(path);
+//        //System.out.println("baseName=" + baseName);
+//        List<GDocument> docs = splitter.split(text, params);
+//        //SplitUtils.toJsonFile(new File(json), docs);
+//        return docs;
+//    }
+//
+//    public List<GDocument> splitFaq(String text) throws IOException {
+//        FaqMdSplitter splitter = new FaqMdSplitter();
+//        BaseParameters params = BaseParameters.defaultParams();
+//        List<GDocument> docs = splitter.split(text, params);
+//        //SplitUtils.toJsonFile(new File(json), docs);
+//        return docs;
+//    }
+
+    private Integer toInt(Object o) {
+        if (o instanceof Integer) {
+            return (Integer) o;
+        }
+        return Integer.parseInt(o.toString());
+    }
+
+    protected MdChunk toChunk(Integer mdId, GDocument doc, int idx, String baseName, Map<String, Object> userMetadata) {
+        MdChunk chunk = new MdChunk();
+        chunk.setMdId(mdId);
+        chunk.setChunkIndex(idx);
+        chunk.setPlainText(doc.getText());
+        chunk.setSectionPath(doc.getId());
+        chunk.setCreatedAt(new Date().toInstant());
+
+        Map<String, Object> metadata = doc.getMetadata();
+        Object o = metadata.remove(GDocConstants.RAW_CONTENT);
+        if (o != null) {
+            chunk.setContent(o.toString());
+        }
+
+        o = metadata.remove(GDocConstants.START_OFFSET);
+        if (o != null) {
+            chunk.setOffsetStart(toInt(o));
+        }
+        o = metadata.remove(GDocConstants.END_OFFSET);
+        if (o != null) {
+            chunk.setOffsetEnd(toInt(o));
+        }
+
+        o = metadata.remove(GDocConstants.FROM_IDX);
+        if (o != null) {
+            chunk.setParagraphStart(toInt(o));
+        }
+        o = metadata.remove(GDocConstants.TO_IDX);
+        if (o != null) {
+            chunk.setParagraphEnd(toInt(o));
+        }
+
+        Map<String, Object> metadata1 = chunk.getMetadata();
+        if (metadata1 == null) {
+            metadata1 = new HashMap<String, Object>();
+            chunk.setMetadata(metadata1);
+        }
+
+        if (userMetadata.size() > 0) {
+            metadata1.putAll(userMetadata);
+        }
+        metadata1.put(FILE_NAME, baseName);
+        metadata1.putAll(metadata);
+
+        return chunk;
+    }
+
+    public List<GDocument> splitToChunks(String text, String type, Map<String, Object> params) throws IOException {
+        BaseParameters params2 = new BaseParameters(params);
+        params2.put(MD_TYPE, type);
+        IMdChunking chunker = new MdChunking();
+        List<GDocument> chunks = chunker.chunking(text, params2);
+        return chunks;
+    }
+
 }

+ 16 - 6
server/src/main/java/com/giantan/data/tasks/ITaskManager.java

@@ -7,13 +7,23 @@ import java.util.List;
 import java.util.Map;
 
 public interface ITaskManager {
-    String submit(TaskType type, List<Object> objectIds, Map<String, Object> params);
-    boolean cancel(String taskId);
-    TaskContext getTask(String taskId);
-    Collection<TaskContext> allTasks();
+    String submit(String coll, TaskType type, List<Object> objectIds, Map<String, Object> params);
 
-    void updateExtra(String taskId, Map<String, Object> extra);
+    boolean cancel(String coll, String taskId);
+
+    TaskContext getTask(String coll, String taskId);
+
+    Collection<TaskContext> allTasks(String coll);
+
+    //int cleanupTasks(String coll);
+
+    void updateExtra(String coll, String taskId, Map<String, Object> extra);
+
+    int cleanupNow(String coll);
 
-    void cleanupNow();
     EventBus getEventBus();
+
+    boolean delete(String coll, String id);
+
+    Collection<TaskContext> findByStatus(String coll, String status);
 }

+ 19 - 2
server/src/main/java/com/giantan/data/tasks/TaskContext.java

@@ -2,6 +2,7 @@ package com.giantan.data.tasks;
 
 import com.giantan.data.tasks.repository.TaskStatusHistory;
 
+import java.io.Serializable;
 import java.time.Instant;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -11,25 +12,29 @@ import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicBoolean;
 
 
-public class TaskContext {
+public class TaskContext implements Serializable {
 
     private final String taskId;
+    private final String collection;
     private final TaskType type;
     private final List<Object> objectIds;
+
     private final Map<String, Object> params;
     private final AtomicBoolean cancelled = new AtomicBoolean(false);
     private volatile TaskStatus status;
     private final Map<String, TaskStatus> objectStatus = new ConcurrentHashMap<>();
     private Map<String, Object> extra = new HashMap<>();
     private volatile String error;
+
     private Instant createdAt;
     private Instant completedAt;
     private int retryCount;
     private int maxRetries;
     private long retryDelayMillis;
 
-    public TaskContext(String taskId, TaskType type, List<Object> objectIds, Map<String, Object> params) {
+    public TaskContext(String taskId, String collection,TaskType type, List<Object> objectIds, Map<String, Object> params) {
         this.taskId = taskId;
+        this.collection = collection;
         this.type = type;
         this.objectIds = objectIds;
         this.params = params;
@@ -40,6 +45,7 @@ public class TaskContext {
     public static TaskContext from(TaskStatusHistory history) {
         TaskContext ctx = new TaskContext(
                 history.getTaskId(),
+                history.getCollection(),
                 TaskType.valueOf(history.getTaskType()),
                 new ArrayList<>(history.getObjectIds()),
                 history.getParams()
@@ -69,6 +75,10 @@ public class TaskContext {
         return objectIds;
     }
 
+    public String getCollection() {
+        return collection;
+    }
+
     public boolean isCancelled() {
         return cancelled.get();
     }
@@ -77,6 +87,10 @@ public class TaskContext {
         cancelled.set(true);
     }
 
+    public Instant getCreatedAt() {
+        return createdAt;
+    }
+
     public TaskStatus getStatus() {
         return status;
     }
@@ -151,6 +165,9 @@ public class TaskContext {
     public String getError() { return error; }
     public void setError(String error) { this.error = error; }
 
+    public Map<String, Object> getParams() {
+        return params;
+    }
 
     public Map<String, Object> getExtra() {
         return extra;

+ 3 - 1
server/src/main/java/com/giantan/data/tasks/TaskEvent.java

@@ -5,12 +5,14 @@ import java.util.Map;
 
 public class TaskEvent {
     public String taskId;
+    public String collection;
     public TaskType type;
     public List<Object> objectIds;
     public Map<String, Object> params;
 
-    public TaskEvent(String taskId, TaskType type, List<Object> objectIds, Map<String, Object> params) {
+    public TaskEvent(String taskId, String collection, TaskType type, List<Object> objectIds, Map<String, Object> params) {
         this.taskId = taskId;
+        this.collection = collection;
         this.type = type;
         this.objectIds = objectIds;
         this.params = params;

+ 1 - 1
server/src/main/java/com/giantan/data/tasks/TaskEventListener.java

@@ -44,7 +44,7 @@ public class TaskEventListener {
 
     @Subscribe
     public void onTask(TaskEvent event) {
-        TaskContext context = manager.getTask(event.taskId);
+        TaskContext context = manager.getTask(event.collection,event.taskId);
         if (context == null || context.isCancelled()) return;
 
         ITaskHandler handler = registry.get(event.type);

+ 75 - 18
server/src/main/java/com/giantan/data/tasks/TaskManager.java

@@ -4,48 +4,46 @@ import com.google.common.eventbus.EventBus;
 
 import java.time.Duration;
 import java.time.Instant;
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-import java.util.UUID;
+import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.Executors;
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
 
-public class TaskManager implements ITaskManager{
+public class TaskManager implements ITaskManager {
 
     private final Map<String, TaskContext> tasks = new ConcurrentHashMap<>();
     private final EventBus eventBus;
 
     //@Value("${task.cleanup.expire-minutes:10}")
-    private long expireMinutes = 10;
+    private long expireMinutes = 30;
 
     //@Value("${task.cleanup.keep-last:10}")
-    private int keepLastCount = 10;
+    private int keepLastCount = 32;
 
     private final ScheduledExecutorService cleaner = Executors.newSingleThreadScheduledExecutor();
 
 
     public TaskManager(EventBus eventBus) {
         this.eventBus = eventBus;
-        cleaner.scheduleAtFixedRate(this::cleanupTasks, 1, 3, TimeUnit.MINUTES);
+        cleaner.scheduleAtFixedRate(this::cleanupTasks, 1, expireMinutes, TimeUnit.MINUTES);
     }
 
     public EventBus getEventBus() {
         return eventBus;
     }
 
-    public String submit(TaskType type, List<Object> objectIds, Map<String, Object> params) {
+
+    public String submit(String coll, TaskType type, List<Object> objectIds, Map<String, Object> params) {
         String taskId = UUID.randomUUID().toString();
-        TaskContext context = new TaskContext(taskId, type, objectIds, params);
+        TaskContext context = new TaskContext(taskId, coll, type, objectIds, params);
         tasks.put(taskId, context);
-        eventBus.post(new TaskEvent(taskId, type, objectIds, params));
+        eventBus.post(new TaskEvent(taskId, coll, type, objectIds, params));
         return taskId;
     }
 
-    public boolean cancel(String taskId) {
+    public boolean cancel(String coll, String taskId) {
         TaskContext context = tasks.get(taskId);
         if (context != null) {
             context.cancel();
@@ -54,12 +52,45 @@ public class TaskManager implements ITaskManager{
         return false;
     }
 
-    public TaskContext getTask(String taskId) {
+    @Override
+    public boolean delete(String coll, String taskId) {
+        TaskContext context = tasks.get(taskId);
+        if (context != null && context.isTerminal()) {
+            tasks.remove(taskId);
+            return true;
+        }
+        return false;
+    }
+
+    @Override
+    public Collection<TaskContext> findByStatus(String coll, String status) {
+        TaskStatus taskType = TaskStatus.valueOf(status.toUpperCase());
+        Collection<TaskContext> values = tasks.values();
+        Collection<TaskContext> values2 = new ArrayList<>();
+
+        for (TaskContext context : values) {
+
+            if (context.getCollection().equals(coll) && context.getStatus() == taskType) {
+                values2.add(context);
+            }
+        }
+        return values2;
+    }
+
+    public TaskContext getTask(String coll, String taskId) {
         return tasks.get(taskId);
     }
 
-    public Collection<TaskContext> allTasks() {
-        return tasks.values();
+    @Override
+    public Collection<TaskContext> allTasks(String coll) {
+        Collection<TaskContext> values = tasks.values();
+        Collection<TaskContext> values2 = new ArrayList<>();
+        for (TaskContext context : values) {
+            if (context.getCollection().equals(coll)) {
+                values2.add(context);
+            }
+        }
+        return values2;
     }
 
     private void cleanupTasks() {
@@ -83,16 +114,42 @@ public class TaskManager implements ITaskManager{
         }
     }
 
+    //@Override
+    protected int cleanupTasks(String coll) {
+        //Instant now = Instant.now();
+        List<Map.Entry<String, TaskContext>> candidates = tasks.entrySet()
+                .stream()
+                .filter(e -> {
+                    TaskContext c = e.getValue();
+                    return c.getCollection().equals(coll) && c.isTerminal();
+                })
+                //.sorted((a, b) -> b.getValue().getCompletedAt().compareTo(a.getValue().getCompletedAt()))
+                .collect(Collectors.toList());
+
+        int count = candidates.size();
+        for (Map.Entry<String, TaskContext> e : candidates) {
+            tasks.remove(e.getKey());
+        }
+        // 保留最近 keepLastCount 个任务
+//        if (candidates.size() > keepLastCount) {
+//            List<Map.Entry<String, TaskContext>> toRemove = candidates.subList(keepLastCount, candidates.size());
+//            for (Map.Entry<String, TaskContext> e : toRemove) {
+//                tasks.remove(e.getKey());
+//            }
+//        }
+        return count;
+    }
+
     @Override
-    public void updateExtra(String taskId, Map<String, Object> extra){
+    public void updateExtra(String coll, String taskId, Map<String, Object> extra) {
         TaskContext taskContext = tasks.get(taskId);
         //Map<String, Object> extra1 = taskContext.getExtra();
         taskContext.setExtra(extra);
     }
 
 
-    public void cleanupNow() {
-        cleanupTasks(); // 可被接口调用
+    public int cleanupNow(String coll) {
+        return cleanupTasks(coll); // 可被接口调用
     }
 
 }

+ 35 - 35
server/src/main/java/com/giantan/data/tasks/controller/TaskController.java

@@ -12,43 +12,43 @@ import java.util.Map;
 //@RequestMapping("/api/tasks")
 public class TaskController {
 
-    private final TaskManager taskManager;
-
-    public TaskController(TaskManager taskManager) {
-        this.taskManager = taskManager;
-    }
-
-    @GetMapping("/{taskId}")
-    public ResponseEntity<TaskContext> getTask(@PathVariable String taskId) {
-        TaskContext context = taskManager.getTask(taskId);
-        return context != null ? ResponseEntity.ok(context) : ResponseEntity.notFound().build();
-    }
-
-    @PostMapping("/submit")
-    public ResponseEntity<String> submit(@RequestBody TaskRequest request) {
-        String id = taskManager.submit(request.getType(), request.getObjectIds(), request.getParams());
-        return ResponseEntity.ok(id);
-    }
-
-    @PostMapping("/{taskId}/cancel")
-    public ResponseEntity<?> cancel(@PathVariable String taskId) {
-        boolean ok = taskManager.cancel(taskId);
-        return ok ? ResponseEntity.ok().build() : ResponseEntity.notFound().build();
-    }
-
-    @PatchMapping("/{taskId}/extra")
-    public ResponseEntity<?> updateExtra(
-            @PathVariable String taskId,
-            @RequestBody Map<String, Object> extra
-    ) {
-//        TaskStatusHistory current = repository.findCurrentByTaskId(taskId);
-//        if (current == null) return ResponseEntity.notFound().build();
+//    private final TaskManager taskManager;
+//
+//    public TaskController(TaskManager taskManager) {
+//        this.taskManager = taskManager;
+//    }
+//
+//    @GetMapping("/{taskId}")
+//    public ResponseEntity<TaskContext> getTask(@PathVariable String taskId) {
+//        TaskContext context = taskManager.getTask(taskId);
+//        return context != null ? ResponseEntity.ok(context) : ResponseEntity.notFound().build();
+//    }
+//
+//    @PostMapping("/submit")
+//    public ResponseEntity<String> submit(@RequestBody TaskRequest request) {
+//        String id = taskManager.submit(request.getType(), request.getObjectIds(), request.getParams());
+//        return ResponseEntity.ok(id);
+//    }
+//
+//    @PostMapping("/{taskId}/cancel")
+//    public ResponseEntity<?> cancel(@PathVariable String taskId) {
+//        boolean ok = taskManager.cancel(taskId);
+//        return ok ? ResponseEntity.ok().build() : ResponseEntity.notFound().build();
+//    }
 //
-//        repository.updateExtra(taskId, extra);
+//    @PatchMapping("/{taskId}/extra")
+//    public ResponseEntity<?> updateExtra(
+//            @PathVariable String taskId,
+//            @RequestBody Map<String, Object> extra
+//    ) {
+////        TaskStatusHistory current = repository.findCurrentByTaskId(taskId);
+////        if (current == null) return ResponseEntity.notFound().build();
+////
+////        repository.updateExtra(taskId, extra);
+////        return ResponseEntity.ok().build();
+//        taskManager.updateExtra(taskId,extra);
 //        return ResponseEntity.ok().build();
-        taskManager.updateExtra(taskId,extra);
-        return ResponseEntity.ok().build();
-    }
+//    }
 }
 
 

+ 1 - 0
server/src/main/java/com/giantan/data/tasks/repository/DynamicTaskRepository.java

@@ -23,6 +23,7 @@ public class DynamicTaskRepository {
                     id BIGSERIAL PRIMARY KEY,
                       task_id VARCHAR(64) NOT NULL,
                       task_type VARCHAR(32) NOT NULL,
+                      collection VARCHAR(64),
                       object_ids TEXT,
                       object_statuses JSONB,
                       params JSONB,

+ 42 - 7
server/src/main/java/com/giantan/data/tasks/repository/PersistentTaskManager.java

@@ -11,40 +11,75 @@ import java.util.Map;
 
 public class PersistentTaskManager implements ITaskManager {
     @Override
-    public String submit(TaskType type, List<Object> objectIds, Map<String, Object> params) {
+    public String submit(String collection, TaskType type, List<Object> objectIds, Map<String, Object> params) {
         return "";
     }
 
     @Override
-    public boolean cancel(String taskId) {
+    public boolean cancel(String coll, String taskId) {
         return false;
     }
 
     @Override
-    public TaskContext getTask(String taskId) {
+    public TaskContext getTask(String coll, String taskId) {
         return null;
     }
 
     @Override
-    public Collection<TaskContext> allTasks() {
+    public Collection<TaskContext> allTasks(String coll) {
         return List.of();
     }
 
     @Override
-    public void updateExtra(String taskId, Map<String, Object> extra) {
+    public void updateExtra(String coll, String taskId, Map<String, Object> extra) {
 
     }
 
     @Override
-    public void cleanupNow() {
-
+    public int cleanupNow(String coll) {
+        return 0;
     }
 
+//    @Override
+//    public boolean cancel(String taskId) {
+//        return false;
+//    }
+//
+//    @Override
+//    public TaskContext getTask(String taskId) {
+//        return null;
+//    }
+//
+//    @Override
+//    public Collection<TaskContext> allTasks() {
+//        return List.of();
+//    }
+//
+//    @Override
+//    public void updateExtra(String taskId, Map<String, Object> extra) {
+//
+//    }
+//
+//    @Override
+//    public void cleanupNow() {
+//
+//    }
+
     @Override
     public EventBus getEventBus() {
         return null;
     }
 
+    @Override
+    public boolean delete(String coll, String id) {
+        return false;
+    }
+
+    @Override
+    public Collection<TaskContext> findByStatus(String coll, String status) {
+        return List.of();
+    }
+
 //    private final TaskStatusHistoryRepository repository;
 //    private final EventBus eventBus;
 //

+ 1 - 0
server/src/main/java/com/giantan/data/tasks/repository/TaskStatusHistory.java

@@ -31,6 +31,7 @@ import java.util.Map;
 public class TaskStatusHistory {
     private Long id;
     private String taskId;
+    private String collection;
     private String taskType;
     private List<String> objectIds;
     private Map<String, TaskObjectStatus> objectStatuses;

+ 16 - 1
server/src/main/resources/application.yml

@@ -23,6 +23,19 @@ spring:
     connection-timeout: 30000
     pool-name: MyHikariPool
 
+  ai:
+    deepseek:
+      api-key: ${DEEPSEEK_API_KEY}
+      base-url: https://ark.cn-beijing.volces.com/api/v3
+      chat:
+        model: deepseek-v3-241226
+
+    openai:
+      api-key: ${OPENAI_API_KEY}
+#      base-url: https://ark.cn-beijing.volces.com/api
+#      chat:
+#        options:
+#          model: gpt-4o
 
 
 local:
@@ -43,4 +56,6 @@ oss:
 task:
   cleanup:
     expire-minutes: 10
-    keep-last: 10
+    keep-last: 10
+
+

+ 16 - 0
server/src/test/java/com/giantan/data/mds/MdsApplicationTests.java

@@ -1,13 +1,29 @@
 package com.giantan.data.mds;
 
+import com.giantan.data.mds.bot.GChatClient;
 import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.boot.test.context.SpringBootTest;
 
 @SpringBootTest
 class MdsApplicationTests {
 
+	@Autowired
+	GChatClient deepseek2;
+
 	@Test
 	void contextLoads() {
+		System.out.println("Hello World");
+		//fetchInfo();
+
+		String s="发烧";
+		long t= System.currentTimeMillis();
+		System.out.println(s);
+		//String ret = deepseek2.askOpenaiForAijiu(s);
+		String ret = deepseek2.ask(s);
+		t = System.currentTimeMillis()-t;
+		System.out.println(ret);
+		System.out.println("used time = "+t);
 	}
 
 }

+ 155 - 12
tools/src/test/java/com/giantan/mds/MdSearcherTest.java

@@ -1,11 +1,15 @@
 package com.giantan.mds;
 
 import com.google.gson.Gson;
+import org.cnnlp.data.document.GDocConstants;
+import org.cnnlp.data.md.MdSearcher;
 import org.cnnlp.data.util.BaseParameters;
 
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
 
@@ -53,18 +57,157 @@ public class MdSearcherTest {
 
     }
 
+    public static int findSubArrayStartIndex(String[] ss1, String[] ss2) {
 
-    public static void main(String[] args) throws IOException {
-        String f = "D:\\data\\乙烯\\target\\四川操规与应急操作卡\\1000万吨年常减压蒸馏装置\\1000万吨年常减压蒸馏装置操作规程.md";
-        String content = Files.readString(Path.of(f));
-        MdSearcher searcher = new MdSearcher();
-        searcher.load(content, BaseParameters.defaultParams());
-        String[] headings = {
-                "[第十章 安全生产及环境保护]()"
-//                "[10.6 本装置主要有害物、介质(易燃易爆、有毒)的有关参数]()",
-//                "[10.6.1 本装置危险化学品]()"
-        };
-        Map<String, Object> map = searcher.searchByHeadings(headings);
-        System.out.println("map="+map);
+        for (int i = 0; i < ss2.length; i++) {
+            int idx1 = 0;
+            int idx2 = i;
+
+            while (idx2 < ss2.length && idx1 < ss1.length) {
+                if (ss2[idx2].isEmpty()) {
+                    idx2++; // 跳过空串
+                    continue;
+                }
+
+                if (!ss2[idx2].equals(ss1[idx1])) {
+                    //continue outer; // 匹配失败,跳出外层循环
+                    break ;
+                }
+
+                idx1++;
+                idx2++;
+            }
+
+            if (idx1 == ss1.length) {
+                return i; // 找到起始点
+            }
+        }
+
+        return -1; // 未找到
     }
+
+
+    public static int[] findSubArrayRange(String[] ss1, String[] ss2) {
+
+        for (int i = 0; i < ss2.length; i++) {
+            int idx1 = 0;
+            int idx2 = i;
+            int start = -1;
+            int end = -1;
+
+            while (idx2 < ss2.length && idx1 < ss1.length) {
+                if (ss2[idx2].isEmpty()) {
+                    idx2++; // 忽略空串
+                    continue;
+                }
+
+                if (!ss2[idx2].equals(ss1[idx1])) {
+                    break; // 匹配失败,从下一个 i 开始
+                }
+
+                if (start == -1) {
+                    start = idx2;
+                }
+                end = idx2;
+                idx1++;
+                idx2++;
+            }
+
+            if (idx1 == ss1.length) {
+                return new int[]{start, end}; // 返回实际在 ss2 中的起止索引
+            }
+        }
+
+        return null; // 没有匹配
+    }
+
+    public void findSubArrayRangeTest(String[] args) {
+        String[] ss1 = {"a", "b"};
+        String[] ss2 = {"a2","", "a", "","b", "c"};
+        int pos = findSubArrayStartIndex(ss1, ss2);
+        System.out.println(pos); // 输出 1
+
+        int[] range = findSubArrayRange(ss1, ss2);
+        System.out.println(Arrays.toString(range));
+    }
+
+//    public static void main(String[] args) throws IOException {
+//        String f = "D:\\data\\乙烯\\target\\四川操规与应急操作卡\\1000万吨年常减压蒸馏装置\\1000万吨年常减压蒸馏装置操作规程.md";
+//        String content = Files.readString(Path.of(f));
+//        MdSearcher searcher = new MdSearcher();
+//        searcher.load(content, BaseParameters.defaultParams());
+//        String[] headings = {
+//                "[第十章 安全生产及环境保护]()"
+////                "[10.6 本装置主要有害物、介质(易燃易爆、有毒)的有关参数]()",
+////                "[10.6.1 本装置危险化学品]()"
+//        };
+//        //Map<String, Object> map = searcher.searchByHeadings(headings);
+//        //System.out.println("map="+map);
+//
+//        String t1 = "10.6 本装置主要有害物、介质(易燃易爆、有毒)的有关参数]()";
+//        Map<String, Object> map2 = searcher.searchByPlainTxt(t1,481363);//481404
+//
+//        System.out.println("map="+map2);
+//
+//        String text = searcher.getText();
+//        System.out.println(text.substring((Integer)map2.get(GDocConstants.START_OFFSET),(Integer)map2.get(GDocConstants.END_OFFSET)));
+//    }
+
+    public static int[] findSubArrayRangeWithLineSplit(String[] ss1, String[] ss2) {
+        // 构造逻辑行列表:每一行来自 ss2[i] 的第 j 段
+        class Line {
+            String text;
+            int ss2Index; // 原始在ss2中的索引
+            public Line(String text, int ss2Index) {
+                this.text = text;
+                this.ss2Index = ss2Index;
+            }
+        }
+
+        List<Line> flatList = new ArrayList<>();
+        for (int i = 0; i < ss2.length; i++) {
+            String s = ss2[i];
+            if (s.isEmpty()) continue;
+
+            String[] lines = s.split("\n");
+            for (String line : lines) {
+                if (!line.isEmpty()) {
+                    flatList.add(new Line(line, i));
+                }
+            }
+        }
+
+        // 尝试在 flatList 中找到 ss1 匹配子序列
+        for (int i = 0; i <= flatList.size() - ss1.length; i++) {
+            boolean matched = true;
+            for (int j = 0; j < ss1.length; j++) {
+                if (!flatList.get(i + j).text.equals(ss1[j])) {
+                    matched = false;
+                    break;
+                }
+            }
+
+            if (matched) {
+                int start = flatList.get(i).ss2Index;
+                int end = flatList.get(i + ss1.length - 1).ss2Index;
+                return new int[]{start, end};
+            }
+        }
+
+        return null; // 未找到
+    }
+
+    public static void main(String[] args) {
+        String[] ss1 = {"aa", "bd","c"};
+        String[] ss2 = {"", "aa\n\nbd\n", "", "c\n"};
+
+        int[] range = findSubArrayRangeWithLineSplit(ss1, ss2);
+        if (range != null) {
+            System.out.println("Start: " + range[0] + ", End: " + range[1]);
+        } else {
+            System.out.println("Not found");
+        }
+    }
+
+
 }