3
0

2 Revīzijas 9354e3a99a ... 4afc0dc8e2

Autors SHA1 Ziņojums Datums
  dwp 4afc0dc8e2 DEFAULT_CHUNK_SIZE = 768 ; //512 3 mēneši atpakaļ
  dwp a05313fae4 增加了 upload 上传文件重名检测 4 mēneši atpakaļ
39 mainītis faili ar 1967 papildinājumiem un 481 dzēšanām
  1. 6 1
      .idea/misc.xml
  2. 6 0
      gtbook/src/main/java/org/cnnlp/data/splitter/ChunkConstants.java
  3. 2 2
      gtbook/src/main/java/org/cnnlp/data/splitter/MdChunking.java
  4. 8 15
      gtbook/src/test/java/org/cnnlp/data/splitter/MdChunkSplitter1.java
  5. 0 1
      pom.xml
  6. 10 8
      server/pom.xml
  7. 258 76
      server/src/main/java/com/giantan/ai/util/dict/GLookups.java
  8. 259 0
      server/src/main/java/com/giantan/ai/util/trie/ConcurrentGTrieNode.java
  9. 37 91
      server/src/main/java/com/giantan/ai/util/trie/GTrieNode.java
  10. 2 2
      server/src/main/java/com/giantan/data/dk/repository/DkIndexer.java
  11. 1 1
      server/src/main/java/com/giantan/data/index/HybridIndexer.java
  12. 67 0
      server/src/main/java/com/giantan/data/index/HybridSearch.java
  13. 5 0
      server/src/main/java/com/giantan/data/index/IHybridSearch.java
  14. 15 0
      server/src/main/java/com/giantan/data/index/IndexConfig.java
  15. 36 4
      server/src/main/java/com/giantan/data/index/MilvusSearchRequestBuilder.java
  16. 3 1
      server/src/main/java/com/giantan/data/kvs/repository/GEntity.java
  17. 1 1
      server/src/main/java/com/giantan/data/mds/MdsApplication.java
  18. 20 5
      server/src/main/java/com/giantan/data/mds/controller/MdDocsController.java
  19. 1 2
      server/src/main/java/com/giantan/data/mds/controller/ProxySearchContoller.java
  20. 13 6
      server/src/main/java/com/giantan/data/mds/controller/TaxonomyController.java
  21. 3 1
      server/src/main/java/com/giantan/data/mds/service/IMdDocsService.java
  22. 4 1
      server/src/main/java/com/giantan/data/mds/service/impl/MdCollectionsService.java
  23. 64 62
      server/src/main/java/com/giantan/data/mds/service/impl/MdDocsService.java
  24. 7 13
      server/src/main/java/com/giantan/data/mds/service/impl/MdTaxonomyService.java
  25. 185 32
      server/src/main/java/com/giantan/data/mds/task/impl/BaseTaskHandler.java
  26. 1 0
      server/src/main/java/com/giantan/data/qa/constant/QaConstants.java
  27. 30 0
      server/src/main/java/com/giantan/data/qa/controller/CollsSearchController.java
  28. 1 2
      server/src/main/java/com/giantan/data/qa/controller/QaProxySearchContoller.java
  29. 63 0
      server/src/main/java/com/giantan/data/qa/model/CollsSearchRequest.java
  30. 14 0
      server/src/main/java/com/giantan/data/qa/model/SearchType.java
  31. 4 0
      server/src/main/java/com/giantan/data/qa/repository/QaDocRepository.java
  32. 44 5
      server/src/main/java/com/giantan/data/qa/repository/QaIndexer.java
  33. 302 0
      server/src/main/java/com/giantan/data/qa/service/CollsSearchService.java
  34. 72 0
      server/src/main/java/com/giantan/data/qa/service/GoeSimService.java
  35. 3 0
      server/src/main/java/com/giantan/data/qa/service/QaDocsService.java
  36. 419 0
      server/src/main/java/com/giantan/data/qa/service/TextSimilarity.java
  37. 1 2
      server/src/test/java/com/giantan/data/mds/MdSearcherTest.java
  38. 0 37
      tools/pom.xml
  39. 0 110
      tools/src/test/java/com/giantan/mds/Main.java

+ 6 - 1
.idea/misc.xml

@@ -7,8 +7,13 @@
         <option value="$PROJECT_DIR$/pom.xml" />
       </list>
     </option>
+    <option name="ignoredFiles">
+      <set>
+        <option value="$PROJECT_DIR$/tools/pom.xml" />
+      </set>
+    </option>
   </component>
-  <component name="ProjectRootManager" version="2" languageLevel="JDK_17" default="true" project-jdk-name="17" project-jdk-type="JavaSDK">
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_21" default="true" project-jdk-name="21" project-jdk-type="JavaSDK">
     <output url="file://$PROJECT_DIR$/out" />
   </component>
 </project>

+ 6 - 0
gtbook/src/main/java/org/cnnlp/data/splitter/ChunkConstants.java

@@ -7,4 +7,10 @@ public class ChunkConstants {
     public static final String HEAD_OVERLAP = "headOverlap";
 
     public static final String CHUNK_TITLES = "chunkTitles";  //默认带上 几级标题
+
+    public static final int DEFAULT_CHUNK_SIZE = 768; // 512
+
+    public static final int DEFAULT_CHUNK_OVERLAP = 100; // 64
+
+    public static final int DEFAULT_HEAD_OVERLAP = 2;
 }

+ 2 - 2
gtbook/src/main/java/org/cnnlp/data/splitter/MdChunking.java

@@ -24,8 +24,8 @@ public class MdChunking implements IMdChunking {
 
     private static final String CHUNK_PARAGRAPH_IDX = GDocConstants.PARAGRAPH_CHUNK_IDX;
 
-    protected int defaultChunkSize = 512;
-    protected int defaultChunkOverlap = 64;
+    protected int defaultChunkSize = ChunkConstants.DEFAULT_CHUNK_SIZE;
+    protected int defaultChunkOverlap = ChunkConstants.DEFAULT_CHUNK_OVERLAP;
     protected int defaultChunkTitles = 0;
 
     protected boolean defaultHeadOverlap = false;

+ 8 - 15
gtbook/src/test/java/org/cnnlp/data/splitter/MdChunkSplitter1.java

@@ -20,8 +20,8 @@ import java.util.Map;
 public class MdChunkSplitter1 extends BaseMdParser implements DocumentTransformer {
 
 
-    protected int defaultChunkSize = 512;
-    protected int defaultChunkOverlap = 64;
+    protected int defaultChunkSize = ChunkConstants.DEFAULT_CHUNK_SIZE;  // 原来512
+    protected int defaultChunkOverlap = ChunkConstants.DEFAULT_CHUNK_OVERLAP;
     protected int defaultChunkTitles = 0;
 
     protected boolean defaultHeadOverlap = false;
@@ -256,17 +256,10 @@ public class MdChunkSplitter1 extends BaseMdParser implements DocumentTransforme
 //        }
 //    }
 
-    public static void main(String[] args) throws IOException {
-        String md = "D:\\testdata\\md\\官网银行卡知识.1.md";
-        MdChunkSplitter1 splitter = new MdChunkSplitter1();
-        BaseParameters params = BaseParameters.defaultParams();
-        splitter.split(Paths.get(md), params);
-
-
-//        Ulid ulid = UlidCreator.getUlid();
-//        System.out.println(ulid.toLowerCase());
-//        ulid = UlidCreator.getUlid();
-//        System.out.println(ulid.toLowerCase());
-
-    }
+//    public static void main(String[] args) throws IOException {
+//        String md = "D:\\testdata\\md\\官网银行卡知识.1.md";
+//        MdChunkSplitter1 splitter = new MdChunkSplitter1();
+//        BaseParameters params = BaseParameters.defaultParams();
+//        splitter.split(Paths.get(md), params);
+//    }
 }

+ 0 - 1
pom.xml

@@ -12,7 +12,6 @@
         <module>gtbook</module>
         <module>server</module>
         <module>gfs</module>
-        <module>tools</module>
     </modules>
 
     <properties>

+ 10 - 8
server/pom.xml

@@ -9,12 +9,12 @@
         <version>1.0.0</version>
     </parent>
 
-    <version>3.0.1</version>
+    <version>3.1.2</version>
     <artifactId>mdserver</artifactId>
 
     <properties>
-        <maven.compiler.source>17</maven.compiler.source>
-        <maven.compiler.target>17</maven.compiler.target>
+        <maven.compiler.source>21</maven.compiler.source>
+        <maven.compiler.target>21</maven.compiler.target>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
         <spring.boot.version>3.5.4</spring.boot.version>
         <spring-ai.version>1.0.1</spring-ai.version>
@@ -99,6 +99,13 @@
             <version>32.1.3-jre</version>
         </dependency>
 
+        <!-- https://mvnrepository.com/artifact/com.google.code.gson/gson -->
+        <dependency>
+            <groupId>com.google.code.gson</groupId>
+            <artifactId>gson</artifactId>
+            <version>2.12.1</version>
+        </dependency>
+
         <!-- https://mvnrepository.com/artifact/com.github.ben-manes.caffeine/caffeine -->
         <dependency>
             <groupId>com.github.ben-manes.caffeine</groupId>
@@ -134,11 +141,6 @@
 
         </dependency>
 
-        <dependency>
-            <groupId>com.giantan.mds</groupId>
-            <artifactId>tools</artifactId>
-            <version>1.0.0</version>
-        </dependency>
 
         <!--		<dependency>-->
         <!--			<groupId>com.giantan.ai.domain</groupId>-->

+ 258 - 76
server/src/main/java/com/giantan/ai/util/dict/GLookups.java

@@ -6,8 +6,10 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
 
-//import static com.giantan.ai.domain.entry.constant.EntryConstants.*;
 
 public class GLookups {
     public static final int NO_VALUE = -1;
@@ -16,10 +18,14 @@ public class GLookups {
     public static final String FIELD_LENGTH = "length";
     public static final String FIELD_VALUES = "values";
 
+    //    ITrieTokenizer tokenizer;
+    //    List<IItem> words;
+    private volatile LookupState state;
 
-    ITrieTokenizer tokenizer;
-    List<IItem> words;
     ISegementer segementer;
+    private final ReadWriteLock rwLock = new ReentrantReadWriteLock();
+    private final Lock readLock = rwLock.readLock();
+    private final Lock writeLock = rwLock.writeLock();
 
     public GLookups() {
         segementer = new Segmenter();
@@ -27,14 +33,44 @@ public class GLookups {
     }
 
     private void init() {
-        tokenizer = new IntTrieTokenizer();
-        tokenizer.setSegmenter(segementer);
-        words = new ArrayList<IItem>();
+        writeLock.lock();
+        try {
+            ITrieTokenizer tokenizer = new IntTrieTokenizer();
+            tokenizer.setSegmenter(segementer);
+            List<IItem> words = new ArrayList<IItem>();
+            LookupState state1 = new LookupState(tokenizer, words);
+            this.state = state1;
+        } finally {
+            writeLock.unlock();
+        }
+    }
+
+    public void setState(LookupState state) {
+        writeLock.lock();
+        try {
+            this.state = state;
+        } finally {
+            writeLock.unlock();
+        }
     }
 
     public void setSegementer(ISegementer segementer) {
-        this.segementer = segementer;
-        tokenizer.setSegmenter(segementer);
+        writeLock.lock();
+        try {
+            this.segementer = segementer;
+            //tokenizer.setSegmenter(segementer);
+        } finally {
+            writeLock.unlock();
+        }
+    }
+
+    public ISegementer getSegementer() {
+        readLock.lock();
+        try {
+            return segementer;
+        } finally {
+            readLock.unlock();
+        }
     }
 
     public void clear() {
@@ -42,123 +78,246 @@ public class GLookups {
     }
 
     public List<Triple> tagging(String query) {
-        List<Triple> triples = tokenizer.chunkTokenize(query);
-        return triples;
+        readLock.lock();
+        try {
+            LookupState current = state;
+            List<Triple> triples = current.getTokenizer().chunkTokenize(query);
+            return triples;
+        } finally {
+            readLock.unlock();
+        }
     }
 
 
-    private Map<String,Object> getValue(Triple triple){
+//    private Map<String, Object> getValue(Triple triple) {
+//        LookupState current = state;
+//        int value = triple.getValue();
+//        if (value == NO_VALUE) return null;
+//        IItem item = current.getWords().get(value);
+//        if (item == null) return null;
+//        Map<String, Object> r = new HashMap<String, Object>();
+//        r.put(FIELD_OFFSET, triple.getIndex());
+//        r.put(FIELD_LENGTH, triple.getLen());
+//        r.put(FIELD_VALUES, item.getValues());
+//        return r;
+//    }
+
+    private Map<String, Object> getValue(LookupState current, Triple triple) {
         int value = triple.getValue();
         if (value == NO_VALUE) return null;
+        List<IItem> words = current.getWords();
+        if (value < 0 || value >= words.size()) return null; // 防御
         IItem item = words.get(value);
         if (item == null) return null;
-        Map<String,Object> r = new HashMap<String,Object>();
-        r.put(FIELD_OFFSET,triple.getIndex());
-        r.put(FIELD_LENGTH,triple.getLen());
-        r.put(FIELD_VALUES, item.getValues());
+
+        Map<String, Object> r = new HashMap<>();
+        r.put(FIELD_OFFSET, triple.getIndex());
+        r.put(FIELD_LENGTH, triple.getLen());
+        r.put(FIELD_VALUES, item.getValues()); // 注意:这里最好返回 copy(见下)
         return r;
     }
 
-    public List<Map<String,Object>> taggingAndGetValues(String query) {
-        List<Triple> triples = tokenizer.chunkTokenize(query);
-        if (triples!= null && triples.size()>0) {
-            List<Map<String,Object>> rets = new ArrayList<>();
-            for (Triple triple : triples) {
-                Map<String, Object> value = getValue(triple);
-                if (value != null) {
-                    rets.add(value);
+    public List<Map<String, Object>> taggingAndGetValues(String query) {
+        readLock.lock();
+        try {
+            LookupState current = state;
+            List<Triple> triples = current.getTokenizer().chunkTokenize(query);
+            if (triples != null && triples.size() > 0) {
+                List<Map<String, Object>> rets = new ArrayList<>();
+                for (Triple triple : triples) {
+                    Map<String, Object> value = getValue(current, triple);
+                    if (value != null) {
+                        rets.add(value);
+                    }
                 }
+                return rets;
             }
-            return rets;
+            return List.of();
+        } finally {
+            readLock.unlock();
         }
-        return List.of();
     }
 
-    public synchronized int addLabel(String label, int value) {
-        int v1 = tokenizer.getValue(label);
+//    public int addLabel(String label, int value) {
+//        writeLock.lock();
+//        try {
+//            LookupState current = state;
+//            int v1 = current.getTokenizer().getValue(label);
+//            if (v1 <= NO_VALUE) {
+//                int idx = current.getWords().size();
+//                current.getTokenizer().addWord(label, idx);
+//                current.getWords().add(buildItem(label, idx, value));
+//                return idx;
+//            } else {
+//                IItem item = current.getWords().get(v1);
+//                if (item != null) {
+//                    item.addValue(value);
+//                    return item.getId();
+//                } else {
+//                    item = buildItem(label, v1, value);
+//                    current.getWords().set(v1, item);
+//                    return item.getId();
+//                }
+//            }
+//        } finally {
+//            writeLock.unlock();
+//        }
+//    }
+//
+//    public void addLabel(String label, List<String> altLabels, int value) {
+//        writeLock.lock();
+//        try {
+//            if (label != null) {
+//                addLabel(label, value);
+//            }
+//            for (String altLabel : altLabels) {
+//                addLabel(altLabel, value);
+//            }
+//        } finally {
+//            writeLock.unlock();
+//        }
+//    }
+
+    private int addLabelUnderWriteLock(LookupState current, String label, int value) {
+        int v1 = current.getTokenizer().getValue(label);
         if (v1 <= NO_VALUE) {
-            int idx = words.size();
-            tokenizer.addWord(label, idx);
-            words.add(buildItem(label, idx, value));
+            int idx = current.getWords().size();
+            current.getTokenizer().addWord(label, idx);
+            current.getWords().add(buildItem(label, idx, value));
             return idx;
         } else {
-            IItem item = words.get(v1);
+            IItem item = current.getWords().get(v1);
             if (item != null) {
                 item.addValue(value);
                 return item.getId();
             } else {
                 item = buildItem(label, v1, value);
-                words.set(v1, item);
+                current.getWords().set(v1, item);
                 return item.getId();
             }
         }
     }
 
-    public void addLabel(String label, List<String> altLabels, int value) {
-        if (label != null){
-            addLabel(label, value);
+    public int addLabel(String label, int value) {
+        int r = -1;
+        writeLock.lock();
+        try {
+            LookupState current = state;
+            if (label != null) {
+                r = addLabelUnderWriteLock(current, label, value);
+            }
+        } finally {
+            writeLock.unlock();
         }
-        for (String altLabel : altLabels) {
-            addLabel(altLabel, value);
+        return r;
+    }
+
+    public void addLabel(String label, List<String> altLabels, int value) {
+        writeLock.lock();
+        try {
+            LookupState current = state;
+            if (label != null) addLabelUnderWriteLock(current, label, value);
+            if (altLabels != null) {
+                for (String alt : altLabels) {
+                    if (alt != null) addLabelUnderWriteLock(current, alt, value);
+                }
+            }
+        } finally {
+            writeLock.unlock();
         }
     }
 
-    public synchronized void removeLabel(String label) {
-        int v1 = tokenizer.getValue(label);
-        if (v1 > NO_VALUE) {
-            IItem item = words.get(v1);
-            if (item != null) {
-                words.set(v1, null);
+
+    public void removeLabel(String label) {
+        writeLock.lock();
+        try {
+            LookupState current = state;
+            int v1 = current.getTokenizer().getValue(label);
+            if (v1 > NO_VALUE) {
+                IItem item = current.getWords().get(v1);
+                if (item != null) {
+                    current.getWords().set(v1, null);
+                }
+                current.getTokenizer().removeWord(label);
             }
-            tokenizer.removeWord(label);
+        } finally {
+            writeLock.unlock();
         }
     }
 
-    public synchronized void removeLabel(String label, int value) {
-        int v1 = tokenizer.getValue(label);
-        if (v1 > NO_VALUE) {
-            IItem item = words.get(v1);
-            if (item != null) {
-                boolean b = item.deleteValue(value);
-                if (b) {
-                    if (item.isEmpty()) {
-                        words.set(v1, null);
-                        tokenizer.removeWord(label);
+    public void removeLabel(String label, int value) {
+        writeLock.lock();
+        try {
+            LookupState current = state;
+            int v1 = current.getTokenizer().getValue(label);
+            if (v1 > NO_VALUE) {
+                IItem item = current.getWords().get(v1);
+                if (item != null) {
+                    boolean b = item.deleteValue(value);
+                    if (b) {
+                        if (item.isEmpty()) {
+                            current.getWords().set(v1, null);
+                            current.getTokenizer().removeWord(label);
+                        }
                     }
                 }
-            }
 
+            }
+        } finally {
+            writeLock.unlock();
         }
     }
 
     public List<String> tokenize(String text) {
-        return tokenizer.tokenize(text);
+        readLock.lock();
+        try {
+            LookupState current = state;
+            return current.getTokenizer().tokenize(text);
+        } finally {
+            readLock.unlock();
+        }
     }
 
     public IItem getItem(String label) {
-        IItem item = null;
-        int v1 = tokenizer.getValue(label);
-        if (v1 > NO_VALUE) {
-            item = words.get(v1);
+        readLock.lock();
+        try {
+            LookupState current = state;
+            IItem item = null;
+            int v1 = current.getTokenizer().getValue(label);
+            if (v1 > NO_VALUE) {
+                item = current.getWords().get(v1);
+            }
+            return item;
+        } finally {
+            readLock.unlock();
         }
-        return item;
     }
 
     public int[] getValue(String label) {
-        int[] value = null;
-        IItem item = getItem(label);
-        if (item != null) {
-            value = item.getValues();
+        readLock.lock();
+        try {
+            int[] value = null;
+            IItem item = getItem(label);
+            if (item != null) {
+                value = item.getValues();
+            }
+            return value;
+        } finally {
+            readLock.unlock();
         }
-        return value;
     }
 
     public int getId(String label) {
-        IItem item = getItem(label);
-        if (item != null) {
-            return item.getId();
+        readLock.lock();
+        try {
+            IItem item = getItem(label);
+            if (item != null) {
+                return item.getId();
+            }
+            return NO_VALUE;
+        } finally {
+            readLock.unlock();
         }
-        return NO_VALUE;
     }
 
 //    public List<String> getAllEntry() {
@@ -172,13 +331,37 @@ public class GLookups {
 //    }
 
     public List<GItem> getAllEntry() {
-        List<GItem> ls = new ArrayList<>();
-        for (IItem item : words) {
-            if (item != null) {
-                ls.add((GItem) item);
+        readLock.lock();
+        try {
+            LookupState current = state;
+            List<GItem> ls = new ArrayList<>();
+            for (IItem item : current.getWords()) {
+                if (item != null) {
+                    ls.add((GItem) item);
+                }
             }
+            return ls;
+        } finally {
+            readLock.unlock();
+        }
+    }
+
+    public static final class LookupState {
+        private final ITrieTokenizer tokenizer;
+        private final List<IItem> words;
+
+        private LookupState(ITrieTokenizer tokenizer, List<IItem> words) {
+            this.tokenizer = tokenizer;
+            this.words = words;
+        }
+
+        public ITrieTokenizer getTokenizer() {
+            return tokenizer;
+        }
+
+        public List<IItem> getWords() {
+            return words;
         }
-        return ls;
     }
 
 
@@ -194,4 +377,3 @@ public class GLookups {
 
 
 }
-

+ 259 - 0
server/src/main/java/com/giantan/ai/util/trie/ConcurrentGTrieNode.java

@@ -0,0 +1,259 @@
+package com.giantan.ai.util.trie;
+
+import cnnlp.util.Int2ArrayList;
+import cnnlp.util.IntsToIntMap;
+import gnu.trove.TIntArrayList;
+
+import java.io.Externalizable;
+import java.io.IOException;
+import java.io.ObjectInput;
+import java.io.ObjectOutput;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.locks.StampedLock;
+
+/*
+使用 StampedLock 提高读性能
+读操作优先用 tryOptimisticRead(),大多数情况下无锁
+写操作使用 writeLock(),确保数据一致性
+
+锁映射基于 hashCode(),确保同一对象始终获取相同的锁
+使用 LOCK_SIZE = 2^N 避免热点锁
+提高读性能,读多写少场景下无锁
+ */
+public class ConcurrentGTrieNode implements Externalizable, Cloneable {
+
+    int value = IntsToIntMap.NULL_VALUE;
+    int[] prefix;
+    IntsToIntMap iimap = new IntsToIntMap(4);
+
+    //transient final ReadWriteLock lock = new ReentrantReadWriteLock();
+    private static final int LOCK_SIZE = 8;  // 共享8个锁
+    private static final StampedLock[] LOCKS = new StampedLock[LOCK_SIZE];
+
+    static {
+        for (int i = 0; i < LOCK_SIZE; i++) {
+            LOCKS[i] = new StampedLock();
+        }
+    }
+
+    public ConcurrentGTrieNode() {
+    }
+
+    public ConcurrentGTrieNode(int[] prefix) {
+        this.prefix = prefix;
+    }
+
+//    public GTrieNode(int value, int[] prefix) {
+//        this.value = value;
+//        this.prefix = prefix;
+//    }
+
+    private StampedLock getLock() {
+        int index = (System.identityHashCode(this) & (LOCK_SIZE - 1)); // 保证同一对象总是取相同的锁
+        return LOCKS[index];
+    }
+
+    public int getValue() {
+        return value;
+    }
+
+    public void setValue(int value) {
+        this.value = value;
+        iimap.setData(value);
+    }
+
+    public int[] getPrefix() {
+        return prefix;
+    }
+
+    public void setPrefix(int[] prefix) {
+        this.prefix = prefix;
+    }
+
+    public int size() {
+        int size = iimap.size();
+//        if (getValue() != IntsToIntMap.NULL_VALUE) {
+        if (iimap.getData() != IntsToIntMap.NULL_VALUE) {
+            size++;
+        }
+        return size;
+    }
+
+    /////////////
+
+    private void addWord(int[] ss, int value) {
+        iimap.add(ss, value);
+        StampedLock lock = getLock();
+        long stamp = lock.writeLock();
+        try {
+            iimap.add(ss, value);
+        } finally {
+            lock.unlockWrite(stamp);
+        }
+    }
+
+    private void removeWord(int[] ss) {
+        StampedLock lock = getLock();
+        long stamp = lock.writeLock();
+        try {
+            iimap.remove(ss);
+        } finally {
+            lock.unlockWrite(stamp);
+        }
+    }
+
+    private Int2ArrayList getAllMatches(int[] ss, int offset) {
+        StampedLock lock = getLock();
+        long stamp = lock.tryOptimisticRead();
+        Int2ArrayList matches = iimap.getAllMatches(ss, offset);
+        if (!lock.validate(stamp)) {
+            stamp = lock.readLock();
+            try {
+                matches = iimap.getAllMatches(ss, offset);
+            } finally {
+                lock.unlockRead(stamp);
+            }
+        }
+        return matches;
+    }
+
+    private int[] get(int[] keys, int offset, int[] rets) {
+        StampedLock lock = getLock();
+        long stamp = lock.tryOptimisticRead();
+        iimap.get(keys, offset, rets);
+        if (!lock.validate(stamp)) {
+            stamp = lock.readLock();
+            try {
+                iimap.get(keys, offset, rets);
+            } finally {
+                lock.unlockRead(stamp);
+            }
+        }
+        return rets;
+    }
+
+    private int get(int[] keys) {
+        StampedLock lock = getLock();
+        long stamp = lock.tryOptimisticRead();
+        int value = iimap.get(keys);
+        if (!lock.validate(stamp)) {
+            stamp = lock.readLock();
+            try {
+                value = iimap.get(keys);
+            } finally {
+                lock.unlockRead(stamp);
+            }
+        }
+        return value;
+    }
+    /////////////////
+
+    //rets[0]= value;rets[1]=匹配上的长度
+    //2025.3.17 iimap.getAllMatches() 和 iimap.get() 是一样的
+    public Int2ArrayList getMaxMatched(int[] ss, int offset) {
+        if (offset >= ss.length) {
+            return null;
+        }
+
+        //Int2ArrayList matches = iimap.getAllMatches(ss, offset);
+        Int2ArrayList matches = getAllMatches(ss, offset);
+
+        if (matches == null) {
+            matches = new Int2ArrayList();
+            if (getValue() != IntsToIntMap.NULL_VALUE) {
+                matches.add(getValue(), prefix.length);
+            }
+        } else {
+            for (int i = 0; i < matches.size(); i++) {
+                matches.setV2(i, matches.getV2(i) + prefix.length);
+            }
+        }
+        return matches;
+    }
+
+    // offset 已经不用考虑前缀
+    public int[] getValue(int[] ss, int offset) {
+        int[] rets = new int[2];
+        if (offset >= ss.length) {
+            int v1 = getValue();
+            if (v1 != IntsToIntMap.NULL_VALUE) {
+                rets[0] = v1;
+                rets[1] = prefix.length;
+            }
+        } else {
+            //iimap.get(ss, offset, rets);
+            get(ss, offset, rets);
+            if (rets[0] == -1) {
+                int v1 = getValue();
+                if (v1 != IntsToIntMap.NULL_VALUE) {
+                    rets[0] = v1;
+                    rets[1] = prefix.length;
+                }
+            } else {
+                rets[1] = rets[1] + prefix.length;
+            }
+        }
+        return rets;
+    }
+
+    public boolean isEmpty() {
+        if (iimap.size() <= 0 && getValue() == IntsToIntMap.NULL_VALUE) {
+            return true;
+        }
+        return false;
+    }
+
+    ///////////////
+
+    // ss已经去掉了前缀
+    public void put(int[] ss, int value) {
+        //iimap.add(ss, value);
+        addWord(ss, value);
+    }
+
+    // ss已经去掉了前缀
+    public void remove(int[] ss) {
+        removeWord(ss);
+    }
+
+    ////////////////
+
+    // ss 没有去掉前缀
+    public int getValue(int[] ss) {
+        int ret = IntsToIntMap.NULL_VALUE;
+        //if (ss.length <= GTrie.PREFIX_LEN) {
+        if (ss.length <= prefix.length) {
+            ret = getValue();
+        } else {
+            int[] ss1 = Arrays.copyOfRange(ss, prefix.length, ss.length);
+            //ret = iimap.get(ss1);
+            ret = get(ss1);
+        }
+        return ret;
+    }
+
+    public List<TIntArrayList> startsWith(int[] ss) {
+        List<TIntArrayList> rets = iimap.startsWith(Arrays.copyOfRange(ss, prefix.length, ss.length));
+        if(rets != null && !rets.isEmpty()) {
+            for (TIntArrayList ret : rets) {
+                ret.insert(0, prefix);
+            }
+        }
+        return rets;
+    }
+
+    @Override
+    public void writeExternal(ObjectOutput out) throws IOException {
+        out.writeInt(value);
+        out.writeObject(prefix);
+        iimap.writeExternal(out);
+    }
+
+    @Override
+    public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
+        value = in.readInt();
+        prefix = (int[]) in.readObject();
+        iimap.readExternal(in);
+    }
+}

+ 37 - 91
server/src/main/java/com/giantan/ai/util/trie/GTrieNode.java

@@ -2,6 +2,7 @@ package com.giantan.ai.util.trie;
 
 //import com.giantan.ai.entry.Int2ArrayList;
 //import com.giantan.ai.entry.IntsToIntMap;
+
 import cnnlp.util.Int2ArrayList;
 import cnnlp.util.IntsToIntMap;
 import gnu.trove.TIntArrayList;
@@ -23,22 +24,15 @@ import java.util.concurrent.locks.StampedLock;
 使用 LOCK_SIZE = 2^N 避免热点锁
 提高读性能,读多写少场景下无锁
  */
+
+// 同步版本放在 ConcurrentGTrieNode.java
+// 这个用于在上层object中做同步
 public class GTrieNode implements Externalizable, Cloneable {
 
     int value = IntsToIntMap.NULL_VALUE;
     int[] prefix;
     IntsToIntMap iimap = new IntsToIntMap(4);
 
-    //transient final ReadWriteLock lock = new ReentrantReadWriteLock();
-    private static final int LOCK_SIZE = 8;  // 共享8个锁
-    private static final StampedLock[] LOCKS = new StampedLock[LOCK_SIZE];
-
-    static {
-        for (int i = 0; i < LOCK_SIZE; i++) {
-            LOCKS[i] = new StampedLock();
-        }
-    }
-
     public GTrieNode() {
     }
 
@@ -51,10 +45,6 @@ public class GTrieNode implements Externalizable, Cloneable {
 //        this.prefix = prefix;
 //    }
 
-    private StampedLock getLock() {
-        int index = (System.identityHashCode(this) & (LOCK_SIZE - 1)); // 保证同一对象总是取相同的锁
-        return LOCKS[index];
-    }
 
     public int getValue() {
         return value;
@@ -75,80 +65,35 @@ public class GTrieNode implements Externalizable, Cloneable {
 
     public int size() {
         int size = iimap.size();
-//        if (getValue() != IntsToIntMap.NULL_VALUE) {
         if (iimap.getData() != IntsToIntMap.NULL_VALUE) {
             size++;
         }
         return size;
     }
 
-    /////////////
-
-    private void addWord(int[] ss, int value) {
-        iimap.add(ss, value);
-        StampedLock lock = getLock();
-        long stamp = lock.writeLock();
-        try {
-            iimap.add(ss, value);
-        } finally {
-            lock.unlockWrite(stamp);
-        }
-    }
-
-    private void removeWord(int[] ss) {
-        StampedLock lock = getLock();
-        long stamp = lock.writeLock();
-        try {
-            iimap.remove(ss);
-        } finally {
-            lock.unlockWrite(stamp);
-        }
-    }
-
-    private Int2ArrayList getAllMatches(int[] ss, int offset) {
-        StampedLock lock = getLock();
-        long stamp = lock.tryOptimisticRead();
-        Int2ArrayList matches = iimap.getAllMatches(ss, offset);
-        if (!lock.validate(stamp)) {
-            stamp = lock.readLock();
-            try {
-                matches = iimap.getAllMatches(ss, offset);
-            } finally {
-                lock.unlockRead(stamp);
-            }
-        }
-        return matches;
-    }
-
-    private int[] get(int[] keys, int offset, int[] rets) {
-        StampedLock lock = getLock();
-        long stamp = lock.tryOptimisticRead();
-        iimap.get(keys, offset, rets);
-        if (!lock.validate(stamp)) {
-            stamp = lock.readLock();
-            try {
-                iimap.get(keys, offset, rets);
-            } finally {
-                lock.unlockRead(stamp);
-            }
-        }
-        return rets;
-    }
+//    private void addWord(int[] ss, int value) {
+//        iimap.add(ss, value);
+//    }
+//
+//    private void removeWord(int[] ss) {
+//        iimap.remove(ss);
+//    }
+//
+//    private Int2ArrayList getAllMatches(int[] ss, int offset) {
+//        Int2ArrayList matches = iimap.getAllMatches(ss, offset);
+//        return matches;
+//    }
+//
+//    private int[] get(int[] keys, int offset, int[] rets) {
+//        iimap.get(keys, offset, rets);
+//        return rets;
+//    }
+//
+//    private int get(int[] keys) {
+//        int value = iimap.get(keys);
+//        return value;
+//    }
 
-    private int get(int[] keys) {
-        StampedLock lock = getLock();
-        long stamp = lock.tryOptimisticRead();
-        int value = iimap.get(keys);
-        if (!lock.validate(stamp)) {
-            stamp = lock.readLock();
-            try {
-                value = iimap.get(keys);
-            } finally {
-                lock.unlockRead(stamp);
-            }
-        }
-        return value;
-    }
     /////////////////
 
     //rets[0]= value;rets[1]=匹配上的长度
@@ -158,8 +103,8 @@ public class GTrieNode implements Externalizable, Cloneable {
             return null;
         }
 
-        //Int2ArrayList matches = iimap.getAllMatches(ss, offset);
-        Int2ArrayList matches = getAllMatches(ss, offset);
+        Int2ArrayList matches = iimap.getAllMatches(ss, offset);
+        //Int2ArrayList matches = getAllMatches(ss, offset);
 
         if (matches == null) {
             matches = new Int2ArrayList();
@@ -184,8 +129,8 @@ public class GTrieNode implements Externalizable, Cloneable {
                 rets[1] = prefix.length;
             }
         } else {
-            //iimap.get(ss, offset, rets);
-            get(ss, offset, rets);
+            iimap.get(ss, offset, rets);
+            //get(ss, offset, rets);
             if (rets[0] == -1) {
                 int v1 = getValue();
                 if (v1 != IntsToIntMap.NULL_VALUE) {
@@ -210,13 +155,14 @@ public class GTrieNode implements Externalizable, Cloneable {
 
     // ss已经去掉了前缀
     public void put(int[] ss, int value) {
-        //iimap.add(ss, value);
-        addWord(ss, value);
+        iimap.add(ss, value);
+        //addWord(ss, value);
     }
 
     // ss已经去掉了前缀
     public void remove(int[] ss) {
-        removeWord(ss);
+        //removeWord(ss);
+        iimap.remove(ss);
     }
 
     ////////////////
@@ -229,15 +175,15 @@ public class GTrieNode implements Externalizable, Cloneable {
             ret = getValue();
         } else {
             int[] ss1 = Arrays.copyOfRange(ss, prefix.length, ss.length);
-            //ret = iimap.get(ss1);
-            ret = get(ss1);
+            ret = iimap.get(ss1);
+            //ret = get(ss1);
         }
         return ret;
     }
 
     public List<TIntArrayList> startsWith(int[] ss) {
         List<TIntArrayList> rets = iimap.startsWith(Arrays.copyOfRange(ss, prefix.length, ss.length));
-        if(rets != null && !rets.isEmpty()) {
+        if (rets != null && !rets.isEmpty()) {
             for (TIntArrayList ret : rets) {
                 ret.insert(0, prefix);
             }

+ 2 - 2
server/src/main/java/com/giantan/data/dk/repository/DkIndexer.java

@@ -398,7 +398,7 @@ public class DkIndexer extends HybridIndexer implements IDkIndexer {
         String gid = entity.getGid();
         int i = 0;
         try {
-            i = hybridSearch.deleteDocumentsByIdFilter(getMappedIndexName(collection), gid);
+            i = hybridSearch.deleteDocumentsByIdLike(getMappedIndexName(collection), gid);
         } catch (Exception e) {
             throw new RuntimeException(e);
         }
@@ -412,7 +412,7 @@ public class DkIndexer extends HybridIndexer implements IDkIndexer {
         try {
             for (GTool e : entities) {
                 String gid = e.getGid();
-                count = count + hybridSearch.deleteDocumentsByIdFilter(getMappedIndexName(collection), gid);
+                count = count + hybridSearch.deleteDocumentsByIdLike(getMappedIndexName(collection), gid);
             }
         } catch (Exception e) {
             throw new RuntimeException(e);

+ 1 - 1
server/src/main/java/com/giantan/data/index/HybridIndexer.java

@@ -8,7 +8,7 @@ public class HybridIndexer {
     public static final String COLL_ID = "__cid";
     public static final String DOC_ID = "__did";
 
-
+    public static final String TABLE_ID = "tableId";
 
     protected String defaultIndexPrefix = "qas";
     protected String defaultChunkMode = "single";

+ 67 - 0
server/src/main/java/com/giantan/data/index/HybridSearch.java

@@ -14,6 +14,7 @@ import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
@@ -208,6 +209,11 @@ public class HybridSearch implements IHybridSearch {
         return 0;
     }
 
+    @Override
+    public int deleteDocumentsByIdLike(String coll, String filter) throws IOException, InterruptedException {
+        return deleteDocumentsByIdFilter(coll, filter + "%");
+    }
+
     @Override
     public int deleteDocumentsByFilter(String coll, Map<String, Object> query) throws IOException, InterruptedException {
         String js = JsonUtils.toJson(query);
@@ -281,6 +287,35 @@ public class HybridSearch implements IHybridSearch {
 
     @Override
     public List<DocSearchResp> hybridSearch(String coll, Map<String, Object> query) throws IOException, InterruptedException {
+//        Map<String, Object> query2 = MilvusSearchRequestBuilder.from(query);
+//        String body = JsonUtils.toJsonString(query2);
+//        HttpRequest request = HttpRequest.newBuilder()
+//                .uri(URI.create(url + coll + "/documents/hybridSearch"))
+//                .header("Content-Type", "application/json")
+//                .header("User-Agent", clientInfo)
+//                .method("POST", HttpRequest.BodyPublishers.ofString(body))
+//                .build();
+//        HttpResponse<String> response = HttpClient.newHttpClient().send(request, HttpResponse.BodyHandlers.ofString());
+//        Map<String, Object> ret = JsonUtils.fromJsonString(response.body());
+//        Object o = ret.get("data");
+//        List<DocSearchResp> rets = new ArrayList<>();
+//        if (o != null && o instanceof List ls) {
+//            for (Object r1 : ls) {
+//                if (r1 != null && r1 instanceof Map m) {
+//                    rets.add(DocSearchResp.fromMap(m));
+//                }
+//            }
+//        }
+//        return rets;
+        return hybridSearch(coll, query, true);
+    }
+
+    @Override
+    public List<DocSearchResp> hybridSearch(String coll, Map<String, Object> query, boolean isIncludeColl) throws IOException, InterruptedException {
+        if (isIncludeColl) {
+            Object tn = query.remove(HybridIndexer.TABLE_ID);
+            query.put(HybridIndexer.COLL_ID, tn);
+        }
         Map<String, Object> query2 = MilvusSearchRequestBuilder.from(query);
         String body = JsonUtils.toJsonString(query2);
         HttpRequest request = HttpRequest.newBuilder()
@@ -303,6 +338,38 @@ public class HybridSearch implements IHybridSearch {
         return rets;
     }
 
+    @Override
+    public Map<String, List<DocSearchResp>> federatedSearch(Map<String, Object> query) throws IOException, InterruptedException {
+        String body = JsonUtils.toJsonString(query);
+        HttpRequest request = HttpRequest.newBuilder()
+                .uri(URI.create(url + "_search"))
+                .header("Content-Type", "application/json")
+                .header("User-Agent", clientInfo)
+                .method("POST", HttpRequest.BodyPublishers.ofString(body))
+                .build();
+        HttpResponse<String> response = HttpClient.newHttpClient().send(request, HttpResponse.BodyHandlers.ofString());
+        Map<String, Object> ret = JsonUtils.fromJsonString(response.body());
+        Object o = ret.get("data");
+        Map<String, List<DocSearchResp>> rs = new HashMap<>();
+
+        if (o != null && o instanceof Map) {
+            Map<String, Object> m = (Map<String, Object>) o;
+            //System.out.println(m);
+            m.forEach((k, v) -> {
+                if (v != null && v instanceof List ls) {
+                    List<DocSearchResp> ls2 = new ArrayList<>();
+                    for (Object r1 : ls) {
+                        if (r1 != null && r1 instanceof Map m1) {
+                            ls2.add(DocSearchResp.fromMap(m1));
+                        }
+                    }
+                    rs.put(k, ls2);
+                }
+            });
+        }
+        return rs;
+    }
+
     @Override
     public int createCollection(String coll) throws IOException, InterruptedException {
         HttpRequest request = HttpRequest.newBuilder()

+ 5 - 0
server/src/main/java/com/giantan/data/index/IHybridSearch.java

@@ -26,6 +26,7 @@ public interface IHybridSearch {
 
     // 可以是  doc_id 也可以是 "doc_id%",例如 01k2zv5r5f75cz9k305wb8d489%
     int deleteDocumentsByIdFilter(String coll, String filter) throws IOException, InterruptedException;
+    int deleteDocumentsByIdLike(String coll, String filter) throws IOException, InterruptedException;
 
     int deleteDocumentsByFilter(String coll, Map<String, Object> query) throws IOException, InterruptedException;
 
@@ -35,6 +36,10 @@ public interface IHybridSearch {
 
     List<DocSearchResp> hybridSearch(String coll, Map<String, Object> query) throws IOException, InterruptedException;
 
+    List<DocSearchResp> hybridSearch(String coll, Map<String, Object> query,boolean isIncludeColl) throws IOException, InterruptedException;
+
+    Map<String,List<DocSearchResp>> federatedSearch(Map<String, Object> query) throws IOException, InterruptedException;
+
     /////////
     int createCollection(String coll) throws IOException, InterruptedException;
 

+ 15 - 0
server/src/main/java/com/giantan/data/index/IndexConfig.java

@@ -1,5 +1,19 @@
 package com.giantan.data.index;
 
+/*
+    // 每个collection 一个索引库
+	"indexStrategy": {
+		"indexMode": "table2collection",
+		"collectionPrefix": "qas_"
+	}
+
+	// 共享一个索引库qascolls
+	"indexStrategy": {
+		"indexMode": "singleCollection",
+		"globalCollection": "qascolls"
+	}
+
+ */
 public class IndexConfig {
     //可配置项(用户可控)
     //字段选择:["name", "description", "tags", "attributes.color"]
@@ -8,6 +22,7 @@ public class IndexConfig {
     //空值处理:跳过 | 输出 "null"
     //public static final String CHUNK_TEMPLATE = "chunkTemplate";
     public static final String CHUNK_MODE = "chunkMode";
+    public static final String CHUNK_MODE_DEFAULT = "default";
     public static final String CHUNK_MODE_SINGLE = "single";
     public static final String CHUNK_MODE_MULTIPLE = "multiple";
     public static final String CHUNK_MODE_CUSTOM = "custom";

+ 36 - 4
server/src/main/java/com/giantan/data/index/MilvusSearchRequestBuilder.java

@@ -7,6 +7,21 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
+/*  {
+			"id": "01k3mnzprggve1jaqmyayf8p48-0",
+			"text": "怎样获取这个月的石油价格走势",
+			"tags": [
+				"炼化"
+			],
+			"metadata": {
+				"__did": "3",
+				"__cid": "qas_2"
+			},
+			"createTime": "1756262033518",
+			"score": 0.8174635954358648
+		},
+ */
+
 // 构建 milvus 的 filterExpression
 /*
 	public Builder from(Map<String, Object> request) {
@@ -94,7 +109,7 @@ import java.util.Map;
 public class MilvusSearchRequestBuilder {
 
     public static final String TIME_FIELD_NAME = "create_time";
-    public static final String FILTER_FIELD_NAME = "filterExpression";
+    public static final String FILTER_EXPRESSION = "filterExpression";
     // tags tagsAll
     // path
 
@@ -165,9 +180,26 @@ public class MilvusSearchRequestBuilder {
             }
         }
 
-        Object o4 = ret.remove(FILTER_FIELD_NAME);
+        Object o4 = ret.remove(HybridIndexer.COLL_ID);
+        //Object o4 = ret.remove("collId");
+        String collId = null;
         if (o4 != null) {
-            String fs = (String)o4;
+            String co = (String) o4;
+            collId = String.format("metadata[\""+HybridIndexer.COLL_ID+"\"] == '%s'", co);
+
+            if (filter != null && filter.length() > 0) {
+                if (collId != null) {
+                    filter = String.format("%s && %s", filter, collId);
+                }
+            } else {
+                filter = collId;
+            }
+        }
+
+
+        Object o5 = ret.remove(FILTER_EXPRESSION);
+        if (o5 != null) {
+            String fs = (String)o5;
             if (filter != null && filter.length() > 0) {
                 if (fs != null) {
                     filter = String.format("%s && %s", filter, fs);
@@ -178,7 +210,7 @@ public class MilvusSearchRequestBuilder {
         }
 
         if (filter != null) {
-            ret.put(FILTER_FIELD_NAME, filter);
+            ret.put(FILTER_EXPRESSION, filter);
         }
         return ret;
     }

+ 3 - 1
server/src/main/java/com/giantan/data/kvs/repository/GEntity.java

@@ -1,9 +1,11 @@
 package com.giantan.data.kvs.repository;
 
+import lombok.Data;
+
 import java.util.List;
 import java.util.Map;
 
-//@Data
+@Data
 public class GEntity {
     private Long id;
     private String gid;

+ 1 - 1
server/src/main/java/com/giantan/data/mds/MdsApplication.java

@@ -16,7 +16,7 @@ public class MdsApplication {
 
     public static void main(String[] args) {
         SpringApplication.run(MdsApplication.class, args);
-        log.info("Mds server started. Version 3.0.1");
+        log.info("Mds server started. Version 3.1.");
     }
 
 }

+ 20 - 5
server/src/main/java/com/giantan/data/mds/controller/MdDocsController.java

@@ -15,6 +15,7 @@ import org.springframework.web.multipart.MultipartFile;
 
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.nio.file.FileAlreadyExistsException;
 import java.util.List;
 import java.util.Map;
 import java.util.UUID;
@@ -85,6 +86,7 @@ public class MdDocsController {
 //    }
 
     //@PostMapping("/upload")
+    //objectPath 如果以"/"结尾就是path,否则是文件名
     @PostMapping("/mds")
     public R<Map<String, Object>> upload(@PathVariable String collId,
                                          @RequestParam("file") MultipartFile file,
@@ -94,11 +96,16 @@ public class MdDocsController {
         //System.out.println("taskId = " + taskId);
         //System.out.println("file = " + file.getOriginalFilename());
         log.info("上传文件: {}, taskId: {}", file.getOriginalFilename(), taskId);
-
-        //taskStatusManager.putProcessing(taskId, new TaskStatus(collId, taskId, "处理中", "", System.currentTimeMillis(), 0));
-        Map<String, Object> ret = mdDocsService.processAsyncDirect(collId, taskId, file, params);
-
-        return R.data(ret);
+        try {
+            Map<String, Object> ret = mdDocsService.processAsyncDirect(collId, taskId, file, params);
+            return R.data(ret);
+        } catch (FileAlreadyExistsException e) {
+            log.error("上传文件: {} 失败, taskId: {}", file.getOriginalFilename(), taskId);
+            return R.fail(409, "文件处理失败:" + e.getMessage());
+        } catch (Throwable e) {
+            log.error("任务 {} 失败", taskId, e);
+            return R.fail(500, "文件处理失败:" + e.getMessage());
+        }
     }
 
     @DeleteMapping("/mds/{gid}")
@@ -124,6 +131,14 @@ public class MdDocsController {
         return R.data(ret);
     }
 
+    @GetMapping("/mds/by-name")
+    public R<?> getByName(@PathVariable String collId, @RequestParam("name") String name
+    ) throws Throwable {
+        GBaseKeyValue ret = mdDocsService.findByName(collId, name);
+        return R.data(ret);
+    }
+
+
 //    @DeleteMapping("/mds")
 //    public R<Map<String, Object>> deleteByName(
 //            @PathVariable String collId,

+ 1 - 2
server/src/main/java/com/giantan/data/mds/controller/MdSearchContoller.java → server/src/main/java/com/giantan/data/mds/controller/ProxySearchContoller.java

@@ -2,7 +2,6 @@ package com.giantan.data.mds.controller;
 
 import com.giantan.data.mds.constant.MdConstants;
 import com.giantan.data.mds.repository.MdIndexer;
-import com.giantan.data.mds.service.IMdDocsService;
 import jakarta.servlet.http.HttpServletRequest;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
@@ -21,7 +20,7 @@ import java.util.Collections;
 
 @RestController
 @RequestMapping(MdConstants.API_PREFIX + "/collections/{coll}")
-public class MdSearchContoller {
+public class ProxySearchContoller {
     private static final org.slf4j.Logger log
             = org.slf4j.LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 

+ 13 - 6
server/src/main/java/com/giantan/data/mds/controller/TaxonomyController.java

@@ -10,6 +10,7 @@ import org.springframework.web.bind.annotation.*;
 import org.springframework.web.multipart.MultipartFile;
 
 import java.lang.invoke.MethodHandles;
+import java.nio.file.FileAlreadyExistsException;
 import java.util.List;
 import java.util.Map;
 import java.util.UUID;
@@ -78,20 +79,26 @@ public class TaxonomyController {
 
 
     @PostMapping("/{nodeId}/mds")
-    public R<Map<String, Object>> upload(@PathVariable String collName,
+    public R upload(@PathVariable String collName,
                                          @PathVariable String nodeId,
                                          @RequestParam("file") MultipartFile file,
                                          @RequestParam Map<String, String> params
-    ) throws Exception {
+    ) {
         String taskId = UUID.randomUUID().toString();
         //System.out.println("taskId = " + taskId);
         //System.out.println("file = " + file.getOriginalFilename());
         log.info("上传文件: {}, taskId: {}", file.getOriginalFilename(), taskId);
 
-        //taskStatusManager.putProcessing(taskId, new TaskStatus(collId, taskId, "处理中", "", System.currentTimeMillis(), 0));
-        Map<String, Object> ret = dynamicTaxonomyService.processAsyncDirect(collName, nodeId, taskId, file, params);
-
-        return R.data(ret);
+        try {
+            Map<String, Object> ret = dynamicTaxonomyService.processAsyncDirect(collName, nodeId, taskId, file, params);
+            return R.data(ret);
+        }catch (FileAlreadyExistsException e){
+            log.error("上传文件: {} 失败, taskId: {}", file.getOriginalFilename(), taskId);
+            return R.fail(409, "文件处理失败:" + e.getMessage());
+        } catch (Throwable e) {
+            log.error("任务 {} 失败", taskId, e);
+            return R.fail(500, "文件处理失败:" + e.getMessage());
+        }
     }
 
     @DeleteMapping("/{nodeId}/mds")

+ 3 - 1
server/src/main/java/com/giantan/data/mds/service/IMdDocsService.java

@@ -9,7 +9,7 @@ import java.util.Map;
 
 public interface IMdDocsService {
     //@Async
-    Map<String, Object> processAsyncDirect(String coll, String taskId, MultipartFile file, Map<String, String> params);
+    Map<String, Object> processAsyncDirect(String coll, String taskId, MultipartFile file, Map<String, String> params) throws Throwable;
 
     int delete(String coll, String gid) throws Throwable;
 
@@ -27,6 +27,8 @@ public interface IMdDocsService {
 
     GBaseKeyValue findByMdid(String coll, String mdId) throws Throwable;
 
+    GBaseKeyValue findByName(String coll, String name) throws Throwable;
+
     Object getMetadataByKey(String coll, String mdId, String key) throws Throwable;
 
     Object getAttributes(String coll, String mdId) throws Throwable;

+ 4 - 1
server/src/main/java/com/giantan/data/mds/service/impl/MdCollectionsService.java

@@ -314,6 +314,9 @@ public class MdCollectionsService {   //extends KvCollectionService
     public long deleteCollection(String name) throws Throwable {
 
         GBaseKeyValue kv = getKvByName(name);
+        if (kv == null) {
+            return 0;
+        }
         Integer intId = kv.getIntId();
         String id = kv.getId();
 
@@ -431,7 +434,7 @@ public class MdCollectionsService {   //extends KvCollectionService
         chunksRepository.createTable(id);
         extraRepository.createTable(id);
         taskRepository.createTable(id);
-        indexer.createCollection(id);
+        indexer.createCollection(ret.getName());
         return ret;
     }
 

+ 64 - 62
server/src/main/java/com/giantan/data/mds/service/impl/MdDocsService.java

@@ -18,6 +18,7 @@ import org.apache.commons.io.IOUtils;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.FileAlreadyExistsException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -97,75 +98,66 @@ public class MdDocsService implements IMdDocsService {
     }
 
     @Override
-    public Map<String, Object> processAsyncDirect(String coll, String taskId, MultipartFile file, Map<String, String> params) {
+    public Map<String, Object> processAsyncDirect(String coll, String taskId, MultipartFile file, Map<String, String> params) throws Throwable {
 
         Map<String, Object> ret = new HashMap<>();
-        try {
-            String uri = params.get("objectPath");
-            String original = file.getOriginalFilename();
-            String p = normalizePath(uri);
-            //String objectPath = getObjectPath(uri, original);
-            String objectPath = null;
-            if (FileUtil.isEndsWithSeparator(p)) {
-                objectPath = p + original;
-            } else {
-                objectPath = p;
-                int i1 = p.lastIndexOf("/");
-                p = p.substring(0, i1 + 1);
-            }
-            int collId = mdCollectionsService.getCollectionOrNew(coll);
-
-            params.put("path", p);
-            params.put("name", objectPath);
-            //TaskStatus status = new TaskStatus(coll, taskId, "成功", "处理完成", System.currentTimeMillis(), System.currentTimeMillis());
-            //taskStatusManager.markSuccess(taskId, status);
-            ret.put("taskId", taskId);
-            ret.put("uploadFile", original);
-            //List<GBaseKeyValue> kvs = mdCollectionsService.findByName(coll, objectPath);
-            //List<GBaseKeyValue> kvs = collection.findByName(objectPath);
 
-            List<GBaseKeyValue> kvs = mdDynamicRepository.findByPath(Integer.toString(collId), objectPath);
-            boolean isNewFile = true;
-
-            if (kvs != null && kvs.size() > 0) {
-                isNewFile = false;
-                ret.put("gid", kvs.get(0).getGid());
+        String uri = params.get("objectPath");
+        String original = file.getOriginalFilename();
+        String p = normalizePath(uri);
+        //String objectPath = getObjectPath(uri, original);
+        String objectPath = null;
+        if (FileUtil.isEndsWithSeparator(p)) {
+            objectPath = p + original;
+        } else {
+            objectPath = p;
+            int i1 = p.lastIndexOf("/");
+            p = p.substring(0, i1 + 1);
+        }
+        int collId = mdCollectionsService.getCollectionOrNew(coll);
+
+        params.put("path", p);
+        params.put("name", objectPath);
+        //TaskStatus status = new TaskStatus(coll, taskId, "成功", "处理完成", System.currentTimeMillis(), System.currentTimeMillis());
+        //taskStatusManager.markSuccess(taskId, status);
+        ret.put("taskId", taskId);
+        ret.put("uploadFile", original);
+
+        //List<GBaseKeyValue> kvs = mdDynamicRepository.findByPath(Integer.toString(collId), objectPath);
+        List<GBaseKeyValue> kvs = mdDynamicRepository.findByName(Integer.toString(collId), objectPath);
+        boolean isNewFile = true;
+
+        if (kvs != null && kvs.size() > 0) {
+            isNewFile = false;
+            ret.put("gid", kvs.get(0).getGid());
+            log.info("任务 {} 的objectPath {} 已存在,metadata = {}", taskId, objectPath, kvs.get(0));
+            throw new FileAlreadyExistsException("文件已存在,请更换文件名");
+        } else {
+            ret.put("gid", taskId);
+        }
 
-                log.info("任务 {} 的objectPath {} 已存在,metadata = {}", taskId, objectPath, kvs.get(0));
-            } else {
-                ret.put("gid", taskId);
-            }
+        backupAndStore(coll, collId, taskId, file, params, isNewFile);
 
-            backupAndStore(coll, collId, taskId, file, params, isNewFile);
-        } catch (Throwable e) {
-            log.error("任务 {} 失败", taskId, e);
-        }
         return ret;
     }
 
     @Async
-    public void backupAndStore(String coll, int collId, String taskId, MultipartFile file, Map<String, String> params, boolean isNewFile) {
-        try {
-            String uri = params.get("objectPath");
-            String original = file.getOriginalFilename();
-            //String objectPath = gkbStorer.backupAndStore(file.getInputStream(), coll, original, uri);
-            String objectPath = gkbStorer.storeDirect(file.getInputStream(), coll, original, uri);
-            if (isNewFile) {
-                //params.put("name", objectPath);
-                params.put("gid", taskId);
-                //String objectPath2 = getObjectPath(uri, original);
-                //params.put("name", objectPath2);
-
-                //mdCollectionsService.createEntry(coll, toObjectMap(params));
-                Map<String, Object> data = toObjectMap(params);
-                mdDynamicRepository.save(Integer.toString(collId), GBaseKeyValue.build(data));
-            }
-            log.info("任务 {} 成功", taskId);
-        } catch (Throwable e) {
-            //TaskStatus status = new TaskStatus(coll, taskId, "失败", e.getMessage(), System.currentTimeMillis(), System.currentTimeMillis());
-            //taskStatusManager.markFailed(taskId, status);
-            log.error("任务 {} 失败", taskId, e);
-        }
+    public void backupAndStore(String coll, int collId, String taskId, MultipartFile file, Map<String, String> params, boolean isNewFile) throws Throwable {
+
+        String uri = params.get("objectPath");
+        String original = file.getOriginalFilename();
+        //String objectPath = gkbStorer.backupAndStore(file.getInputStream(), coll, original, uri);
+        String objectPath = gkbStorer.storeDirect(file.getInputStream(), coll, original, uri);
+        //if (isNewFile) {
+        params.put("gid", taskId);
+        //String objectPath2 = getObjectPath(uri, original);
+        //params.put("name", objectPath2);
+
+        Map<String, Object> data = toObjectMap(params);
+        mdDynamicRepository.save(Integer.toString(collId), GBaseKeyValue.build(data));
+        //}
+        log.info("任务 {} 成功", taskId);
+
     }
 
     private Map<String, Object> toObjectMap(Map<String, String> params) {
@@ -472,6 +464,16 @@ public class MdDocsService implements IMdDocsService {
         return r;
     }
 
+    @Override
+    public GBaseKeyValue findByName(String coll, String name) throws Throwable {
+        String collId = getStrOfCollId(coll);
+        List<GBaseKeyValue> rs = mdDynamicRepository.findByName(collId, name);
+        if (rs != null && rs.size() > 0) {
+            return rs.get(0);
+        }
+        return null;
+    }
+
     @Override
     public Object getMetadataByKey(String coll, String mdId, String key) throws Throwable {
         GBaseKeyValue kv = findByMdid(coll, mdId);
@@ -615,7 +617,7 @@ public class MdDocsService implements IMdDocsService {
         List<String> uids = mdDynamicChunkRepository.findUidsByMdId(collId, mdId);
         if (uids != null && !uids.isEmpty()) {
             //hybridSearch.delete(coll, uids);
-            mdIndexer.delete(coll,uids);
+            mdIndexer.delete(coll, uids);
             log.info("删除索引文件,mdid: {}", gid);
             long r = mdDynamicChunkRepository.deleteByMdId(collId, mdId);
             log.info("删除chunk文件,mdid: {}", gid);
@@ -636,7 +638,7 @@ public class MdDocsService implements IMdDocsService {
         List<String> uids = mdDynamicChunkRepository.findUidsByMdId(collId, mdId);
         if (uids != null && !uids.isEmpty()) {
             //hybridSearch.delete(coll, uids);
-            mdIndexer.delete(coll,uids);
+            mdIndexer.delete(coll, uids);
             log.info("删除索引文件,mdid: {}", gid);
             //long r = mdDynamicChunkRepository.deleteByMdId(collId, mdId);
             //log.info("删除chunk文件: {}", gid);

+ 7 - 13
server/src/main/java/com/giantan/data/mds/service/impl/MdTaxonomyService.java

@@ -121,19 +121,13 @@ public class MdTaxonomyService {
         return ret;
     }
 
-    public Map<String, Object> processAsyncDirect(String coll, String nodeId, String taskId, MultipartFile file, Map<String, String> params) throws Exception {
-        try {
-            String collId = getStrOfCollId(coll);
-            int intId = getIntId(coll, nodeId);
-            String path = getPath(collId, intId);
-            params.put("objectPath", path);
-            Map<String, Object> ret = mdDocsService.processAsyncDirect(coll, taskId, file, params);
-            return ret;
-        } catch (Exception e) {
-
-        }
-        return null;
-
+    public Map<String, Object> processAsyncDirect(String coll, String nodeId, String taskId, MultipartFile file, Map<String, String> params) throws Throwable {
+        String collId = getStrOfCollId(coll);
+        int intId = getIntId(coll, nodeId);
+        String path = getPath(collId, intId);
+        params.put("objectPath", path);
+        Map<String, Object> ret = mdDocsService.processAsyncDirect(coll, taskId, file, params);
+        return ret;
     }
 
     public int deleteFolder(String coll, String nodeId, String taskId) throws Exception {

+ 185 - 32
server/src/main/java/com/giantan/data/mds/task/impl/BaseTaskHandler.java

@@ -5,6 +5,7 @@ import com.giantan.data.tasks.*;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.*;
 
 public abstract class BaseTaskHandler implements ITaskHandler {
     public static final String TYPE = "type";
@@ -27,7 +28,7 @@ public abstract class BaseTaskHandler implements ITaskHandler {
         taskContext.setObjectIds(objects);
         Map<String, Object> payload = taskContext.getParams();
         if (payload.containsKey(MD_IDS)) {
-            Object o =  payload.get(MD_IDS);
+            Object o = payload.get(MD_IDS);
             if (o != null && o instanceof List) {
                 objects = (List<Object>) o;
             }
@@ -38,7 +39,7 @@ public abstract class BaseTaskHandler implements ITaskHandler {
             //objects = new ArrayList<>();
             //for (int i = from; i <= to; i++) {
             for (int i = from; i < to; i++) {
-                objects.add(i);
+                objects.add(Integer.valueOf(i));
             }
             taskContext.setObjectIds(objects);
         } else {
@@ -47,71 +48,223 @@ public abstract class BaseTaskHandler implements ITaskHandler {
     }
 
     protected Long toLong(Object o) {
+        if (o == null) {
+            return null;
+        }
         if (o instanceof Integer) {
             int i1 = (Integer) o;
             long l1 = i1;
-            return l1;
+            return Long.valueOf(l1);
         } else if (o instanceof Long) {
             return (Long) o;
         }
-        return Long.parseLong(o.toString());
+        return Long.valueOf(o.toString());
     }
 
     protected Integer toInt(Object o) {
         if (o instanceof Integer) {
             int i1 = (Integer) o;
-            return i1;
+            return Integer.valueOf(i1);
         } else if (o instanceof Long) {
-            Long l1 =  (Long) o;
+            Long l1 = (Long) o;
             int i1 = l1.intValue();
-            return i1;
+            return Integer.valueOf(i1);
         }
-        return Integer.parseInt(o.toString());
+        return Integer.valueOf(o.toString());
     }
 
+//    public void handle(TaskContext context) {
+//        boolean isCanceled = false;
+//        long startTime = System.currentTimeMillis();
+//        preProcess(context);
+//
+//        List<Object> objectIds = context.getObjectIds();
+//        if (objectIds == null || objectIds.isEmpty()) {
+//            //long endTime = System.currentTimeMillis()-startTime;
+//            //log.info(getType() + " task: {} finished. Used time = {}", context.getTaskId(), endTime);
+//            context.setStatus(TaskState.SUCCESS);
+//            log.info(getType() + " task: {} finished. No mdIds/chunkIds provided.", context.getTaskId());
+//            return;
+//        }
+//        log.info(getType() + " task: {} started", context.getTaskId());
+//        for (Object objectId : objectIds) {
+//            if (context.isCancelled()) {
+//                context.setStatus(TaskState.CANCELLED);
+//                log.info(getType() + " task: {} (objectId: {}) cancelled", context.getTaskId(), objectId);
+//                isCanceled = true;
+//                break;
+//            }
+//            try {
+//                doing(context, objectId);
+//                //context.logSuccess(objectId.toString());
+//            } catch (Exception e) {
+//                //context.logFailure(objectId.toString(), e.getMessage());
+//                log.error(getType() + " task: {} (objectId: {}) error: {}", context.getTaskId(), objectId, e.getMessage());
+//            }
+//        }
+//        if (!isCanceled) {
+//
+//            Map<String, TaskOperationsStatus> objectStatus = context.getObjectStatus();
+//            long endTime = System.currentTimeMillis() - startTime;
+//            if (objectStatus.containsValue(TaskState.FAILED)) {
+//                context.setStatus(TaskState.FAILED);
+//                log.info("{} task: {} finished. Used time = {}",
+//                        getType(),
+//                        context.getTaskId(),
+//                        endTime);
+//            } else {
+//                context.setStatus(TaskState.SUCCESS);
+//                log.info("{} task: {} succeed. Used time = {}",
+//                        getType(),
+//                        context.getTaskId(),
+//                        endTime);
+//            }
+//        }
+//    }
+
     public void handle(TaskContext context) {
-        boolean isCanceled = false;
         long startTime = System.currentTimeMillis();
         preProcess(context);
 
         List<Object> objectIds = context.getObjectIds();
         if (objectIds == null || objectIds.isEmpty()) {
-            //long endTime = System.currentTimeMillis()-startTime;
-            //log.info(getType() + " task: {} finished. Used time = {}", context.getTaskId(), endTime);
             context.setStatus(TaskState.SUCCESS);
-            log.info(getType() + " task: {} finished. No mdIds/chunkIds provided.", context.getTaskId());
+            log.info("{} task: {} finished. No mdIds/chunkIds provided.", getType(), context.getTaskId());
             return;
         }
-        log.info(getType() + " task: {} started", context.getTaskId());
-        for (Object objectId : objectIds) {
-            if (context.isCancelled()) {
-                context.setStatus(TaskState.CANCELLED);
-                log.info(getType() + " task: {} (objectId: {}) cancelled", context.getTaskId(), objectId);
-                isCanceled = true;
-                break;
+
+        log.info("{} task: {} started", getType(), context.getTaskId());
+
+        try (ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor()) {
+
+            List<Future<?>> futures = new ArrayList<>();
+
+            for (Object objectId : objectIds) {
+                futures.add(executor.submit(() -> {
+                    if (context.isCancelled()) {
+                        return;
+                    }
+                    try {
+                        doing(context, objectId);
+                    } catch (Exception e) {
+                        log.error(
+                                "{} task: {} (objectId: {}) error: {}",
+                                getType(),
+                                context.getTaskId(),
+                                objectId,
+                                e.getMessage(),
+                                e
+                        );
+                        //context.markFailed(objectId, e);
+                    }
+                }));
             }
-            try {
-                doing(context, objectId);
-                //context.logSuccess(objectId.toString());
-            } catch (Exception e) {
-                //context.logFailure(objectId.toString(), e.getMessage());
-                log.error(getType() + " task: {} (objectId: {}) error: {}", context.getTaskId(), objectId, e.getMessage());
+
+            // 等待完成 & 响应取消
+            for (Future<?> future : futures) {
+                if (context.isCancelled()) {
+                    context.setStatus(TaskState.CANCELLED);
+                    break;
+                }
+                try {
+                    future.get(); // 虚拟线程阻塞是“便宜的”
+                } catch (CancellationException ignored) {
+                } catch (ExecutionException e) {
+                    // 已在 task 内处理
+                }
             }
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            context.setStatus(TaskState.CANCELLED);
         }
-        if (!isCanceled) {
 
-            Map<String, TaskOperationsStatus> objectStatus = context.getObjectStatus();
-            long endTime = System.currentTimeMillis()-startTime;
-            if (objectStatus.containsValue(TaskState.FAILED)){
+        // 汇总状态
+        if (!context.isCancelled()) {
+            long endTime = System.currentTimeMillis() - startTime;
+            if (context.getObjectStatus().containsValue(TaskState.FAILED)) {
                 context.setStatus(TaskState.FAILED);
-                log.info(getType() + " task: {} finished. Used time = {}", context.getTaskId(), endTime);
-            }else{
+                log.info("{} task: {} finished with failure. Used time = {}", getType(), context.getTaskId(), endTime);
+            } else {
                 context.setStatus(TaskState.SUCCESS);
-                log.info(getType() + " task: {} succeed. Used time = {}", context.getTaskId(), endTime);
+                log.info("{} task: {} succeed. Used time = {}", getType(), context.getTaskId(), endTime);
             }
         }
     }
 
+
+//    public void handle2(TaskContext context) {
+//        long startTime = System.currentTimeMillis();
+//        preProcess(context);
+//
+//        List<Object> objectIds = context.getObjectIds();
+//        if (objectIds == null || objectIds.isEmpty()) {
+//            context.setStatus(TaskState.SUCCESS);
+//            log.info(getType() + " task: {} finished. No mdIds/chunkIds provided.", context.getTaskId());
+//            return;
+//        }
+//
+//        log.info(getType() + " task: {} started", context.getTaskId());
+//
+//        int threadNum = Math.min(objectIds.size(), Runtime.getRuntime().availableProcessors());
+//        ExecutorService executor = Executors.newFixedThreadPool(threadNum);
+//        CompletionService<Void> completionService = new ExecutorCompletionService<>(executor);
+//
+//        AtomicBoolean cancelled = new AtomicBoolean(false);
+//
+//        try {
+//            // 提交任务
+//            for (Object objectId : objectIds) {
+//                completionService.submit(() -> {
+//                    if (context.isCancelled() || cancelled.get()) {
+//                        return null;
+//                    }
+//                    try {
+//                        doing(context, objectId);
+//                    } catch (Exception e) {
+//                        log.error(
+//                                getType() + " task: {} (objectId: {}) error: {}",
+//                                context.getTaskId(),
+//                                objectId,
+//                                e.getMessage(),
+//                                e
+//                        );
+//                        context.markFailed(objectId, e); // 建议封装
+//                    }
+//                    return null;
+//                });
+//            }
+//
+//            // 等待完成 + 响应取消
+//            for (int i = 0; i < objectIds.size(); i++) {
+//                if (context.isCancelled()) {
+//                    cancelled.set(true);
+//                    context.setStatus(TaskState.CANCELLED);
+//                    log.info(getType() + " task: {} cancelled", context.getTaskId());
+//                    break;
+//                }
+//                completionService.take(); // 阻塞直到一个任务完成
+//            }
+//        } catch (InterruptedException e) {
+//            Thread.currentThread().interrupt();
+//            context.setStatus(TaskState.CANCELLED);
+//        } finally {
+//            executor.shutdownNow();
+//        }
+//
+//        // 汇总状态
+//        if (!context.isCancelled()) {
+//            long endTime = System.currentTimeMillis() - startTime;
+//            if (context.getObjectStatus().containsValue(TaskState.FAILED)) {
+//                context.setStatus(TaskState.FAILED);
+//                log.info(getType() + " task: {} finished with failure. Used time = {}", context.getTaskId(), endTime);
+//            } else {
+//                context.setStatus(TaskState.SUCCESS);
+//                log.info(getType() + " task: {} succeed. Used time = {}", context.getTaskId(), endTime);
+//            }
+//        }
+//    }
+
+
     public abstract TaskType getType();
 
     public abstract void doing(TaskContext context, Object objectId);

+ 1 - 0
server/src/main/java/com/giantan/data/qa/constant/QaConstants.java

@@ -3,6 +3,7 @@ package com.giantan.data.qa.constant;
 public class QaConstants {
 
     public static final String API_PREFIX = "/v1/qa";
+    public static final String API_PREFIX_V2 = "/v2/qa";
     public static final String FIELD_ENTITY = "entity";
 
     public static final String FIELD_QUERY = "query";

+ 30 - 0
server/src/main/java/com/giantan/data/qa/controller/CollsSearchController.java

@@ -0,0 +1,30 @@
+package com.giantan.data.qa.controller;
+
+import com.giantan.ai.common.reponse.R;
+import com.giantan.data.kvs.kvstore.GBaseKeyValue;
+import com.giantan.data.qa.constant.QaConstants;
+import com.giantan.data.qa.model.CollsSearchRequest;
+import com.giantan.data.qa.service.CollsSearchService;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.*;
+
+import java.util.List;
+
+@RestController
+@RequestMapping(QaConstants.API_PREFIX + "/collections/_search")
+public class CollsSearchController {
+
+    @Autowired
+    CollsSearchService searchService;
+
+    @PostMapping()
+    public ResponseEntity<R> federatedSearch(@RequestBody CollsSearchRequest query) throws Throwable {
+        //List<GBaseKeyValue> rets = searchService.hybridSearch(query);
+        List<GBaseKeyValue> rets = searchService.federatedSearch(query);
+        return ResponseEntity.ok(R.data(rets));
+    }
+
+
+
+}

+ 1 - 2
server/src/main/java/com/giantan/data/qa/controller/QaSearchContoller.java → server/src/main/java/com/giantan/data/qa/controller/QaProxySearchContoller.java

@@ -2,7 +2,6 @@ package com.giantan.data.qa.controller;
 
 import com.giantan.data.qa.constant.QaConstants;
 import com.giantan.data.qa.repository.QaIndexer;
-import com.giantan.data.qa.service.QaDocsService;
 import jakarta.servlet.http.HttpServletRequest;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
@@ -18,7 +17,7 @@ import java.util.Collections;
 
 @RestController
 @RequestMapping(QaConstants.API_PREFIX + "/collections/{coll}")
-public class QaSearchContoller {
+public class QaProxySearchContoller {
     private static final org.slf4j.Logger log
             = org.slf4j.LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 

+ 63 - 0
server/src/main/java/com/giantan/data/qa/model/CollsSearchRequest.java

@@ -0,0 +1,63 @@
+package com.giantan.data.qa.model;
+
+import lombok.Data;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+@Data
+public class CollsSearchRequest {
+    String query;
+    List<String> collections;
+    Integer topK = 3;
+    SearchType searchType = SearchType.HYBRID;
+    List<String> tags;
+    String tagsMatch = "any"; // = "any" | "all";
+    String path;
+    String fromTime;
+    String toTime;
+
+    Double similarityThreshold;
+    String filterExpression;
+
+    public Map<String,Object> toMap() {
+        Map<String,Object> map = new HashMap<String,Object>();
+        map.put("query", query);
+        //map.put("collections", collections);
+        map.put("topK", topK);
+
+        map.put("collections", collections);
+        map.put("searchType", searchType);
+
+        if (tags != null && !tags.isEmpty()) {
+            map.put("tags", tags);
+            if (tagsMatch != null && !tagsMatch.isEmpty()) {
+                if (tagsMatch.equals("all")) {
+                    map.put("tagsAll", true);
+                }else{
+                    map.put("tagsAll", false);
+                }
+            }else{
+                map.put("tagsAll", false);
+            }
+        }
+        if (path != null && !path.isEmpty()) {
+            map.put("path", path);
+        }
+        if (fromTime != null && !fromTime.isEmpty()) {
+            map.put("fromTime", fromTime);
+        }
+        if (toTime != null && !toTime.isEmpty()) {
+            map.put("toTime", toTime);
+        }
+        if (similarityThreshold != null) {
+            map.put("similarityThreshold", similarityThreshold);
+        }
+        if (filterExpression != null && !filterExpression.isEmpty()) {
+            map.put("filterExpression", filterExpression);
+        }
+        return map;
+    }
+
+}

+ 14 - 0
server/src/main/java/com/giantan/data/qa/model/SearchType.java

@@ -0,0 +1,14 @@
+package com.giantan.data.qa.model;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+
+public enum SearchType {
+    FULLTEXT,
+    SIMILARITY,
+    HYBRID;
+
+    @JsonCreator
+    public static SearchType from(String value) {
+        return SearchType.valueOf(value.toUpperCase());
+    }
+}

+ 4 - 0
server/src/main/java/com/giantan/data/qa/repository/QaDocRepository.java

@@ -43,6 +43,10 @@ public class QaDocRepository extends GIndexedRepository {
         return qaIndexer.getMappedIndexName(collId);
     }
 
+    public String getTableName(String collId) {
+        return tableName(collId);
+    }
+
     public int fullIndexing(String collId) throws IOException, InterruptedException {
         int batchSize = 100;
         String sql = String.format("SELECT * FROM %s.%s  WHERE id > ? ORDER BY id LIMIT ?", schema, tableName(collId));

+ 44 - 5
server/src/main/java/com/giantan/data/qa/repository/QaIndexer.java

@@ -152,16 +152,55 @@ public class QaIndexer extends HybridIndexer implements IIndexer {
     private List<DocReq> toDocReq(String coll, GEntity entity) {
         String chunkMode = getChunkMode(coll);
         List<DocReq> rets = null;
-        if (chunkMode.equals(IndexConfig.CHUNK_MODE_MULTIPLE)) {
+        if (chunkMode.equalsIgnoreCase(IndexConfig.CHUNK_MODE_MULTIPLE)) {
             rets = toDocReq2(coll, entity);
-        } else if (chunkMode.equals(IndexConfig.CHUNK_MODE_CUSTOM)) {
+        } else if (chunkMode.equalsIgnoreCase(IndexConfig.CHUNK_MODE_CUSTOM)) {
             rets = toDocReq3(coll, entity);
-        } else {
+        } else if (chunkMode.equalsIgnoreCase(IndexConfig.CHUNK_MODE_SINGLE)) {
             rets = toDocReq1(coll, entity);
+        } else {
+            rets = toDocReqDefault(coll, entity);
         }
         return rets;
     }
 
+    private List<DocReq> toDocReqDefault(String coll, GEntity entity) {
+        List<DocReq> lst = new ArrayList<DocReq>();
+        DocReq dr1 = new DocReq();
+        dr1.setId(entity.getGid() + "-0");
+        dr1.setText(entity.getName());
+        dr1.setTags(entity.getTags());
+        buildMetadata(coll, entity, dr1);
+
+        lst.add(dr1);
+
+        if (entity.getDescription() != null && !entity.getDescription().isEmpty()) {
+            DocReq dr2 = new DocReq();
+            dr2.setId(entity.getGid() + "-1");
+            dr2.setText(entity.getName() + splitter + entity.getDescription());
+            dr2.setTags(entity.getTags());
+            buildMetadata(coll, entity, dr2);
+            lst.add(dr2);
+        }
+
+        int idx = 2;
+        List<String> altlabels = entity.getAltlabels();
+        if (altlabels != null && !altlabels.isEmpty()) {
+            StringBuilder sb = new StringBuilder();
+            sb.append(entity.getName());
+            for (int i = 0; i < altlabels.size(); i++) {
+                sb.append(splitter).append(altlabels.get(i));
+            }
+            DocReq dr2 = new DocReq();
+            dr2.setId(entity.getGid() + "-" + idx);
+            dr2.setText(sb.toString());
+            dr2.setTags(entity.getTags());
+            buildMetadata(coll, entity, dr2);
+            lst.add(dr2);
+        }
+        return lst;
+    }
+
     private List<DocReq> toDocReq1(String coll, GEntity entity) {
         List<DocReq> lst = new ArrayList<DocReq>();
         DocReq dr1 = new DocReq();
@@ -314,7 +353,7 @@ public class QaIndexer extends HybridIndexer implements IIndexer {
         String gid = entity.getGid();
         int i = 0;
         try {
-            i = hybridSearch.deleteDocumentsByIdFilter(getMappedIndexName(collection), gid);
+            i = hybridSearch.deleteDocumentsByIdLike(getMappedIndexName(collection), gid);
         } catch (Exception e) {
             throw new RuntimeException(e);
         }
@@ -328,7 +367,7 @@ public class QaIndexer extends HybridIndexer implements IIndexer {
         try {
             for (GEntity e : entities) {
                 String gid = e.getGid();
-                count = count + hybridSearch.deleteDocumentsByIdFilter(getMappedIndexName(collection), gid);
+                count = count + hybridSearch.deleteDocumentsByIdLike(getMappedIndexName(collection), gid);
             }
         } catch (Exception e) {
             throw new RuntimeException(e);

+ 302 - 0
server/src/main/java/com/giantan/data/qa/service/CollsSearchService.java

@@ -0,0 +1,302 @@
+package com.giantan.data.qa.service;
+
+import cnnlp.util.MultiValueHashMap;
+import com.giantan.data.index.HybridIndexer;
+import com.giantan.data.index.IHybridSearch;
+import com.giantan.data.index.dto.DocSearchResp;
+import com.giantan.data.kvs.kvstore.GBaseKeyValue;
+import com.giantan.data.qa.model.CollsSearchRequest;
+import com.giantan.data.qa.repository.QaDocRepository;
+import com.giantan.data.qa.repository.QaIndexer;
+import com.giantan.data.tasks.TaskContext;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Service;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.stream.Collectors;
+
+@Service
+public class CollsSearchService {
+    private static final org.slf4j.Logger log
+            = org.slf4j.LoggerFactory.getLogger(CollsSearchService.class);
+
+    @Autowired
+    QaCollectionService qaCollectionService;
+
+    @Autowired
+    QaDocRepository qaDocRepository;
+
+    @Autowired
+    IHybridSearch hybridSearch;
+
+    @Autowired
+    GoeSimService goeSimService;
+
+//    @Autowired
+//    QaDocsService qaDocsService;
+
+    public List<GBaseKeyValue> hybridSearch(CollsSearchRequest query) throws Throwable {
+        List<String> colls = query.getCollections();
+        if (colls == null || colls.isEmpty()) {
+            //throw new RuntimeException("No collections found");
+            colls = new ArrayList<>();
+            List<GBaseKeyValue> allCollections = qaCollectionService.getAllCollections();
+            for (GBaseKeyValue coll : allCollections) {
+                colls.add(coll.getName());
+            }
+        }
+
+        List<GBaseKeyValue> rs = new ArrayList<>();
+
+        for (String coll : colls) {
+            Map<String, Object> queryMap = query.toMap();
+            List<GBaseKeyValue> rs1 = hybridSearch(coll, queryMap);
+            if (rs1 != null) {
+                rs.addAll(rs1);
+            }
+        }
+        if (rs != null && !rs.isEmpty()) {
+            // 按 score 降序
+            rs.sort((a, b) -> {
+                double sa = toScore(a.get("score"));
+                double sb = toScore(b.get("score"));
+                return Double.compare(sb, sa);
+            });
+        }
+        return rs;
+    }
+
+    private double toScore(Object v) {
+        if (v == null) return Double.NEGATIVE_INFINITY;
+        if (v instanceof Number) return ((Number) v).doubleValue();
+        return Double.parseDouble(v.toString());
+    }
+
+    public List<GBaseKeyValue> hybridSearch(String coll, Map<String, Object> query) throws Throwable {
+        String collId = getStrOfCollId(coll);
+
+        String qasCollName = qaDocRepository.getMappingCollection(collId);
+
+        // 这里要判断 collection 是不是 index 到一个同一个milvus的collection,如果 是的话,就要加上 qasName 做过滤
+        boolean isIcludeCollName = true;
+        query.put(HybridIndexer.TABLE_ID, collId);
+        List<DocSearchResp> resps = hybridSearch.hybridSearch(qasCollName, query, isIcludeCollName);
+
+        String q = query.get("query").toString();
+        List<GBaseKeyValue> rets = getEntitiesBySearch(collId, q, resps);
+        return rets;
+    }
+
+    protected String getStrOfCollId(String coll) {
+        int id = qaCollectionService.getCollectionId(coll);
+        if (id <= 0) {
+            return null;
+        }
+        return Integer.toString(id);
+    }
+
+    private int toDocId(Object o) {
+        if (o instanceof Integer) {
+            return ((Integer) o).intValue();
+        } else {
+            return Integer.parseInt(o.toString());
+        }
+    }
+
+
+//    private List<GBaseKeyValue> getCollInfos(List<String> colls) throws Throwable {
+//        List<GBaseKeyValue> allCollections = new ArrayList<GBaseKeyValue>();
+//        if (colls.size() == 1 && (colls.get(0).equals("*") || colls.get(0).equalsIgnoreCase("all"))) {
+//            allCollections = qaCollectionService.getAllCollections();
+//        } else {
+//            for (int i = 0; i < colls.size(); i++) {
+//                GBaseKeyValue collObj = qaCollectionService.getKvByName(colls.get(i));
+//                allCollections.add(collObj);
+//            }
+//        }
+//        return allCollections;
+//    }
+
+
+    private void addIntList(List<Integer> is, Object v) {
+        if (v == null) {
+            return;
+        }
+        if (v instanceof Integer) {
+            int id1 = (Integer) v;
+            if (!is.contains(id1)) {
+                is.add(id1);
+            }
+        } else if (v instanceof String) {
+            int id1 = Integer.parseInt((String) v);
+            if (!is.contains(id1)) {
+                is.add(id1);
+            }
+        }
+    }
+
+    protected List<GBaseKeyValue> getEntitiesBySearch(String collId, String q, List<DocSearchResp> resps) throws Throwable {
+        if (resps.isEmpty()) {
+            return List.of();
+        }
+        List<Integer> ids = new ArrayList<>();
+        List<GBaseKeyValue> rets = new ArrayList<>();
+        Map<Integer, Double> scoreMap = new HashMap<>();
+
+        for (DocSearchResp resp : resps) {
+            Map<String, Object> metadata = resp.getMetadata();
+
+            if (metadata != null) {
+                Object o = metadata.get(QaIndexer.COLL_ID);
+                if (o != null && o instanceof String collId1) {
+                    if (collId.equals(collId1)) {
+                        o = metadata.get(QaIndexer.DOC_ID);
+                        addIntList(ids, o);
+                        scoreMap.putIfAbsent(ids.get(ids.size() - 1), resp.getScore());
+                    }
+                }
+            }
+        }
+        if (!ids.isEmpty()) {
+            List<GBaseKeyValue> rs2 = qaDocRepository.findAllByIds(collId, ids);
+            if (rs2 != null) {
+                for (GBaseKeyValue ro : rs2) {
+                    // 用 oe 计算相似度
+                    GoeSimService.Pair<String, Double> r1 = getOeSimilarity(q, ro);
+
+                    double score1 = scoreMap.get(ro.getIntId());
+
+                    if (score1 < r1.right()) {
+                        score1 = r1.right();
+                    }
+                    ro.put("score", score1);
+                    ro.put("matched", r1.left());
+                    rets.add(ro);
+                }
+            }
+        }
+
+        return rets;
+    }
+
+    private GoeSimService.Pair<String, Double> getOeSimilarity(String q, GBaseKeyValue qa) {
+        List<String> ls = new ArrayList<>();
+        Object o = qa.get("name");
+        if (o != null) {
+            ls.add(o.toString());
+        }
+        Object o1 = qa.get("altlabels");
+        if (o1 != null && o1 instanceof List) {
+            List<String> ls1 = (List<String>) o1;
+            ls.addAll(ls1);
+        }
+        GoeSimService.Pair r = goeSimService.search(q, ls);
+        return r;
+    }
+
+    public List<GBaseKeyValue> federatedSearch(CollsSearchRequest req) throws Throwable {
+        List<String> colls = req.getCollections();
+        if (colls == null || colls.isEmpty()) {
+            throw new RuntimeException("No collections found");
+        }
+        List<GBaseKeyValue> rets = new ArrayList<>();
+
+        MultiValueHashMap mapping = new MultiValueHashMap();
+
+        List<String> collNames = new ArrayList<>();
+        for (String coll : colls) {
+            String collId = getStrOfCollId(coll);
+            String qasCollName = qaDocRepository.getMappingCollection(collId);
+            collNames.add(qasCollName);
+            mapping.put(qasCollName, collId);
+        }
+        req.setCollections(collNames);
+
+        Map<String, List<DocSearchResp>> rs = hybridSearch.federatedSearch(req.toMap());
+        if (rs != null && !rs.isEmpty()) {
+            String q = req.getQuery();
+            rs.forEach((k, v) -> {
+                String[] collIds = mapping.get(k);
+                try {
+                    List<GBaseKeyValue> rs1 = getEntitiesBySearch2(collIds, q, v);
+                    if (rs1 != null && !rs1.isEmpty()) {
+                        rets.addAll(rs1);
+                    }
+                } catch (Throwable e) {
+                    throw new RuntimeException(e);
+                }
+            });
+        }
+
+        if (rets != null && !rets.isEmpty()) {
+            // 按 score 降序
+            rets.sort((a, b) -> {
+                double sa = toScore(a.get("score"));
+                double sb = toScore(b.get("score"));
+                return Double.compare(sb, sa);
+            });
+        }
+        return rets;
+    }
+
+
+    protected List<GBaseKeyValue> getEntitiesBySearch2(String[] collIds, String q, List<DocSearchResp> resps) throws Throwable {
+        if (resps.isEmpty()) {
+            return List.of();
+        }
+
+        List<GBaseKeyValue> rets = new ArrayList<>();
+
+        MultiValueHashMap mapping = new MultiValueHashMap();
+        Set<String> collSet = new HashSet<>(Arrays.asList(collIds));
+
+        Map<String, Double> scoreMap = new HashMap<>();
+
+        for (DocSearchResp resp : resps) {
+            Map<String, Object> metadata = resp.getMetadata();
+
+            if (metadata != null) {
+                Object o = metadata.get(QaIndexer.COLL_ID);
+                if (o != null && o instanceof String collId1) {
+
+                    if (collSet.contains(collId1)) {
+                        o = metadata.get(QaIndexer.DOC_ID);
+                        //addIntList(ids, o);
+                        //scoreMap.putIfAbsent(ids.get(ids.size() - 1), resp.getScore());
+                        String did1 = o.toString();
+                        mapping.put(collId1, o.toString());
+                        scoreMap.putIfAbsent(collId1 + ":" + did1, resp.getScore());
+                    }
+                }
+            }
+        }
+        Set<Map.Entry<String, String[]>> ens = mapping.entrySet();
+        for (Map.Entry<String, String[]> e : ens) {
+            String[] value = e.getValue();
+            List<Integer> idList = Arrays.stream(value)
+                    .map(Integer::valueOf)
+                    .collect(Collectors.toList());
+
+            String collId = e.getKey();
+            List<GBaseKeyValue> rs2 = qaDocRepository.findAllByIds(collId, idList);
+            if (rs2 != null) {
+                for (GBaseKeyValue ro : rs2) {
+                    // 用 oe 计算相似度
+                    GoeSimService.Pair<String, Double> r1 = getOeSimilarity(q, ro);
+                    double score1 = scoreMap.get(collId + ":" + ro.getIntId());
+
+                    if (score1 < r1.right()) {
+                        score1 = r1.right();
+                    }
+                    ro.put("score", score1);
+                    ro.put("matched", r1.left());
+                    rets.add(ro);
+                }
+            }
+
+        }
+        return rets;
+    }
+
+}

+ 72 - 0
server/src/main/java/com/giantan/data/qa/service/GoeSimService.java

@@ -0,0 +1,72 @@
+package com.giantan.data.qa.service;
+
+import cnnlp.semnet.core.OeConfigSet;
+import cnnlp.semnet.core.OeConfigSetFactory;
+import cnnlp.util.FileIOAdapter;
+import jakarta.annotation.PostConstruct;
+//import org.cnnlp.service.app.TextSimilarity;
+import org.cnnlp.service.v2.OntologyBasedService2;
+import org.springframework.stereotype.Service;
+
+import java.util.List;
+
+@Service
+public class GoeSimService {
+    OntologyBasedService2 os2;
+
+    public GoeSimService() {
+    }
+
+    @PostConstruct
+    public OntologyBasedService2 init() {
+        OeConfigSet ocs = OeConfigSetFactory.newInstance("mysim1");
+//        OntoDescriptor on = buildCfgForOntoServer("炼化/admin");
+//        ocs.add(on);
+//        OntoDescriptor on2 = buildCfgForKgIni("demo2");
+//        ocs.add(on2);
+        FileIOAdapter ioa = new FileIOAdapter();
+        os2 = OntologyBasedService2.build("oe1", ocs, ioa);
+        return os2;
+    }
+
+    public record Pair<L, R>(L left, R right) {
+    }
+
+    public Pair search(String query, List<String> candidates) {
+        //long t = System.currentTimeMillis();
+        double maxSim = 0;
+        int maxI = 0;
+        for (int i = 0; i < candidates.size(); i++) {
+            double sim = TextSimilarity.getTextSim(query, candidates.get(i), os2);
+            if (sim > maxSim) {
+                maxSim = sim;
+                maxI = i;
+            }
+        }
+        Pair r = new Pair(candidates.get(maxI), maxSim);
+        //t = System.currentTimeMillis() - t;
+        //System.out.println("used time="+t);
+        return r;
+    }
+
+    //    public SimilarityBatchResponse search(String query, List<String> candidates, int topK, Boolean sorted) {
+//        List<String> inputs = new ArrayList<>(candidates.size() + 1);
+//        inputs.add(query);
+//        inputs.addAll(candidates);
+//
+//        List<SimilarityItem> items = new ArrayList<>(candidates.size());
+//        for (int i = 0; i < candidates.size(); i++) {
+//            double sim = TextSimilarity.getTextSim(query, candidates.get(i), os2);
+//            items.add(new SimilarityItem(i, candidates.get(i), sim));
+//        }
+//        if (sorted == null || sorted) {
+//            items.sort((a, b) -> Double.compare(b.getSimilarity(), a.getSimilarity()));
+//        }
+//
+//        int limit = Math.min(topK, items.size());
+//        List<SimilarityItem> topItems = new ArrayList<>(items.subList(0, limit));
+//        return new SimilarityBatchResponse(topItems);
+//    }
+
+
+}

+ 3 - 0
server/src/main/java/com/giantan/data/qa/service/QaDocsService.java

@@ -1,5 +1,6 @@
 package com.giantan.data.qa.service;
 
+import com.giantan.data.index.HybridIndexer;
 import com.giantan.data.index.IHybridSearch;
 import com.giantan.data.index.dto.DocSearchResp;
 import com.giantan.data.kvs.kvstore.GBaseKeyValue;
@@ -457,6 +458,8 @@ public class QaDocsService implements IQaDocsService {
     public List<GBaseKeyValue> hybridSearch(String coll, Map<String, Object> query) throws Throwable {
         String collId = getStrOfCollId(coll);
         String qasName = qaDocRepository.getMappingCollection(collId);
+
+        query.put(HybridIndexer.TABLE_ID, collId);
         List<DocSearchResp> resps = hybridSearch.hybridSearch(qasName, query);
 
         List<GBaseKeyValue> rets = getEntitiesBySearch(collId, resps);

+ 419 - 0
server/src/main/java/com/giantan/data/qa/service/TextSimilarity.java

@@ -0,0 +1,419 @@
+package com.giantan.data.qa.service;
+
+
+import java.util.*;
+
+import org.cnnlp.service.IOntologyBasedService;
+
+import cnnlp.lexical.CnSegment;
+import cnnlp.lexical.dict.POSUtil;
+import cnnlp.lexical.segment.WordAtoms;
+import cnnlp.resource.IOntologyService;
+import cnnlp.resource.oe.OntoPointer;
+import cnnlp.resource.oe.OntoWordAtoms;
+import cnnlp.summarization.cluster.kmeans.EuclideanDocVector;
+import cnnlp.summarization.text.FeatureSet;
+import cnnlp.summarization.text.TextVector;
+import cnnlp.similarity.osce.SimParams;
+
+//2023.8.16
+
+public class TextSimilarity {
+
+    private static double SAME_THRESHOLD = 0.95;
+    private static double SIM_THRESHOLD = 0.75;
+
+    static SimParams weights = new SimParams();
+
+
+    private static EuclideanDocVector docToVector(TextVector tv1, FeatureSet dict) {
+        int len1 = tv1.trimLength();
+        EuclideanDocVector v = new EuclideanDocVector(len1);
+        for (int i = 0; i < len1; i++) {
+            int idx = dict.put(tv1.get1Words(i));
+            v.add(idx, tv1.get1TF(i));
+        }
+        return v;
+    }
+
+
+    private static boolean isKeyWord(String word, int pos) {
+        boolean ok = true;
+        if (pos == POSUtil.POS_UNKOWN) {
+            if (word.length() == 1 && Character.isSpaceChar(word.charAt(0))) {
+                ok = false;
+            }
+        } else if (weights.getWeight(pos) <= 0) {
+            ok = false;
+        }
+        return ok;
+    }
+
+    private static EuclideanDocVector docToVector(WordAtoms ws, FeatureSet dict) {
+        EuclideanDocVector v = new EuclideanDocVector();
+        for (int i = 0; i < ws.trimLength(); i++) {
+            if (isKeyWord(ws.getWords(i), ws.getTags(i))) {
+                int idx = dict.put(ws.getWords(i));
+                v.add(idx, 1);
+            }
+        }
+        return v;
+    }
+
+    // used指向先出现的同义词
+    private static EuclideanDocVector docToVector(WordAtoms ws, FeatureSet dict, int[] used) {
+        EuclideanDocVector v = new EuclideanDocVector();
+        for (int i = 0; i < ws.trimLength(); i++) {
+            if (isKeyWord(ws.getWords(i), ws.getTags(i))) {
+                int idx = dict.put(ws.getWords(i));
+                if (used[idx] >= 0) {
+                    v.add(used[idx], 1);
+                } else {
+                    v.add(idx, 1);
+                }
+            }
+        }
+        return v;
+    }
+
+
+    private static EuclideanDocVector docToVector2(WordAtoms ws, FeatureSet dict, FeatureSet dict1, int[] used) {
+        EuclideanDocVector v = new EuclideanDocVector();
+        int len = ws.trimLength();
+        for (int i = 0; i < len; i++) {
+            if (isKeyWord(ws.getWords(i), ws.getTags(i))) {
+                if (i + 1 < len) {
+                    //String w2 = ws.getWords(i)+ws.getWords(i+1);
+                    String w1 = ws.getWords(i);
+                    String w2 = ws.getWords(i + 1);
+                    int i1 = dict1.getIndex(w1);
+                    int i2 = dict1.getIndex(w2);
+                    if (i1 >= 0 && used[i1] >= 0) {
+                        i1 = used[i1];
+                    }
+
+                    if (i2 >= 0 && used[i2] >= 0) {
+                        i2 = used[i2];
+                    }
+
+                    if (i1 >= 0) {
+                        w1 = Integer.toString(i1);
+                    }
+                    if (i2 >= 0) {
+                        w2 = Integer.toString(i2);
+                    }
+
+                    String w22 = w1 + "-" + w2;
+                    int idx = dict.put(w22);
+                    v.add(idx, 1);
+                }
+            }
+        }
+        return v;
+    }
+
+
+    private static EuclideanDocVector docToVector2(WordAtoms ws, FeatureSet dict) {
+        EuclideanDocVector v = new EuclideanDocVector();
+        int len = ws.trimLength();
+        for (int i = 0; i < len; i++) {
+            if (isKeyWord(ws.getWords(i), ws.getTags(i))) {
+                if (i + 1 < len) {
+                    String w2 = ws.getWords(i) + ws.getWords(i + 1);
+                    int idx = dict.put(w2);
+                    v.add(idx, 1);
+                }
+            }
+        }
+        return v;
+    }
+
+
+    private static EuclideanDocVector docToVector(String text, FeatureSet dict, IOntologyBasedService obs) {
+        WordAtoms ws = obs.segment(text);
+        ws.trim();
+        EuclideanDocVector v = docToVector(ws, dict);
+        return v;
+    }
+
+    private static WordAtoms segment(String text, CnSegment segment) {
+        WordAtoms ws = segment.segment(text);
+        ws.trim();
+        return ws;
+    }
+
+    public static double getOntoSimilarity(String word1, String word2, IOntologyBasedService obs) {
+        int[] co1 = obs.getOntoService().getAllMappingId(word1);
+        int[] co2 = obs.getOntoService().getAllMappingId(word2);
+
+        if (co1 == null || co2 == null) {
+            return 0;
+        }
+        if (co1.length >= 256 && co2.length >= 256) {
+            //System.out.println(co1.length+"="+co2.length);
+            return 0;
+        }
+
+        IOntologyService service = obs.getOntoService();
+
+        double similarity = 0;
+        for (int i = 0; i < co1.length; i++) {
+            for (int j = 0; j < co2.length; j++) {
+                //v = co1[i]==co2[j]?1:0;
+                double v = co1[i] == co2[j] ? 1 : service.getOntoSimilarity(co1[i], co2[j]);
+                if (v > similarity) {
+                    similarity = v;
+                }
+                //2021.6.6 修改
+                //if (similarity >= 0.1)
+                //if (similarity >= 0.5)
+                //	break;
+            }
+            if (similarity >= 1.0)
+                break;
+        }
+
+        return similarity;
+    }
+
+
+    private static WordAtoms toWordAtoms(OntoWordAtoms ows) {
+        ArrayList<OntoPointer> ontos = ows.getOntoNodes();
+        if (ontos != null && ontos.size() > 0) {
+            WordAtoms ws2 = ows.clone();
+            int trimLen = 0;
+
+            for (int i = 0; i < ontos.size(); i++) {
+                OntoPointer op = ontos.get(i);
+                int len1 = op.getOccupied().size();
+                if (len1 > 1) {
+                    int i1 = op.getStart();
+                    StringBuilder sb = new StringBuilder();
+                    for (int j = i; j < i + len1; j++) {
+                        sb.append(ows.getWords(j));
+                    }
+
+                    int index = i - trimLen;
+                    ws2.moveFrom2(index + 1, index + len1);
+                    int trimLen1 = len1 - 1;
+                    trimLen = trimLen + trimLen1;
+                    ws2.set1Word(index, sb.toString());
+
+                    int pos1 = ows.getTags(i);
+                    ws2.set1Tag(index, pos1);
+
+                } else {
+
+                }
+            }
+            ws2.trim();
+            //ws2.outString();
+            return ws2;
+        } else {
+
+        }
+        return ows;
+
+    }
+
+    public static double getOntoTextSim(OntoWordAtoms ows1, OntoWordAtoms ows2, IOntologyBasedService obs) {
+        WordAtoms ws1 = toWordAtoms(ows1);
+        WordAtoms ws2 = toWordAtoms(ows2);
+
+        FeatureSet dict = new FeatureSet();
+        EuclideanDocVector v1 = docToVector(ws1, dict);
+        EuclideanDocVector v2 = docToVector(ws2, dict);
+        String[] words = dict.getIndexedWords();
+
+        int[] used = new int[words.length];
+        Arrays.fill(used, -1);
+
+        for (int i = 0; i < words.length; i++) {
+            for (int j = i + 1; j < words.length; j++) {
+                //double s1 = obs.getSimilarity(words[i], words[j]);
+                if (used[j] < 0) {
+                    double s1 = getOntoSimilarity(words[i], words[j], obs);
+
+                    if (s1 < SAME_THRESHOLD) {
+                        s1 = obs.getSemNet().getSimilarity(words[i], words[j]);
+                    }
+                    if (s1 > SAME_THRESHOLD) {
+                        //System.out.println(words[i]+" "+words[j]+" = "+s1);
+                        used[j] = i;
+                    }
+                }
+            }
+        }
+
+        EuclideanDocVector v31 = docToVector(ws1, dict, used);
+        EuclideanDocVector v32 = docToVector(ws2, dict, used);
+
+        double sim3 = v31.simFrom2(v32);
+
+        FeatureSet dict2 = new FeatureSet();
+        EuclideanDocVector v21 = docToVector2(ws1, dict2, dict, used);
+        EuclideanDocVector v22 = docToVector2(ws2, dict2, dict, used);
+
+        double sim2 = v21.simFrom2(v22);
+
+        double sim = 0.95 * sim3 + 0.05 * sim2;
+
+        return sim;
+    }
+
+
+    private static boolean isOntoEmpty(OntoWordAtoms ows) {
+        ArrayList<OntoPointer> ontos = ows.getOntoNodes();
+        if (ontos != null && ontos.size() > 0) {
+            return false;
+        }
+        return true;
+    }
+
+    public static double getTextSim(String text1, String text2, IOntologyBasedService obs) {
+
+        WordAtoms ws1 = segment(text1, obs.getCnSegment());
+        WordAtoms ws2 = segment(text2, obs.getCnSegment());
+
+        OntoWordAtoms ows1 = obs.getOntoService().getMappings(ws1);
+        OntoWordAtoms ows2 = obs.getOntoService().getMappings(ws2);
+
+        double ontoSim = 0;
+        // 如果都有语义网节点,则先求出ontoSimilarity
+        if (!isOntoEmpty(ows1) && !isOntoEmpty(ows2)) {
+            ontoSim = getOntoTextSim(ows1, ows2, obs);
+        }
+
+        FeatureSet dict1 = new FeatureSet();
+        EuclideanDocVector v1 = docToVector(ws1, dict1);
+        EuclideanDocVector v2 = docToVector(ws2, dict1);
+
+        double sim = v1.simFrom2(v2);
+
+        FeatureSet dict2 = new FeatureSet();
+        EuclideanDocVector v21 = docToVector2(ws1, dict2);
+        EuclideanDocVector v22 = docToVector2(ws2, dict2);
+        double sim2 = v21.simFrom2(v22);
+
+
+        // 2025.12.26 这是之前的版本
+//        String[] words = dict1.getIndexedWords();
+//        int[] used = new int[words.length];
+//        Arrays.fill(used, -1);
+//
+//        for (int i = 0; i < words.length; i++) {
+//            for (int j = i + 1; j < words.length; j++) {
+//                if (used[j] < 0) {
+//                    double s1 = obs.getSemNet().getSimilarity(words[i], words[j]);
+//
+//                    if (s1 > SAME_THRESHOLD) {
+//                        //System.out.println(words[i]+" "+words[j]+" = "+s1);
+//                        used[j] = i;
+//                    }
+//                }
+//            }
+//        }
+//
+//        EuclideanDocVector v31 = docToVector(ws1, dict1, used);
+//        EuclideanDocVector v32 = docToVector(ws2, dict1, used);
+//        double sim3 = v31.simFrom2(v32);
+//        System.out.println("1.sim3=" + sim3);
+
+        double sim3 = semanticCosine(ws1, ws2, obs);
+        //System.out.println("2.sim3_2=" + sim3);
+        double sim23 = sim2 * 0.05 + sim3 * 0.95;
+        double sim12 = Math.max(ontoSim, sim23);
+
+        sim = sim12 * 0.98 + sim * 0.02;
+
+        return sim;
+    }
+
+
+    private static double semanticCosine(WordAtoms ws1, WordAtoms ws2, IOntologyBasedService obs) {
+        Map<String, Double> vec1 = vectorize(ws1, obs);
+        Map<String, Double> vec2 = vectorize(ws2, obs);
+        Map<String, GoeSimService.Pair<String, Double>> unmatched = new HashMap<>();
+        vec2.forEach((k, v) -> {
+            if (vec1.containsKey(k)) {
+            } else {
+                unmatched.put(k, new GoeSimService.Pair(null, 0));
+            }
+        });
+        //double sim = semanticCosine(vec1,vec2,);
+        double score = 0.0;
+
+        for (var qe : vec1.entrySet()) {
+            String qt = qe.getKey();
+            double qw = qe.getValue();
+
+            double best = 0.0;
+            if (vec2.containsKey(qt)) {
+                double qw2 = vec2.get(qt);
+                best = qw * qw2 * 1.0;
+            } else {
+                double sim1 = 0;
+                String matchedKey = null;
+                for (var de : unmatched.entrySet()) {
+                    String k3 = de.getKey();
+                    double s1 = obs.getSemNet().getSimilarity(qt, k3);
+                    if (s1 >= SAME_THRESHOLD) {
+                        if (s1 > sim1) {
+                            sim1 = s1;
+                            matchedKey = k3;
+                        }
+                    }
+                }
+                if (matchedKey != null) {
+                    unmatched.remove(matchedKey);
+                    double qw2 = vec2.get(matchedKey);
+                    best = qw * qw2 * sim1;
+                }
+            }
+
+            score += best;
+        }
+        return score;
+    }
+
+    private static Map<String, Double> vectorize(WordAtoms ws, IOntologyBasedService obs) {
+        // binary TF + IDF
+        Map<String, Double> vec = new HashMap<>();
+        for (int i = 0; i < ws.trimLength(); i++) {
+            double w = getWordWeight(ws, i, obs);
+            vec.put(ws.getWords(i), w);
+        }
+        normalize(vec);
+        return vec;
+    }
+
+    private static double getWordWeight(WordAtoms ws, int idx, IOntologyBasedService obs) {
+        double w = 1;
+        int pos = ws.getTags(idx);
+        String word = ws.getWords(idx);
+        boolean ok = true;
+        if (pos == POSUtil.POS_UNKOWN) {
+            if (word.length() == 1 && Character.isSpaceChar(word.charAt(0))) {
+                w = 0;
+            } else {
+                w = 2;
+            }
+        } else if (weights.getWeight(pos) > 0) {
+            w = weights.getWeight(pos);
+        }
+        return w;
+    }
+
+    private static void normalize(Map<String, Double> vec) {
+        double sumSq = 0.0;
+        for (double v : vec.values()) {
+            sumSq += v * v;
+        }
+        double norm = Math.sqrt(sumSq);
+        if (norm == 0) return;
+
+        for (var e : vec.entrySet()) {
+            e.setValue(e.getValue() / norm);
+        }
+    }
+}

+ 1 - 2
tools/src/test/java/com/giantan/mds/MdSearcherTest.java → server/src/test/java/com/giantan/data/mds/MdSearcherTest.java

@@ -1,7 +1,6 @@
-package com.giantan.mds;
+package com.giantan.data.mds;
 
 import com.google.gson.Gson;
-import org.cnnlp.data.document.GDocConstants;
 import org.cnnlp.data.md.MdSearcher;
 import org.cnnlp.data.util.BaseParameters;
 

+ 0 - 37
tools/pom.xml

@@ -1,37 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-    <modelVersion>4.0.0</modelVersion>
-    <parent>
-        <groupId>com.giantan.mds</groupId>
-        <artifactId>mds</artifactId>
-        <version>1.0.0</version>
-    </parent>
-
-    <artifactId>tools</artifactId>
-
-    <properties>
-        <maven.compiler.source>17</maven.compiler.source>
-        <maven.compiler.target>17</maven.compiler.target>
-        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-    </properties>
-
-    <dependencies>
-
-        <dependency>
-            <groupId>org.cnnlp.data.md</groupId>
-            <artifactId>gtbook</artifactId>
-            <version>1.0.0</version>
-        </dependency>
-
-        <!-- https://mvnrepository.com/artifact/com.google.code.gson/gson -->
-        <dependency>
-            <groupId>com.google.code.gson</groupId>
-            <artifactId>gson</artifactId>
-            <version>2.13.1</version>
-        </dependency>
-
-    </dependencies>
-
-</project>

+ 0 - 110
tools/src/test/java/com/giantan/mds/Main.java

@@ -1,110 +0,0 @@
-package com.giantan.mds;
-
-//TIP To <b>Run</b> code, press <shortcut actionId="Run"/> or
-// click the <icon src="AllIcons.Actions.Execute"/> icon in the gutter.
-public class Main {
-
-    public static int match(String text, String str) {
-        text = text.replace("\r\n", "\n");
-        str = str.replace("\r\n", "\n");
-        String[] ss = str.split("\n");
-        int count = 0;
-
-        int pos = 0;
-        while (true) {
-            count++;
-
-            pos = text.indexOf(ss[0], pos);
-
-            if (pos < 0) {
-                break;
-            } else {
-                int matched1 = match(text, ss, pos);
-                if (matched1 >= 0) {
-                    break;
-                } else {
-                    pos = pos + ss[0].length();
-                }
-            }
-        }
-        return pos;
-    }
-
-    public static int match(String text, String[] target, int beginIndex) {
-        int pos = beginIndex;
-        if (pos < 0) return -1;
-
-        int nowP = pos;
-        boolean isMatched = true;
-        int i = 0;
-        for (i = 1; i < target.length; i++) {
-            nowP = nowP + +target[i - 1].length();
-            if (nowP >= text.length()) {
-                isMatched = false;
-                break;
-            }
-            for (int j = nowP; j < text.length(); j++) {
-                if (text.charAt(j) != '\n') {
-                    nowP = j;
-                    break;
-                }
-            }
-
-            if (nowP >= text.length()) {
-                isMatched = false;
-                break;
-            }
-            String s1 = text.substring(nowP, nowP + target[i].length());
-            if (!s1.equals(target[i])) {
-                isMatched = false;
-                break;
-            }
-        }
-
-        if (isMatched) {
-            return pos;
-        } else {
-            if (pos > 0) {
-                return -pos;
-            } else {
-                return -1;
-            }
-        }
-    }
-
-//    public static void main(String[] args) throws IOException {
-//        String f = "D:\\data\\乙烯\\target\\四川操规与应急操作卡\\1000万吨年常减压蒸馏装置\\1000万吨年常减压蒸馏装置操作规程.md";
-//        f = "C:\\Users\\dwp\\Desktop\\temp\\dzn\\石油知识文档-已转换\\target\\四川操规与应急操作卡\\1000万吨年常减压蒸馏装置\\1000万吨年常减压蒸馏装置操作规程.md";
-//
-//        String jf = "C:\\Users\\dwp\\Desktop\\temp\\milvus1.json";
-//        List<String> ls = Files.readAllLines(Path.of(jf));
-//        Gson gson = new Gson();
-//        Map row = gson.fromJson((String) ls.get(302), Map.class);
-//        String s = (String) row.get("text");
-//        s = s.replace("\r\n", "\n");
-//        //s = s.substring(0,60);
-//        s = s + "2";
-//        String[] ss = s.split("\n");
-//
-//        String text = Files.readString(Path.of(f));
-//
-//        text = text.replace("\r\n", "\n");
-//        long t = System.currentTimeMillis();
-//        int matched = match(text, s);
-//        t = System.currentTimeMillis() - t;
-//        System.out.println("used time = " + t);
-//        System.out.println("matched=" + matched);
-//        //String s1 = text.substring(matched,matched+200);
-//        //System.out.println(s1);
-//
-//
-////                System.out.println(i);
-////                int matched1 = match(text, ss, i);
-////                if (matched1>0){
-////                    System.out.println("matched1="+matched1);
-////                    break;
-////                }
-//
-//
-//    }
-}