|
@@ -0,0 +1,419 @@
|
|
|
|
|
+package com.giantan.data.qa.service;
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+import java.util.*;
|
|
|
|
|
+
|
|
|
|
|
+import org.cnnlp.service.IOntologyBasedService;
|
|
|
|
|
+
|
|
|
|
|
+import cnnlp.lexical.CnSegment;
|
|
|
|
|
+import cnnlp.lexical.dict.POSUtil;
|
|
|
|
|
+import cnnlp.lexical.segment.WordAtoms;
|
|
|
|
|
+import cnnlp.resource.IOntologyService;
|
|
|
|
|
+import cnnlp.resource.oe.OntoPointer;
|
|
|
|
|
+import cnnlp.resource.oe.OntoWordAtoms;
|
|
|
|
|
+import cnnlp.summarization.cluster.kmeans.EuclideanDocVector;
|
|
|
|
|
+import cnnlp.summarization.text.FeatureSet;
|
|
|
|
|
+import cnnlp.summarization.text.TextVector;
|
|
|
|
|
+import cnnlp.similarity.osce.SimParams;
|
|
|
|
|
+
|
|
|
|
|
+//2023.8.16
|
|
|
|
|
+
|
|
|
|
|
public class TextSimilarity {

    // Similarity score at or above which two words are treated as the same
    // term (used when folding synonyms onto one vector dimension).
    private static double SAME_THRESHOLD = 0.95;
    // Threshold for "similar" texts; not referenced in this class —
    // presumably consumed by callers. TODO confirm before removing.
    private static double SIM_THRESHOLD = 0.75;

    // Per-POS-tag weights; a non-positive weight marks a stop category.
    static SimParams weights = new SimParams();
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+ private static EuclideanDocVector docToVector(TextVector tv1, FeatureSet dict) {
|
|
|
|
|
+ int len1 = tv1.trimLength();
|
|
|
|
|
+ EuclideanDocVector v = new EuclideanDocVector(len1);
|
|
|
|
|
+ for (int i = 0; i < len1; i++) {
|
|
|
|
|
+ int idx = dict.put(tv1.get1Words(i));
|
|
|
|
|
+ v.add(idx, tv1.get1TF(i));
|
|
|
|
|
+ }
|
|
|
|
|
+ return v;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+ private static boolean isKeyWord(String word, int pos) {
|
|
|
|
|
+ boolean ok = true;
|
|
|
|
|
+ if (pos == POSUtil.POS_UNKOWN) {
|
|
|
|
|
+ if (word.length() == 1 && Character.isSpaceChar(word.charAt(0))) {
|
|
|
|
|
+ ok = false;
|
|
|
|
|
+ }
|
|
|
|
|
+ } else if (weights.getWeight(pos) <= 0) {
|
|
|
|
|
+ ok = false;
|
|
|
|
|
+ }
|
|
|
|
|
+ return ok;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ private static EuclideanDocVector docToVector(WordAtoms ws, FeatureSet dict) {
|
|
|
|
|
+ EuclideanDocVector v = new EuclideanDocVector();
|
|
|
|
|
+ for (int i = 0; i < ws.trimLength(); i++) {
|
|
|
|
|
+ if (isKeyWord(ws.getWords(i), ws.getTags(i))) {
|
|
|
|
|
+ int idx = dict.put(ws.getWords(i));
|
|
|
|
|
+ v.add(idx, 1);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ return v;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // used指向先出现的同义词
|
|
|
|
|
+ private static EuclideanDocVector docToVector(WordAtoms ws, FeatureSet dict, int[] used) {
|
|
|
|
|
+ EuclideanDocVector v = new EuclideanDocVector();
|
|
|
|
|
+ for (int i = 0; i < ws.trimLength(); i++) {
|
|
|
|
|
+ if (isKeyWord(ws.getWords(i), ws.getTags(i))) {
|
|
|
|
|
+ int idx = dict.put(ws.getWords(i));
|
|
|
|
|
+ if (used[idx] >= 0) {
|
|
|
|
|
+ v.add(used[idx], 1);
|
|
|
|
|
+ } else {
|
|
|
|
|
+ v.add(idx, 1);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ return v;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+ private static EuclideanDocVector docToVector2(WordAtoms ws, FeatureSet dict, FeatureSet dict1, int[] used) {
|
|
|
|
|
+ EuclideanDocVector v = new EuclideanDocVector();
|
|
|
|
|
+ int len = ws.trimLength();
|
|
|
|
|
+ for (int i = 0; i < len; i++) {
|
|
|
|
|
+ if (isKeyWord(ws.getWords(i), ws.getTags(i))) {
|
|
|
|
|
+ if (i + 1 < len) {
|
|
|
|
|
+ //String w2 = ws.getWords(i)+ws.getWords(i+1);
|
|
|
|
|
+ String w1 = ws.getWords(i);
|
|
|
|
|
+ String w2 = ws.getWords(i + 1);
|
|
|
|
|
+ int i1 = dict1.getIndex(w1);
|
|
|
|
|
+ int i2 = dict1.getIndex(w2);
|
|
|
|
|
+ if (i1 >= 0 && used[i1] >= 0) {
|
|
|
|
|
+ i1 = used[i1];
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if (i2 >= 0 && used[i2] >= 0) {
|
|
|
|
|
+ i2 = used[i2];
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if (i1 >= 0) {
|
|
|
|
|
+ w1 = Integer.toString(i1);
|
|
|
|
|
+ }
|
|
|
|
|
+ if (i2 >= 0) {
|
|
|
|
|
+ w2 = Integer.toString(i2);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ String w22 = w1 + "-" + w2;
|
|
|
|
|
+ int idx = dict.put(w22);
|
|
|
|
|
+ v.add(idx, 1);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ return v;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+ private static EuclideanDocVector docToVector2(WordAtoms ws, FeatureSet dict) {
|
|
|
|
|
+ EuclideanDocVector v = new EuclideanDocVector();
|
|
|
|
|
+ int len = ws.trimLength();
|
|
|
|
|
+ for (int i = 0; i < len; i++) {
|
|
|
|
|
+ if (isKeyWord(ws.getWords(i), ws.getTags(i))) {
|
|
|
|
|
+ if (i + 1 < len) {
|
|
|
|
|
+ String w2 = ws.getWords(i) + ws.getWords(i + 1);
|
|
|
|
|
+ int idx = dict.put(w2);
|
|
|
|
|
+ v.add(idx, 1);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ return v;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+ private static EuclideanDocVector docToVector(String text, FeatureSet dict, IOntologyBasedService obs) {
|
|
|
|
|
+ WordAtoms ws = obs.segment(text);
|
|
|
|
|
+ ws.trim();
|
|
|
|
|
+ EuclideanDocVector v = docToVector(ws, dict);
|
|
|
|
|
+ return v;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ private static WordAtoms segment(String text, CnSegment segment) {
|
|
|
|
|
+ WordAtoms ws = segment.segment(text);
|
|
|
|
|
+ ws.trim();
|
|
|
|
|
+ return ws;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ public static double getOntoSimilarity(String word1, String word2, IOntologyBasedService obs) {
|
|
|
|
|
+ int[] co1 = obs.getOntoService().getAllMappingId(word1);
|
|
|
|
|
+ int[] co2 = obs.getOntoService().getAllMappingId(word2);
|
|
|
|
|
+
|
|
|
|
|
+ if (co1 == null || co2 == null) {
|
|
|
|
|
+ return 0;
|
|
|
|
|
+ }
|
|
|
|
|
+ if (co1.length >= 256 && co2.length >= 256) {
|
|
|
|
|
+ //System.out.println(co1.length+"="+co2.length);
|
|
|
|
|
+ return 0;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ IOntologyService service = obs.getOntoService();
|
|
|
|
|
+
|
|
|
|
|
+ double similarity = 0;
|
|
|
|
|
+ for (int i = 0; i < co1.length; i++) {
|
|
|
|
|
+ for (int j = 0; j < co2.length; j++) {
|
|
|
|
|
+ //v = co1[i]==co2[j]?1:0;
|
|
|
|
|
+ double v = co1[i] == co2[j] ? 1 : service.getOntoSimilarity(co1[i], co2[j]);
|
|
|
|
|
+ if (v > similarity) {
|
|
|
|
|
+ similarity = v;
|
|
|
|
|
+ }
|
|
|
|
|
+ //2021.6.6 修改
|
|
|
|
|
+ //if (similarity >= 0.1)
|
|
|
|
|
+ //if (similarity >= 0.5)
|
|
|
|
|
+ // break;
|
|
|
|
|
+ }
|
|
|
|
|
+ if (similarity >= 1.0)
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ return similarity;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+ private static WordAtoms toWordAtoms(OntoWordAtoms ows) {
|
|
|
|
|
+ ArrayList<OntoPointer> ontos = ows.getOntoNodes();
|
|
|
|
|
+ if (ontos != null && ontos.size() > 0) {
|
|
|
|
|
+ WordAtoms ws2 = ows.clone();
|
|
|
|
|
+ int trimLen = 0;
|
|
|
|
|
+
|
|
|
|
|
+ for (int i = 0; i < ontos.size(); i++) {
|
|
|
|
|
+ OntoPointer op = ontos.get(i);
|
|
|
|
|
+ int len1 = op.getOccupied().size();
|
|
|
|
|
+ if (len1 > 1) {
|
|
|
|
|
+ int i1 = op.getStart();
|
|
|
|
|
+ StringBuilder sb = new StringBuilder();
|
|
|
|
|
+ for (int j = i; j < i + len1; j++) {
|
|
|
|
|
+ sb.append(ows.getWords(j));
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ int index = i - trimLen;
|
|
|
|
|
+ ws2.moveFrom2(index + 1, index + len1);
|
|
|
|
|
+ int trimLen1 = len1 - 1;
|
|
|
|
|
+ trimLen = trimLen + trimLen1;
|
|
|
|
|
+ ws2.set1Word(index, sb.toString());
|
|
|
|
|
+
|
|
|
|
|
+ int pos1 = ows.getTags(i);
|
|
|
|
|
+ ws2.set1Tag(index, pos1);
|
|
|
|
|
+
|
|
|
|
|
+ } else {
|
|
|
|
|
+
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ ws2.trim();
|
|
|
|
|
+ //ws2.outString();
|
|
|
|
|
+ return ws2;
|
|
|
|
|
+ } else {
|
|
|
|
|
+
|
|
|
|
|
+ }
|
|
|
|
|
+ return ows;
|
|
|
|
|
+
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ public static double getOntoTextSim(OntoWordAtoms ows1, OntoWordAtoms ows2, IOntologyBasedService obs) {
|
|
|
|
|
+ WordAtoms ws1 = toWordAtoms(ows1);
|
|
|
|
|
+ WordAtoms ws2 = toWordAtoms(ows2);
|
|
|
|
|
+
|
|
|
|
|
+ FeatureSet dict = new FeatureSet();
|
|
|
|
|
+ EuclideanDocVector v1 = docToVector(ws1, dict);
|
|
|
|
|
+ EuclideanDocVector v2 = docToVector(ws2, dict);
|
|
|
|
|
+ String[] words = dict.getIndexedWords();
|
|
|
|
|
+
|
|
|
|
|
+ int[] used = new int[words.length];
|
|
|
|
|
+ Arrays.fill(used, -1);
|
|
|
|
|
+
|
|
|
|
|
+ for (int i = 0; i < words.length; i++) {
|
|
|
|
|
+ for (int j = i + 1; j < words.length; j++) {
|
|
|
|
|
+ //double s1 = obs.getSimilarity(words[i], words[j]);
|
|
|
|
|
+ if (used[j] < 0) {
|
|
|
|
|
+ double s1 = getOntoSimilarity(words[i], words[j], obs);
|
|
|
|
|
+
|
|
|
|
|
+ if (s1 < SAME_THRESHOLD) {
|
|
|
|
|
+ s1 = obs.getSemNet().getSimilarity(words[i], words[j]);
|
|
|
|
|
+ }
|
|
|
|
|
+ if (s1 > SAME_THRESHOLD) {
|
|
|
|
|
+ //System.out.println(words[i]+" "+words[j]+" = "+s1);
|
|
|
|
|
+ used[j] = i;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ EuclideanDocVector v31 = docToVector(ws1, dict, used);
|
|
|
|
|
+ EuclideanDocVector v32 = docToVector(ws2, dict, used);
|
|
|
|
|
+
|
|
|
|
|
+ double sim3 = v31.simFrom2(v32);
|
|
|
|
|
+
|
|
|
|
|
+ FeatureSet dict2 = new FeatureSet();
|
|
|
|
|
+ EuclideanDocVector v21 = docToVector2(ws1, dict2, dict, used);
|
|
|
|
|
+ EuclideanDocVector v22 = docToVector2(ws2, dict2, dict, used);
|
|
|
|
|
+
|
|
|
|
|
+ double sim2 = v21.simFrom2(v22);
|
|
|
|
|
+
|
|
|
|
|
+ double sim = 0.95 * sim3 + 0.05 * sim2;
|
|
|
|
|
+
|
|
|
|
|
+ return sim;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+ private static boolean isOntoEmpty(OntoWordAtoms ows) {
|
|
|
|
|
+ ArrayList<OntoPointer> ontos = ows.getOntoNodes();
|
|
|
|
|
+ if (ontos != null && ontos.size() > 0) {
|
|
|
|
|
+ return false;
|
|
|
|
|
+ }
|
|
|
|
|
+ return true;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ public static double getTextSim(String text1, String text2, IOntologyBasedService obs) {
|
|
|
|
|
+
|
|
|
|
|
+ WordAtoms ws1 = segment(text1, obs.getCnSegment());
|
|
|
|
|
+ WordAtoms ws2 = segment(text2, obs.getCnSegment());
|
|
|
|
|
+
|
|
|
|
|
+ OntoWordAtoms ows1 = obs.getOntoService().getMappings(ws1);
|
|
|
|
|
+ OntoWordAtoms ows2 = obs.getOntoService().getMappings(ws2);
|
|
|
|
|
+
|
|
|
|
|
+ double ontoSim = 0;
|
|
|
|
|
+ // 如果都有语义网节点,则先求出ontoSimilarity
|
|
|
|
|
+ if (!isOntoEmpty(ows1) && !isOntoEmpty(ows2)) {
|
|
|
|
|
+ ontoSim = getOntoTextSim(ows1, ows2, obs);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ FeatureSet dict1 = new FeatureSet();
|
|
|
|
|
+ EuclideanDocVector v1 = docToVector(ws1, dict1);
|
|
|
|
|
+ EuclideanDocVector v2 = docToVector(ws2, dict1);
|
|
|
|
|
+
|
|
|
|
|
+ double sim = v1.simFrom2(v2);
|
|
|
|
|
+
|
|
|
|
|
+ FeatureSet dict2 = new FeatureSet();
|
|
|
|
|
+ EuclideanDocVector v21 = docToVector2(ws1, dict2);
|
|
|
|
|
+ EuclideanDocVector v22 = docToVector2(ws2, dict2);
|
|
|
|
|
+ double sim2 = v21.simFrom2(v22);
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+ // 2025.12.26 这是之前的版本
|
|
|
|
|
+// String[] words = dict1.getIndexedWords();
|
|
|
|
|
+// int[] used = new int[words.length];
|
|
|
|
|
+// Arrays.fill(used, -1);
|
|
|
|
|
+//
|
|
|
|
|
+// for (int i = 0; i < words.length; i++) {
|
|
|
|
|
+// for (int j = i + 1; j < words.length; j++) {
|
|
|
|
|
+// if (used[j] < 0) {
|
|
|
|
|
+// double s1 = obs.getSemNet().getSimilarity(words[i], words[j]);
|
|
|
|
|
+//
|
|
|
|
|
+// if (s1 > SAME_THRESHOLD) {
|
|
|
|
|
+// //System.out.println(words[i]+" "+words[j]+" = "+s1);
|
|
|
|
|
+// used[j] = i;
|
|
|
|
|
+// }
|
|
|
|
|
+// }
|
|
|
|
|
+// }
|
|
|
|
|
+// }
|
|
|
|
|
+//
|
|
|
|
|
+// EuclideanDocVector v31 = docToVector(ws1, dict1, used);
|
|
|
|
|
+// EuclideanDocVector v32 = docToVector(ws2, dict1, used);
|
|
|
|
|
+// double sim3 = v31.simFrom2(v32);
|
|
|
|
|
+// System.out.println("1.sim3=" + sim3);
|
|
|
|
|
+
|
|
|
|
|
+ double sim3 = semanticCosine(ws1, ws2, obs);
|
|
|
|
|
+ //System.out.println("2.sim3_2=" + sim3);
|
|
|
|
|
+ double sim23 = sim2 * 0.05 + sim3 * 0.95;
|
|
|
|
|
+ double sim12 = Math.max(ontoSim, sim23);
|
|
|
|
|
+
|
|
|
|
|
+ sim = sim12 * 0.98 + sim * 0.02;
|
|
|
|
|
+
|
|
|
|
|
+ return sim;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+ private static double semanticCosine(WordAtoms ws1, WordAtoms ws2, IOntologyBasedService obs) {
|
|
|
|
|
+ Map<String, Double> vec1 = vectorize(ws1, obs);
|
|
|
|
|
+ Map<String, Double> vec2 = vectorize(ws2, obs);
|
|
|
|
|
+ Map<String, GoeSimService.Pair<String, Double>> unmatched = new HashMap<>();
|
|
|
|
|
+ vec2.forEach((k, v) -> {
|
|
|
|
|
+ if (vec1.containsKey(k)) {
|
|
|
|
|
+ } else {
|
|
|
|
|
+ unmatched.put(k, new GoeSimService.Pair(null, 0));
|
|
|
|
|
+ }
|
|
|
|
|
+ });
|
|
|
|
|
+ //double sim = semanticCosine(vec1,vec2,);
|
|
|
|
|
+ double score = 0.0;
|
|
|
|
|
+
|
|
|
|
|
+ for (var qe : vec1.entrySet()) {
|
|
|
|
|
+ String qt = qe.getKey();
|
|
|
|
|
+ double qw = qe.getValue();
|
|
|
|
|
+
|
|
|
|
|
+ double best = 0.0;
|
|
|
|
|
+ if (vec2.containsKey(qt)) {
|
|
|
|
|
+ double qw2 = vec2.get(qt);
|
|
|
|
|
+ best = qw * qw2 * 1.0;
|
|
|
|
|
+ } else {
|
|
|
|
|
+ double sim1 = 0;
|
|
|
|
|
+ String matchedKey = null;
|
|
|
|
|
+ for (var de : unmatched.entrySet()) {
|
|
|
|
|
+ String k3 = de.getKey();
|
|
|
|
|
+ double s1 = obs.getSemNet().getSimilarity(qt, k3);
|
|
|
|
|
+ if (s1 >= SAME_THRESHOLD) {
|
|
|
|
|
+ if (s1 > sim1) {
|
|
|
|
|
+ sim1 = s1;
|
|
|
|
|
+ matchedKey = k3;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ if (matchedKey != null) {
|
|
|
|
|
+ unmatched.remove(matchedKey);
|
|
|
|
|
+ double qw2 = vec2.get(matchedKey);
|
|
|
|
|
+ best = qw * qw2 * sim1;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ score += best;
|
|
|
|
|
+ }
|
|
|
|
|
+ return score;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ private static Map<String, Double> vectorize(WordAtoms ws, IOntologyBasedService obs) {
|
|
|
|
|
+ // binary TF + IDF
|
|
|
|
|
+ Map<String, Double> vec = new HashMap<>();
|
|
|
|
|
+ for (int i = 0; i < ws.trimLength(); i++) {
|
|
|
|
|
+ double w = getWordWeight(ws, i, obs);
|
|
|
|
|
+ vec.put(ws.getWords(i), w);
|
|
|
|
|
+ }
|
|
|
|
|
+ normalize(vec);
|
|
|
|
|
+ return vec;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ private static double getWordWeight(WordAtoms ws, int idx, IOntologyBasedService obs) {
|
|
|
|
|
+ double w = 1;
|
|
|
|
|
+ int pos = ws.getTags(idx);
|
|
|
|
|
+ String word = ws.getWords(idx);
|
|
|
|
|
+ boolean ok = true;
|
|
|
|
|
+ if (pos == POSUtil.POS_UNKOWN) {
|
|
|
|
|
+ if (word.length() == 1 && Character.isSpaceChar(word.charAt(0))) {
|
|
|
|
|
+ w = 0;
|
|
|
|
|
+ } else {
|
|
|
|
|
+ w = 2;
|
|
|
|
|
+ }
|
|
|
|
|
+ } else if (weights.getWeight(pos) > 0) {
|
|
|
|
|
+ w = weights.getWeight(pos);
|
|
|
|
|
+ }
|
|
|
|
|
+ return w;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ private static void normalize(Map<String, Double> vec) {
|
|
|
|
|
+ double sumSq = 0.0;
|
|
|
|
|
+ for (double v : vec.values()) {
|
|
|
|
|
+ sumSq += v * v;
|
|
|
|
|
+ }
|
|
|
|
|
+ double norm = Math.sqrt(sumSq);
|
|
|
|
|
+ if (norm == 0) return;
|
|
|
|
|
+
|
|
|
|
|
+ for (var e : vec.entrySet()) {
|
|
|
|
|
+ e.setValue(e.getValue() / norm);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+}
|