| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288 |
- package org.cnnlp.data.splitter;
- import com.vladsch.flexmark.util.ast.Node;
- import org.cnnlp.data.book.GTBookConstants;
- import org.cnnlp.data.book.IElement;
- import org.cnnlp.data.document.GDocConstants;
- import org.cnnlp.data.document.GDocument;
- import org.cnnlp.data.util.BaseParameters;
- import org.cnnlp.data.util.id.IdGenerator;
- import org.cnnlp.data.util.id.UlidGenerator;
- import java.io.File;
- import java.io.IOException;
- import java.nio.file.Files;
- import java.nio.file.Path;
- import java.nio.file.Paths;
- import java.util.ArrayList;
- import java.util.LinkedHashMap;
- import java.util.List;
- import java.util.Map;
- // MdSimpleSplitter 用于将 md 按照 headline 的outline拆分,同时 把<!--key: value --> 解析到metadata中
- public class SimpleMdSplitter extends BaseMdParser implements ISplitter {
- private static final int MIN_PARAGRAPH_LEN = 32;
- protected IdGenerator idGen = new UlidGenerator();
- protected ICommentProcessor commentProcessor;
- protected BaseParameters params;
- public SimpleMdSplitter() {
- }
- public void setIdGenerator(IdGenerator idGen) {
- this.idGen = idGen;
- }
- public void setCommentProcessor(ICommentProcessor commentProcessor) {
- this.commentProcessor = commentProcessor;
- }
- private void mergeMap(Map<String, Object> dest, Map<String, Object> src) {
- if (src == null || src.size() <= 0) return;
- src.forEach((k, v) -> {
- Object o = dest.get(k);
- if (o == null) {
- dest.put(k, v);
- } else {
- if (o instanceof List) {
- List ls = (List) o;
- ls.add(v);
- dest.put(k, ls);
- } else {
- List ls = new ArrayList();
- ls.add(o);
- ls.add(v);
- dest.put(k, ls);
- }
- }
- });
- }
- protected Map<String, Object> processComment(int beginIdx, int endIdx) {
- Map<String, Object> rets = new LinkedHashMap<>();
- if (commentProcessor != null) {
- if (beginIdx >= 1) {
- GTNode2 nd = nodes[beginIdx - 1];
- if (nd.getType() == GTBookConstants.MD_COMMENTS) {
- Map<String, Object> meta1 = processComment1(beginIdx - 1, beginIdx);
- mergeMap(rets, meta1);
- }
- }
- for (int i = beginIdx; i < endIdx; i++) {
- GTNode2 nd = nodes[i];
- if (nd.getType() == GTBookConstants.MD_COMMENTS) {
- Map<String, Object> meta1 = processComment1(i, i + 1);
- mergeMap(rets, meta1);
- }
- }
- }
- return rets;
- }
- protected Map<String, Object> processComment1(int beginIdx, int endIdx) {
- List<String> hs = getHtmls(beginIdx, endIdx);
- if (hs != null && hs.size() > 0) {
- List<String> hs2 = SplitUtils.mdStringToList(hs);
- Map<String, Object> meta1 = HtmlCommentParser.parseKv(hs2);
- return meta1;
- } else {
- return Map.of();
- }
- }
- protected Map<String, Object> getMetadata(int beginIdx, int endIdx) {
- Map<String, Object> metadata = new LinkedHashMap<>();
- Object o = params.get(GDocConstants.FILE_NAME);
- if (o != null) {
- metadata.put(GDocConstants.FILE_NAME, o);
- }
- metadata.put(GDocConstants.FROM_IDX, beginIdx);
- metadata.put(GDocConstants.TO_IDX, endIdx);
- Map<String, Object> cm = processComment(beginIdx, endIdx);
- if (cm != null) {
- metadata.putAll(cm);
- }
- return metadata;
- }
- @Override
- public List<GDocument> split(String md, BaseParameters params) {
- this.params = params;
- List<GDocument> docs = new ArrayList<>();
- parse(md, params);
- root.countingTextSize();
- int totalSize = root.countingTextSize();
- int[] depthes = docTree.getDepthes();
- int[] fathers = docTree.getFathers();
- //List<String> outline = getOutline(true);
- //System.out.println(outline);
- int docLen = depthes.length;
- int i = 0;
- int nowJ = 0;
- while (i < docLen) {
- GTNode2 nd = nodes[i];
- //int blockSize = nd.getBlockSize();
- if (nd.getType() == GTBookConstants.MD_HEADING) {
- int nextJ = docLen;
- for (int j = i + 1; j < docLen; j++) {
- if (nodes[j].getType() == GTBookConstants.MD_HEADING) {
- nextJ = j;
- break;
- }
- }
- // i..nextJ 之间 的 text 是否大于 minChunkLen
- int textLen = getTextLen(nowJ + 1, nextJ);
- if (textLen > MIN_PARAGRAPH_LEN) {
- List<String> fatherLabels = getFatherLabels(nowJ);
- //List<String> htmls = getHtmls(nowJ, nextJ);
- List<String> texts = getTexts(nowJ, nextJ);
- GDocument.Builder builder = GDocument.builder().id(idGen.generateId()).text(SplitUtils.listToString(texts));
- builder.metadata(GDocConstants.CURRENT_SECTION_TOC, fatherLabels);
- //builder.metadata(GDocConstants.RAW_CONTENT, SplitUtils.listToString(htmls));
- builder.metadata(GDocConstants.FIRST_HEADING, nd.getLabel());
- getCharOffset(builder, nowJ, nextJ);
- Map<String, Object> metadata = getMetadata(nowJ, nextJ);
- metadata.forEach((k, v) -> builder.metadata(k, v));
- GDocument doc = builder.build();
- docs.add(doc);
- //Object o = doc.getMetadata().get(GDocConstants.RAW_CONTENT);
- //System.out.println(o);
- nowJ = nextJ;
- }
- i = nextJ;
- } else {
- i++;
- }
- }
- // 没有任何标题
- if (docs.size() <= 0 && i > 0) {
- List<String> texts = getTexts(0, docLen);
- GDocument.Builder builder = GDocument.builder().id(idGen.generateId()).text(SplitUtils.listToString(texts));
- builder.metadata(GDocConstants.CURRENT_SECTION_TOC, List.of(""));
- //builder.metadata(GDocConstants.FIRST_HEADING, nd.getLabel());
- getCharOffset(builder, 0, docLen);
- Map<String, Object> metadata = getMetadata(0, docLen);
- metadata.forEach((k, v) -> builder.metadata(k, v));
- GDocument doc = builder.build();
- docs.add(doc);
- }
- return docs;
- }
- protected void getCharOffset(GDocument.Builder builder, int startIndex, int endIndex) {
- String text = docTree.getText();
- Node mdNode = nodes[startIndex].getMdNode();
- int startOffset = mdNode.getStartOffset();
- int endOffset = mdNode.getEndOffset();
- if (endIndex < nodes.length) {
- mdNode = nodes[endIndex].getMdNode();
- endOffset = mdNode.getStartOffset();
- } else {
- endOffset = text.length();
- }
- String s = text.substring(startOffset, endOffset);
- //System.out.println("s="+s);
- builder.metadata(GDocConstants.RAW_CONTENT, s);
- builder.metadata(GDocConstants.START_OFFSET, startOffset);
- builder.metadata(GDocConstants.END_OFFSET, endOffset);
- }
- private int getNodeTextLength(int index) {
- int len = ((IElement) nodes[index].getValue()).getText().get(0).length();
- return len;
- }
- protected int getTextLen(int startIndex, int endIndex) {
- int count = 0;
- for (int i = startIndex; i < endIndex; i++) {
- count = count + getNodeTextLength(i);
- }
- return count;
- }
- protected List<String> getHtmls(int startIndex, int endIndex) {
- List<String> htmls = new ArrayList<>();
- for (int i = startIndex; i < endIndex; i++) {
- IElement value = (IElement) nodes[i].getValue();
- List<String> html = value.getHtml();
- htmls.addAll(html);
- }
- return htmls;
- }
- protected List<String> getTexts(int startIndex, int endIndex) {
- List<String> texts = new ArrayList<>();
- for (int i = startIndex; i < endIndex; i++) {
- List<String> txt = ((IElement) nodes[i].getValue()).getText();
- texts.addAll(txt);
- }
- return texts;
- }
- // 从上至下
- protected List<String> getFatherLabels(int index) {
- int[] fathers = docTree.getFather(index);
- List<String> labels = new ArrayList<>();
- if (fathers != null && fathers.length > 0) {
- for (int i = fathers.length - 1; i >= 0; i--) {
- labels.add(nodes[fathers[i]].getLabel());
- }
- }
- return labels;
- }
- protected void preProcess(Path path, BaseParameters params) {
- if (params.get(GDocConstants.FILE_NAME) == null) {
- params.put(GDocConstants.FILE_NAME, SplitUtils.getFileBaseName(path));
- }
- }
- // 留做扩展
- protected void postProcess(List<GDocument> docs) {
- }
- @Override
- public List<GDocument> split(Path path, BaseParameters params) throws IOException {
- preProcess(path, params);
- String md = Files.readString(path);
- List<GDocument> rets = split(md, params);
- postProcess(rets);
- return rets;
- }
- // public static void main(String[] args) throws IOException {
- // String md = "D:\\testdata\\md\\官网银行卡知识.1.md";
- // md = "D:\\testdata\\md\\海底捞员工手册.md";
- // md = "D:\\testdata\\md\\列表1.md";
- // md = "D:\\data\\乙烯\\乙烯装置技术与运行测试.md";
- // md = "D:\\data\\乙烯\\乙烯1.md";
- // String json = md + ".json";
- // SimpleMdSplitter splitter = new SimpleMdSplitter();
- // splitter.setCommentProcessor(KvCommentProcessor.build());
- // BaseParameters params = BaseParameters.defaultParams();
- // Path path = Paths.get(md);
- // //String baseName = SplitUtils.getFileBaseName(path);
- // //System.out.println("baseName=" + baseName);
- // List<GDocument> docs = splitter.split(path, params);
- // SplitUtils.toJsonFile(new File(json), docs);
- // }
- }
|