| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117 |
- package org.cnnlp.data.splitter;
- import org.cnnlp.data.document.GDocument;
- import org.cnnlp.data.util.BaseParameters;
- import java.io.File;
- import java.io.IOException;
- import java.nio.file.Files;
- import java.nio.file.Path;
- import java.nio.file.Paths;
- import java.util.List;
- import java.util.stream.Stream;
- public class SplitTools {
- public SplitTools() {
- }
- public void splitFaq(Path path) throws IOException {
- System.out.println("Process " + path.toString());
- String json = path.toString() + ".json";
- FaqMdSplitter splitter = new FaqMdSplitter();
- BaseParameters params = BaseParameters.defaultParams();
- List<GDocument> docs = splitter.split(path, params);
- SplitUtils.toJsonFile(new File(json), docs);
- }
- public static boolean isMd(Path p) {
- if (Files.isRegularFile(p)) {
- boolean b = p.toString().endsWith(".md");
- return b;
- }
- return false;
- }
- public void splitSimple(Path path) throws IOException {
- System.out.println("Process " + path.toString());
- String json = path.toString() + ".json";
- SimpleMdSplitter splitter = new SimpleMdSplitter();
- splitter.setCommentProcessor(KvCommentProcessor.build());
- BaseParameters params = BaseParameters.defaultParams();
- // Path path = Paths.get(md);
- //String baseName = SplitUtils.getFileBaseName(path);
- //System.out.println("baseName=" + baseName);
- List<GDocument> docs = splitter.split(path, params);
- SplitUtils.toJsonFile(new File(json), docs);
- }
- public void splitSimpleDir(Path p) throws IOException {
- Stream<Path> fl = Files.list(p);
- fl.forEach(p1 -> {
- try {
- // if (isMd(f)) {
- // tools.splitSimple(f);
- // }
- //System.out.println(p1);
- File file = p1.toFile();
- if (file.isDirectory()) {
- splitSimpleDir(p1);
- } else {
- System.out.println(p1);
- splitSimple(p1);
- }
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- });
- }
- public static void main(String[] args) throws IOException {
- String md = "D:\\testdata\\md\\官网银行卡知识.1.md";
- md = "D:\\testdata\\md\\海底捞员工手册.md";
- md = "D:\\testdata\\md\\列表1.md";
- md = "D:\\data\\乙烯\\乙烯装置技术与运行测试.md";
- md = "D:\\data\\乙烯\\乙烯1.md";
- md = "D:\\data\\乙烯\\催化裂化操作规程2024.4.28.md";
- // String json = md + ".json";
- SplitTools tools = new SplitTools();
- // tools.splitSimple(Paths.get(md));
- // String p = "D:\\data\\乙烯\\target";
- // tools.splitSimpleDir(Paths.get(p));
- String p = "D:\\data\\乙烯\\规章制度\\中国石油四川石化有限责任公司原油采购管理办法.md";
- tools.splitSimple(Paths.get(p));
- // String dir = "D:\\data\\乙烯\\0624\\安全技术说明书完善\\";
- // tools.splitSimpleDir(Paths.get(dir));
- // String p = "D:\\data\\乙烯\\2\\";
- // Stream<Path> fl = Files.list(Path.of(p));
- // fl.forEach(f->{
- // try {
- // if (isMd(f)){
- // tools.splitSimple(f);
- // }
- // } catch (IOException e) {
- // throw new RuntimeException(e);
- // }
- // });
- //
- // String p2 = "D:\\data\\乙烯\\2\\faq\\";
- // Stream<Path> fl2 = Files.list(Path.of(p2));
- // fl2.forEach(f->{
- // try {
- // if (isMd(f)){
- // tools.splitFaq(f);
- // }
- // } catch (IOException e) {
- // throw new RuntimeException(e);
- // }
- // });
- }
- }
|