文本比较算法
0
最近我是在网上看过一个例子,突然就有点兴趣了,于是乎自己就写了一个算法。
先列出网上的一个例子:http://wenbenbijiao.renrensousuo.com/#diff
这个是一个在线文本编辑的,但是感觉有一些问题。例如:
"A", "B", "C", "A", "C", "A", "D", "F", "A", "B", "C", "A", "C", "A", "D", "F"
"B", "C", "X", "C", "A", "D", "F", "E", "S", "B", "A", "B", "C", "A", "C", "A"
上面这两个文本,换行后比较结果:
明显这个是有点问题的。
于是我在网上看了一篇文章:http://blog.csdn.net/sunskyor/article/details/4491656
根据这篇文章里面的算法分析,实现了算法,分析出来的结果比较准确,但是多行以后效率实在是低下,贴出来希望大家能够给出一些意见优化:
package com.acgist.nlp.query.compare;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.commons.collections.CollectionUtils;
public class Path {
private List<String> source;
private List<String> target;
private boolean[][] map;
/**
* 路径
*/
private List<Points> points = new ArrayList<Points>();
public Path() {
}
public Path(List<String> source, List<String> target) {
this.source = source;
this.target = target;
}
public static void main(String[] args) {
long b = System.currentTimeMillis();
// Path path = new Path(Arrays.asList(new String[] {"B", "C", "X", "C", "A", "D", "F", "E", "S", "B", "A", "B", "C", "A", "C", "A"}), Arrays.asList(new String[]{"A", "B", "C", "A", "C", "A", "D", "F"}));
// Path path = new Path(Arrays.asList(new String[] {"A", "B", "C", "A", "C", "A", "D", "F"}), Arrays.asList(new String[]{"B", "C", "X", "C", "A", "D", "F", "E", "S", "B", "A", "B", "C", "A", "C", "A"}));
Path path = new Path(Arrays.asList(new String[]{"A", "B", "C", "A", "C", "A", "D", "F", "A", "B", "C", "A", "C", "A", "D", "F"}), Arrays.asList(new String[] {"B", "C", "X", "C", "A", "D", "F", "E", "S", "B", "A", "B", "C", "A", "C", "A"}));
path.execute();
long e = System.currentTimeMillis();
System.out.println("消耗时间:" + (e - b) + "ms");
}
/**
* 执行比较
*/
public void execute() {
if (CollectionUtils.isEmpty(source) || CollectionUtils.isEmpty(target))
return;
map = new boolean[this.source.size()][this.target.size()];
for (int x = 0; x < this.source.size(); x++) {
for (int y = 0; y < this.target.size(); y++) {
if (this.source.get(x) == this.target.get(y) || this.source.get(x).equals(this.target.get(y)))
map[x][y] = true;
else
map[x][y] = false;
}
}
print(map);
for (int x = 0; x < this.source.size(); x++) {
for (int y = 0; y < this.target.size(); y++) {
if(map[x][y])
this.points.add(find(x, y, new Points(), null));
}
}
Collections.sort(this.points, (Points a, Points b) -> {
if(a == null || b == null)
return 0;
if(a.getNumber() >= b.getNumber())
return 1;
return 0;
});
int maxNumber = this.points.get(0).getNumber();
List<Points> maxPoints = new ArrayList<Points>();
this.points.forEach(points -> {
if(points.getNumber() == maxNumber)
maxPoints.add(points);
else
return;
});
this.points.clear();
for (Points points : maxPoints) {
for (Point point : points.getPoints()) {
System.out.print(point.getX() + "=" + point.getY() + "=" + point.getSame() + "----------");
}
System.out.println();
}
Points points = optimal(maxPoints);
for(Point point : points.getPoints()) {
if(point.getSame())
System.out.println(this.source.get(point.getX()));
}
}
/**
* 计算最大路径
*/
public Points find(int x, int y, Points points, Points optimal) {
if(x == this.source.size() || y == this.target.size()) {
if(optimal == null)
return points;
else if(optimal.getNumber() < points.getNumber())
return points;
return optimal;
} else {
points.addPoint(new Point(x, y, map[x][y]));
if(map[x][y]) {
optimal = find(x + 1, y + 1, new Points(points.getNumber(), points.clonePoints()), optimal);
} else {
optimal = find(x, y + 1, new Points(points.getNumber(), points.clonePoints()), optimal);
optimal = find(x + 1, y, new Points(points.getNumber(), points.clonePoints()), optimal);
optimal = find(x + 1, y + 1, new Points(points.getNumber(), points.clonePoints()), optimal);
}
return optimal;
}
}
/**
* 打印二维数组
*/
public void print(boolean[][] map) {
for (boolean[] bs : map) {
System.out.print("----");
for (boolean b : bs) {
System.out.print((b ? "1" : "0") + "----");
}
System.out.println();
}
}
/**
* 计算最优路径
*/
public Points optimal(List<Points> maxPoints){
int index = 0;
float similarity = 0;
for (int i = 0; i < maxPoints.size(); i++) {
Points points = maxPoints.get(i);
float tmp = Float.valueOf(points.getNumber()) / Float.valueOf(points.getPoints().size());
if(tmp > similarity) {
similarity = tmp;
index = i;
}
}
return maxPoints.get(index);
}
}
class Points {
private int number;
private List<Point> points;
public Points() {
}
public Points(int number, List<Point> points) {
this.number = number;
this.points = points;
}
public void addPoint(Point point) {
if(points == null)
points = new ArrayList<>();
if(point.getSame()) {
number++;
}
points.add(point);
}
public void clear() {
this.number = 0;
this.points.clear();
}
public boolean isNotEmpty() {
return CollectionUtils.isNotEmpty(this.points);
}
public List<Point> clonePoints() {
return new ArrayList<>(points);
}
public int getNumber() {
return number;
}
public void setNumber(int number) {
this.number = number;
}
public List<Point> getPoints() {
return points;
}
public void setPoints(List<Point> points) {
this.points = points;
}
}
class Point {
private int x;
private int y;
private boolean same;
public Point() {
}
public Point(int x, int y, boolean same) {
this.x = x;
this.y = y;
this.same = same;
}
public int getX() {
return x;
}
public void setX(int x) {
this.x = x;
}
public int getY() {
return y;
}
public void setY(int y) {
this.y = y;
}
public boolean getSame() {
return same;
}
public void setSame(boolean same) {
this.same = same;
}
}
下面是比较的结果: