3-d中的点由(x,y,z)定义.任何两个点(X,Y,Z)和(x,y,z)之间的距离d是d = Sqrt [(Xx)^ 2 +(Yy)^ 2 +(Zz)^ 2].现在文件中有一百万个条目,每个条目都是空间中的某个点,没有特定的顺序.给定任意点(a,b,c)找到最近的10个点.您将如何存储百万点以及如何从该数据结构中检索这10个点.
百万分是一个小数字.最简单的方法在这里工作(基于KDTree的代码较慢(仅查询一个点)).
#!/usr/bin/env python
import numpy
NDIM = 3 # number of dimensions
# read points into array
a = numpy.fromfile('million_3D_points.txt', sep=' ')
a.shape = a.size / NDIM, NDIM
point = numpy.random.uniform(0, 100, NDIM) # choose random point
print 'point:', point
d = ((a-point)**2).sum(axis=1) # compute distances
ndx = d.argsort() # indirect sort
# print 10 nearest points to the chosen one
import pprint
pprint.pprint(zip(a[ndx[:10]], d[ndx[:10]]))
运行:
$ time python nearest.py point: [ 69.06310224 2.23409409 50.41979143] [(array([ 69., 2., 50.]), 0.23500677815852947), (array([ 69., 2., 51.]), 0.39542392750839772), (array([ 69., 3., 50.]), 0.76681859086988302), (array([ 69., 3., 50.]), 0.76681859086988302), (array([ 69., 3., 51.]), 0.9272357402197513), (array([ 70., 2., 50.]), 1.1088022980015722), (array([ 70., 2., 51.]), 1.2692194473514404), (array([ 70., 2., 51.]), 1.2692194473514404), (array([ 70., 3., 51.]), 1.801031260062794), (array([ 69., 1., 51.]), 1.8636121147970444)] real 0m1.122s user 0m1.010s sys 0m0.120s
这是生成百万个3D点的脚本:
#!/usr/bin/env python import random for _ in xrange(10**6): print ' '.join(str(random.randrange(100)) for _ in range(3))
输出:
$ head million_3D_points.txt 18 56 26 19 35 74 47 43 71 82 63 28 43 82 0 34 40 16 75 85 69 88 58 3 0 63 90 81 78 98
您可以使用该代码来测试更复杂的数据结构和算法(例如,它们实际上是消耗更少的内存还是比上述最简单的方法更快).值得注意的是,目前它是唯一包含工作代码的答案.
#!/usr/bin/env python
import numpy
NDIM = 3 # number of dimensions
# read points into array
a = numpy.fromfile('million_3D_points.txt', sep=' ')
a.shape = a.size / NDIM, NDIM
point = [ 69.06310224, 2.23409409, 50.41979143] # use the same point as above
print 'point:', point
from scipy.spatial import KDTree
# find 10 nearest points
tree = KDTree(a, leafsize=a.shape[0]+1)
distances, ndx = tree.query([point], k=10)
# print 10 nearest points to the chosen one
print a[ndx]
运行:
$ time python nearest_kdtree.py point: [69.063102240000006, 2.2340940900000001, 50.419791429999997] [[[ 69. 2. 50.] [ 69. 2. 51.] [ 69. 3. 50.] [ 69. 3. 50.] [ 69. 3. 51.] [ 70. 2. 50.] [ 70. 2. 51.] [ 70. 2. 51.] [ 70. 3. 51.] [ 69. 1. 51.]]] real 0m1.359s user 0m1.280s sys 0m0.080s
// $ g++ nearest.cc && (time ./a.out < million_3D_points.txt )
#include
#include
#include
#include // _1
#include // bind()
#include
namespace {
typedef double coord_t;
typedef boost::tuple point_t;
coord_t distance_sq(const point_t& a, const point_t& b) { // or boost::geometry::distance
coord_t x = a.get<0>() - b.get<0>();
coord_t y = a.get<1>() - b.get<1>();
coord_t z = a.get<2>() - b.get<2>();
return x*x + y*y + z*z;
}
}
int main() {
using namespace std;
using namespace boost::lambda; // _1, _2, bind()
// read array from stdin
vector points;
cin.exceptions(ios::badbit); // throw exception on bad input
while(cin) {
coord_t x,y,z;
cin >> x >> y >> z;
points.push_back(boost::make_tuple(x,y,z));
}
// use point value from previous examples
point_t point(69.06310224, 2.23409409, 50.41979143);
cout << "point: " << point << endl; // 1.14s
// find 10 nearest points using partial_sort()
// Complexity: O(N)*log(m) comparisons (O(N)*log(N) worst case for the implementation)
const size_t m = 10;
partial_sort(points.begin(), points.begin() + m, points.end(),
bind(less(), // compare by distance to the point
bind(distance_sq, _1, point),
bind(distance_sq, _2, point)));
for_each(points.begin(), points.begin() + m, cout << _1 << "\n"); // 1.16s
}
运行:
g++ -O3 nearest.cc && (time ./a.out < million_3D_points.txt ) point: (69.0631 2.23409 50.4198) (69 2 50) (69 2 51) (69 3 50) (69 3 50) (69 3 51) (70 2 50) (70 2 51) (70 2 51) (70 3 51) (69 1 51) real 0m1.152s user 0m1.140s sys 0m0.010s
#include // make_heap
#include // binary_function<>
#include
#include // boost::begin(), boost::end()
#include // get<>, tuple<>, cout <<
namespace {
typedef double coord_t;
typedef std::tr1::tuple point_t;
// calculate distance (squared) between points `a` & `b`
coord_t distance_sq(const point_t& a, const point_t& b) {
// boost::geometry::distance() squared
using std::tr1::get;
coord_t x = get<0>(a) - get<0>(b);
coord_t y = get<1>(a) - get<1>(b);
coord_t z = get<2>(a) - get<2>(b);
return x*x + y*y + z*z;
}
// read from input stream `in` to the point `point_out`
std::istream& getpoint(std::istream& in, point_t& point_out) {
using std::tr1::get;
return (in >> get<0>(point_out) >> get<1>(point_out) >> get<2>(point_out));
}
// Adaptable binary predicate that defines whether the first
// argument is nearer than the second one to given reference point
template
class less_distance : public std::binary_function {
const T& point;
public:
less_distance(const T& reference_point) : point(reference_point) {}
bool operator () (const T& a, const T& b) const {
return distance_sq(a, point) < distance_sq(b, point);
}
};
}
int main() {
using namespace std;
// use point value from previous examples
point_t point(69.06310224, 2.23409409, 50.41979143);
cout << "point: " << point << endl;
const size_t nneighbours = 10; // number of nearest neighbours to find
point_t points[nneighbours+1];
// populate `points`
for (size_t i = 0; getpoint(cin, points[i]) && i < nneighbours; ++i)
;
less_distance less_distance_point(point);
make_heap (boost::begin(points), boost::end(points), less_distance_point);
// Complexity: O(N*log(m))
while(getpoint(cin, points[nneighbours])) {
// add points[-1] to the heap; O(log(m))
push_heap(boost::begin(points), boost::end(points), less_distance_point);
// remove (move to last position) the most distant from the
// `point` point; O(log(m))
pop_heap (boost::begin(points), boost::end(points), less_distance_point);
}
// print results
push_heap (boost::begin(points), boost::end(points), less_distance_point);
// O(m*log(m))
sort_heap (boost::begin(points), boost::end(points), less_distance_point);
for (size_t i = 0; i < nneighbours; ++i) {
cout << points[i] << ' ' << distance_sq(points[i], point) << '\n';
}
}
运行:
$ g++ -O3 nearest.cc && (time ./a.out < million_3D_points.txt ) point: (69.0631 2.23409 50.4198) (69 2 50) 0.235007 (69 2 51) 0.395424 (69 3 50) 0.766819 (69 3 50) 0.766819 (69 3 51) 0.927236 (70 2 50) 1.1088 (70 2 51) 1.26922 (70 2 51) 1.26922 (70 3 51) 1.80103 (69 1 51) 1.86361 real 0m1.174s user 0m1.180s sys 0m0.000s
// $ g++ -O3 nearest.cc && (time ./a.out < million_3D_points.txt )
#include // sort
#include // binary_function<>
#include
#include
#include // begin(), end()
#include // get<>, tuple<>, cout <<
#define foreach BOOST_FOREACH
namespace {
typedef double coord_t;
typedef std::tr1::tuple point_t;
// calculate distance (squared) between points `a` & `b`
coord_t distance_sq(const point_t& a, const point_t& b);
// read from input stream `in` to the point `point_out`
std::istream& getpoint(std::istream& in, point_t& point_out);
// Adaptable binary predicate that defines whether the first
// argument is nearer than the second one to given reference point
class less_distance : public std::binary_function {
const point_t& point;
public:
explicit less_distance(const point_t& reference_point)
: point(reference_point) {}
bool operator () (const point_t& a, const point_t& b) const {
return distance_sq(a, point) < distance_sq(b, point);
}
};
}
int main() {
using namespace std;
// use point value from previous examples
point_t point(69.06310224, 2.23409409, 50.41979143);
cout << "point: " << point << endl;
less_distance nearer(point);
const size_t nneighbours = 10; // number of nearest neighbours to find
point_t points[nneighbours];
// populate `points`
foreach (point_t& p, points)
if (! getpoint(cin, p))
break;
// Complexity: O(N*m)
point_t current_point;
while(cin) {
getpoint(cin, current_point); //NOTE: `cin` fails after the last
//point, so one can't lift it up to
//the while condition
// move to the last position the most distant from the
// `point` point; O(m)
foreach (point_t& p, points)
if (nearer(current_point, p))
// found point that is nearer to the `point`
//NOTE: could use insert (on sorted sequence) & break instead
//of swap but in that case it might be better to use
//heap-based algorithm altogether
std::swap(current_point, p);
}
// print results; O(m*log(m))
sort(boost::begin(points), boost::end(points), nearer);
foreach (point_t p, points)
cout << p << ' ' << distance_sq(p, point) << '\n';
}
namespace {
coord_t distance_sq(const point_t& a, const point_t& b) {
// boost::geometry::distance() squared
using std::tr1::get;
coord_t x = get<0>(a) - get<0>(b);
coord_t y = get<1>(a) - get<1>(b);
coord_t z = get<2>(a) - get<2>(b);
return x*x + y*y + z*z;
}
std::istream& getpoint(std::istream& in, point_t& point_out) {
using std::tr1::get;
return (in >> get<0>(point_out) >> get<1>(point_out) >> get<2>(point_out));
}
}
测量表明,大部分时间都花在从文件中读取数组,实际计算花费的时间要少一些.
如果百万条目已经存在于文件中,则无需将它们全部加载到内存中的数据结构中.只需保留一个阵列,其中包含到目前为止找到的前十个点,并扫描超过百万个点,随时更新前十个列表.
这是点数的O(n).
您可以将点存储在k维树(kd-tree)中.Kd树针对最近邻搜索进行了优化(找到最接近给定点的n个点).
我认为这是一个棘手的问题,测试你是否不试图过分.
考虑一下上面人们已经给出的最简单的算法:保持一个十个最佳候选人的表格,并逐个查看所有点.如果你发现的距离比最好的十个中的任何一个都要近,那就换掉它.复杂性有多大?好吧,我们必须从文件中查看每个点一次,计算它的距离(或实际距离的平方)并与第10个最近点进行比较.如果它更好,请将它插入10-best-so-far表中的适当位置.
那么复杂性又是什么呢?我们一次看每个点,所以它是距离的计算和n次比较.如果该点更好,我们需要将其插入正确的位置,这需要更多的比较,但这是一个常数因素,因为最佳候选者表的大小恒定为10.
我们最终得到一个在线性时间内运行的算法,点数为O(n).
但现在考虑这种算法的下限是什么?如果输入数据中没有顺序,我们必须查看每个点以查看它是否不是最接近的点之一.因此,据我所知,下限是Omega(n),因此上述算法是最优的.
无需计算距离.距离的正方形应该满足您的需求.我认为应该更快.换句话说,你可以跳过这sqrt
一点.