diff --git a/cpp_easygraph/CMakeLists.txt b/cpp_easygraph/CMakeLists.txt index 2a43c776..673c70fe 100644 --- a/cpp_easygraph/CMakeLists.txt +++ b/cpp_easygraph/CMakeLists.txt @@ -13,32 +13,55 @@ add_subdirectory(pybind11) option(EASYGRAPH_ENABLE_GPU "EASYGRAPH_ENABLE_GPU" OFF) -if (EASYGRAPH_ENABLE_GPU) - - pybind11_add_module(cpp_easygraph - ${SOURCES} - $ - ) - - set_property(TARGET cpp_easygraph PROPERTY CUDA_ARCHITECTURES all) +# Find OpenMP (optional) +find_package(OpenMP) +if(OpenMP_CXX_FOUND) + message(STATUS "Found OpenMP: ${OpenMP_CXX_VERSION}") +else() + message(STATUS "OpenMP not found; continuing without OpenMP") +endif() - target_compile_definitions(cpp_easygraph - PRIVATE EASYGRAPH_ENABLE_GPU - ) +# Find ARPACK (optional; do not fail if absent) +find_library(ARPACK_LIB NAMES arpack arpack_double) +if(ARPACK_LIB) + message(STATUS "Found ARPACK library: ${ARPACK_LIB}") + add_compile_definitions(HAVE_ARPACK=1) +else() + message(WARNING "ARPACK not found; ARPACK-based solvers will be disabled (fallback used).") + add_compile_definitions(HAVE_ARPACK=0) +endif() - target_link_libraries(cpp_easygraph - PRIVATE cudart_static - ) +find_package(LAPACK REQUIRED) +# Find Eigen (optional). Change to REQUIRED if Eigen must be enforced. +find_package(Eigen3 QUIET) +if(Eigen3_FOUND) + message(STATUS "Found Eigen3: ${Eigen3_INCLUDE_DIRS}") + add_compile_definitions(HAVE_EIGEN=1) else() + message(STATUS "Eigen3 not found; eigen-accelerated code will be disabled") + add_compile_definitions(HAVE_EIGEN=0) +endif() - pybind11_add_module(cpp_easygraph - ${SOURCES} - ) +# Link/include to cpp_easygraph only if the target exists. +# This avoids errors when the target is created elsewhere or later. +if(TARGET cpp_easygraph) + if(Eigen3_FOUND) + target_include_directories(cpp_easygraph PRIVATE ${Eigen3_INCLUDE_DIRS}) + if(TARGET Eigen3::Eigen) + target_link_libraries(cpp_easygraph PRIVATE Eigen3::Eigen) + endif() + endif() -endif() + if(OpenMP_CXX_FOUND) + target_link_libraries(cpp_easygraph PRIVATE OpenMP::OpenMP_CXX) + endif() -set_target_properties(cpp_easygraph PROPERTIES - LINK_SEARCH_START_STATIC ON - LINK_SEARCH_END_STATIC ON -) \ No newline at end of file + if(ARPACK_LIB) + target_link_libraries(cpp_easygraph PRIVATE ${ARPACK_LIB} LAPACK::LAPACK) + else() + target_link_libraries(cpp_easygraph PRIVATE LAPACK::LAPACK) + endif() +else() + message(STATUS "cpp_easygraph target not defined yet; optional libs will be linked after target creation.") +endif() \ No newline at end of file diff --git a/cpp_easygraph/cpp_easygraph.cpp b/cpp_easygraph/cpp_easygraph.cpp index f5570d6e..1733d7e0 100644 --- a/cpp_easygraph/cpp_easygraph.cpp +++ b/cpp_easygraph/cpp_easygraph.cpp @@ -83,6 +83,7 @@ PYBIND11_MODULE(cpp_easygraph, m) { m.def("cpp_constraint", &constraint, py::arg("G"), py::arg("nodes") = py::none(), py::arg("weight") = py::none(), py::arg("n_workers") = py::none()); m.def("cpp_effective_size", &effective_size, py::arg("G"), py::arg("nodes") = py::none(), py::arg("weight") = py::none(), py::arg("n_workers") = py::none()); m.def("cpp_efficiency", &efficiency, py::arg("G"), py::arg("nodes") = py::none(), py::arg("weight") = py::none(), py::arg("n_workers") = py::none()); + m.def("cpp_eigenvector_centrality", &cpp_eigenvector_centrality, py::arg("G"), py::arg("max_iter") = 100, py::arg("tol") = 1.0e-6, py::arg("nstart") = py::none(), py::arg("weight") = py::none()); m.def("cpp_hierarchy", &hierarchy, py::arg("G"), py::arg("nodes") = py::none(), py::arg("weight") = py::none(), py::arg("n_workers") = py::none()); m.def("cpp_pagerank", &_pagerank, py::arg("G"), py::arg("alpha") = 0.85, py::arg("max_iterator") = 500, py::arg("threshold") = 1e-6); m.def("cpp_dijkstra_multisource", &_dijkstra_multisource, py::arg("G"), py::arg("sources"), py::arg("weight") = "weight", py::arg("target") = py::none()); diff --git a/cpp_easygraph/functions/centrality/centrality.h b/cpp_easygraph/functions/centrality/centrality.h index 7040618a..121c9eb0 100644 --- a/cpp_easygraph/functions/centrality/centrality.h +++ b/cpp_easygraph/functions/centrality/centrality.h @@ -12,4 +12,11 @@ py::object cpp_katz_centrality( py::object py_max_iter, py::object py_tol, py::object py_normalized +); +py::object cpp_eigenvector_centrality( + py::object G, + py::object py_max_iter, + py::object py_tol, + py::object py_nstart, + py::object py_weight ); \ No newline at end of file diff --git a/cpp_easygraph/functions/centrality/closeness.cpp b/cpp_easygraph/functions/centrality/closeness.cpp index 524532fe..24134470 100644 --- a/cpp_easygraph/functions/centrality/closeness.cpp +++ b/cpp_easygraph/functions/centrality/closeness.cpp @@ -7,44 +7,186 @@ #include "../../classes/graph.h" #include "../../common/utils.h" #include "../../classes/linkgraph.h" -#include "../../classes/segment_tree.cpp" -double closeness_dijkstra(const Graph_L& G_l, const int &S, int cutoff, Segment_tree_zkw& segment_tree_zkw){ - const int dis_inf = 0x3f3f3f3f; +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +// Heap node: use negative value + max heap to implement min heap +typedef std::pair HeapNode; + +// Optimized adjacency list cache +struct FastAdjCache { + std::vector neighbor_ptrs; + std::vector neighbor_counts; + std::vector weight_ptrs; + std::vector> neighbor_storage; + std::vector> weight_storage; + + void init(int N) { + neighbor_ptrs.resize(N + 1, nullptr); + neighbor_counts.resize(N + 1, 0); + neighbor_storage.resize(N + 1); + } + + void init_with_weights(int N) { + init(N); + weight_ptrs.resize(N + 1, nullptr); + weight_storage.resize(N + 1); + } + + inline void build_if_needed(int u, const std::vector& head, + const std::vector& edges, bool store_weights) { + if (neighbor_ptrs[u] != nullptr) return; + + std::vector& neis = neighbor_storage[u]; + for (int p = head[u]; p != -1; p = edges[p].next) { + neis.push_back(edges[p].to); + if (store_weights) { + weight_storage[u].push_back(edges[p].w); + } + } + + neighbor_counts[u] = neis.size(); + neighbor_ptrs[u] = neis.data(); + if (store_weights) { + weight_ptrs[u] = weight_storage[u].data(); + } + } + + inline int* get_neighbors_ptr(int u) const { return neighbor_ptrs[u]; } + inline int get_neighbor_count(int u) const { return neighbor_counts[u]; } + inline float* get_weights_ptr(int u) const { return weight_ptrs[u]; } +}; + +// BFS implementation - directly use raw adjacency list +double closeness_bfs_direct(const Graph_L& G_l, const int &S, int cutoff, + std::vector& already_counted, + std::vector& queue_storage, + int timestamp) { int N = G_l.n; - segment_tree_zkw.init(N); - std::vector dis(N+1, INT_MAX); const std::vector& E = G_l.edges; const std::vector& head = G_l.head; - int number_connected = 0; + + int nodes_reached = 0; long long sum_dis = 0; - dis[S] = 0; - segment_tree_zkw.change(S, 0); - while(segment_tree_zkw.t[1] != dis_inf) { - int u = segment_tree_zkw.num[1]; - if(u == 0) break; - segment_tree_zkw.change(u, dis_inf); - if (cutoff >= 0 && dis[u] > cutoff){ + + queue_storage.clear(); + + int queue_front = 0; + already_counted[S] = timestamp; + queue_storage.push_back(S); + queue_storage.push_back(0); + + while (queue_front < static_cast(queue_storage.size())) { + int u = queue_storage[queue_front++]; + int actdist = queue_storage[queue_front++]; + + if (cutoff >= 0 && actdist > cutoff) { continue; } - number_connected += 1; - sum_dis += dis[u]; - for(int p = head[u]; p != -1; p = E[p].next) { + + sum_dis += actdist; + nodes_reached++; + + for (int p = head[u]; p != -1; p = E[p].next) { int v = E[p].to; - if(cutoff >= 0 && (dis[u] + E[p].w) > cutoff){ + + if (already_counted[v] == timestamp) { continue; } - if (dis[v] > dis[u] + E[p].w) { - dis[v] = dis[u] + E[p].w; - segment_tree_zkw.change(v, dis[v]); - } + + already_counted[v] = timestamp; + queue_storage.push_back(v); + queue_storage.push_back(actdist + 1); } } - if (number_connected == 1) + + if (nodes_reached == 1) return 0.0; else - return 1.0 * (number_connected - 1) * (number_connected - 1) / ((N - 1) * sum_dis); + return 1.0 * (nodes_reached - 1) * (nodes_reached - 1) / ((N - 1) * sum_dis); +} +// Check if the graph is unweighted +inline bool is_unweighted_graph(const Graph_L& G_l) { + const std::vector& E = G_l.edges; + for (const auto& edge : E) { + if (std::abs(edge.w - 1.0f) > 1e-9) { + return false; + } + } + return true; +} + +// Dijkstra implementation - use on-demand adjacency cache +double closeness_dijkstra_cached(const Graph_L& G_l, const int &S, int cutoff, + std::vector& dist, + std::vector& which, + FastAdjCache& cache, + int timestamp) { + int N = G_l.n; + const std::vector& E = G_l.edges; + const std::vector& head = G_l.head; + + int nodes_reached = 0; + double sum_dis = 0.0; + + std::priority_queue heap; + + dist[S] = 1.0f; + which[S] = timestamp; + heap.push({-1.0f, S}); + + while (!heap.empty()) { + HeapNode top = heap.top(); + heap.pop(); + float mindist = -top.first; + int minnei = top.second; + + if (mindist > dist[minnei]) { + continue; + } + + float actual_dist = mindist - 1.0f; + if (cutoff >= 0 && actual_dist > cutoff) { + continue; + } + + sum_dis += actual_dist; + nodes_reached++; + + cache.build_if_needed(minnei, head, E, true); + + int* neis = cache.get_neighbors_ptr(minnei); + float* ws = cache.get_weights_ptr(minnei); + int nlen = cache.get_neighbor_count(minnei); + + for (int j = 0; j < nlen; j++) { + int to = neis[j]; + float altdist = mindist + ws[j]; + float curdist = dist[to]; + + if (which[to] != timestamp) { + which[to] = timestamp; + dist[to] = altdist; + heap.push({-altdist, to}); + } else if (curdist == 0.0f || altdist < curdist) { + dist[to] = altdist; + heap.push({-altdist, to}); + } + } + } + + if (nodes_reached == 1) + return 0.0; + else + return 1.0 * (nodes_reached - 1) * (nodes_reached - 1) / ((N - 1) * sum_dis); } static py::object invoke_cpp_closeness_centrality(py::object G, py::object weight, @@ -58,27 +200,115 @@ static py::object invoke_cpp_closeness_centrality(py::object G, py::object weigh if (!cutoff.is_none()){ cutoff_ = cutoff.cast(); } - Segment_tree_zkw segment_tree_zkw(N); + + // Auto algorithm selection + bool use_bfs = (weight.is_none() || is_unweighted_graph(G_l)); + std::vector CC; + if(!sources.is_none()){ py::list sources_list = py::list(sources); int sources_list_len = py::len(sources_list); + CC.resize(sources_list_len); + + // Collect all source node IDs + std::vector source_ids(sources_list_len); for(int i = 0; i < sources_list_len; i++){ if(G_.node_to_id.attr("get")(sources_list[i],py::none()).is_none()){ printf("The node should exist in the graph!"); return py::none(); } - node_t source_id = G_.node_to_id.attr("get")(sources_list[i]).cast(); - float res = closeness_dijkstra(G_l, source_id, cutoff_,segment_tree_zkw); - CC.push_back(res); + source_ids[i] = G_.node_to_id.attr("get")(sources_list[i]).cast(); + } + + // OpenMP parallel computation + // Only enable parallelism when sources are many to avoid overhead + #pragma omp parallel if(sources_list_len > 100) + { + // Per-thread data structures (avoid race conditions) + std::vector already_counted(N + 1, 0); + std::vector queue_storage; + queue_storage.reserve(N * 2); + + std::vector dist(N + 1, 0.0f); + std::vector which(N + 1, 0); + + FastAdjCache cache; + if (!use_bfs) { + cache.init_with_weights(N); + } + + // Assign unique timestamp start for each thread + #ifdef _OPENMP + int thread_id = omp_get_thread_num(); + int num_threads = omp_get_num_threads(); + int timestamp = thread_id * sources_list_len; + #else + int timestamp = 0; + #endif + + // Parallel loop: each thread handles different source node + #pragma omp for schedule(dynamic, 1) + for(int i = 0; i < sources_list_len; i++){ + timestamp++; + double res; + if (use_bfs) { + res = closeness_bfs_direct(G_l, source_ids[i], cutoff_, + already_counted, queue_storage, timestamp); + } else { + res = closeness_dijkstra_cached(G_l, source_ids[i], cutoff_, + dist, which, cache, timestamp); + } + CC[i] = res; + } } } else{ - for(int i = 1; i <= N; i++){ - float res = closeness_dijkstra(G_l, i, cutoff_,segment_tree_zkw); - CC.push_back(res); + CC.resize(N); + + // OpenMP parallel computation for all nodes + // Only enable parallelism when node count is large + #pragma omp parallel if(N > 100) + { + // Per-thread data structures + std::vector already_counted(N + 1, 0); + std::vector queue_storage; + queue_storage.reserve(N * 2); + + std::vector dist(N + 1, 0.0f); + std::vector which(N + 1, 0); + + FastAdjCache cache; + if (!use_bfs) { + cache.init_with_weights(N); + } + + // Assign unique timestamp start for each thread + #ifdef _OPENMP + int thread_id = omp_get_thread_num(); + int num_threads = omp_get_num_threads(); + int timestamp = thread_id * N; + #else + int timestamp = 0; + #endif + + // Parallel loop: dynamic scheduling for load balancing + #pragma omp for schedule(dynamic, 10) + for(int i = 1; i <= N; i++){ + timestamp++; + double res; + if (use_bfs) { + res = closeness_bfs_direct(G_l, i, cutoff_, + already_counted, queue_storage, timestamp); + } else { + res = closeness_dijkstra_cached(G_l, i, cutoff_, + dist, which, cache, timestamp); + } + CC[i - 1] = res; + } } } + py::array::ShapeContainer ret_shape{(int)CC.size()}; py::array_t ret(ret_shape, CC.data()); diff --git a/cpp_easygraph/functions/centrality/eigenvector.cpp b/cpp_easygraph/functions/centrality/eigenvector.cpp new file mode 100644 index 00000000..6d9eb669 --- /dev/null +++ b/cpp_easygraph/functions/centrality/eigenvector.cpp @@ -0,0 +1,700 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "centrality.h" +#include "../../classes/graph.h" + +#ifdef HAVE_EIGEN +#include +#include + #ifdef HAVE_EIGEN_SPARSE_SOLVER + #include + #endif +#endif + +namespace py = pybind11; + +extern "C" { + void dsaupd_(int*, char*, int*, char*, int*, double*, double*, int*, + double*, int*, int*, int*, double*, double*, int*, int*); + void dseupd_(int*, char*, int*, double*, double*, int*, double*, + char*, int*, char*, int*, double*, double*, int*, + double*, int*, int*, int*, double*, double*, int*, int*); + void dnaupd_(int*, char*, int*, char*, int*, double*, double*, int*, + double*, int*, int*, int*, double*, double*, int*, int*); + void dneupd_(int* rvec, char* howmny, int* select, + double* dr, double* di, double* z, int* ldz, + double* sigmar, double* sigmai, double* workev, + char* bmat, int* n, char* which, int* nev, + double* tol, double* resid, int* ncv, double* v, int* ldv, + int* iparam, int* ipntr, double* workd, double* workl, + int* lworkl, int* info); +} + +// CSRMatrix 稀疏矩阵结构体 +class CSRMatrix { +public: + std::vector indptr; + std::vector indices; + std::vector data; + int rows; + int cols; + + CSRMatrix() : rows(0), cols(0) {} + CSRMatrix(int r, int c) : rows(r), cols(c) { + indptr.assign(r + 1, 0); + } + + void reserve(size_t nnz) { + indices.reserve(nnz); + data.reserve(nnz); + } + + void multiply_inplace(const std::vector& vec, std::vector& result) const { + result.assign(rows, 0.0); + #pragma omp parallel for schedule(static,64) if(rows > 1000) + for (int i = 0; i < rows; ++i) { + double sum = 0.0; + for (int j = indptr[i]; j < indptr[i + 1]; ++j) { + sum += data[j] * vec[indices[j]]; + } + result[i] = sum; + } + } + + double estimate_norm() const { + double s = 0.0; + #pragma omp parallel for reduction(+:s) if(data.size() > 1000) + for (size_t i = 0; i < data.size(); ++i) s += data[i] * data[i]; + return std::sqrt(s); + } + + double rayleigh_quotient(const std::vector& x, const std::vector& Ax) const { + double num = 0.0, den = 0.0; + #pragma omp parallel for reduction(+:num,den) if(rows > 1000) + for (int i = 0; i < rows; ++i) { + num += Ax[i] * x[i]; + den += x[i] * x[i]; + } + return den > 0.0 ? num / den : 0.0; + } + +#ifdef HAVE_EIGEN + Eigen::SparseMatrix to_eigen() const { + Eigen::SparseMatrix M(rows, cols); + std::vector> trip; + trip.reserve(data.size()); + for (int i = 0; i < rows; ++i) { + for (int j = indptr[i]; j < indptr[i+1]; ++j) { + trip.emplace_back(i, indices[j], data[j]); + } + } + M.setFromTriplets(trip.begin(), trip.end()); + M.makeCompressed(); + return M; + } +#endif +}; + +// 前向声明 +std::vector power_iteration_chebyshev( + const CSRMatrix& A, + int max_iter, + double tol, + std::vector& x +); +double vector_norm(const std::vector& x); +void normalize_vector(std::vector& x, double norm); +double vector_diff_norm(const std::vector& x1, const std::vector& x2); + + + +// 构建转置稀疏矩阵 +CSRMatrix build_transpose_matrix(Graph& graph, const std::vector& nodes, const std::string& weight_key) { + try { + std::shared_ptr csr_ptr; + if (weight_key.empty()) { + csr_ptr = graph.gen_CSR(); + } else { + csr_ptr = graph.gen_CSR(weight_key); + } + + if (!csr_ptr) { + std::cerr << "[build_transpose_matrix] Error: gen_CSR returned nullptr" << std::endl; + const int n = static_cast(nodes.size()); + CSRMatrix A(n, n); + A.indptr.assign(n + 1, 0); + return A; + } + + const int n = static_cast(nodes.size()); + std::vector src_indptr; + std::vector src_indices; + std::vector src_data; + int rows = 0, cols = 0; + + // 只用V/E/unweighted_W/W_map + if (!csr_ptr->V.empty() && !csr_ptr->E.empty()) { + rows = static_cast(csr_ptr->V.size()) - 1; + cols = rows; + src_indptr = csr_ptr->V; + src_indices = csr_ptr->E; + if (weight_key.empty()) { + src_data = csr_ptr->unweighted_W.empty() ? + std::vector(csr_ptr->E.size(), 1.0) : + csr_ptr->unweighted_W; + } else { + auto it = csr_ptr->W_map.find(weight_key); + if (it != csr_ptr->W_map.end() && it->second) { + src_data = *(it->second); + } else { + src_data = std::vector(csr_ptr->E.size(), 1.0); + } + } + } else { + std::cerr << "[build_transpose_matrix] Warning: CSR data is empty (V size=" + << csr_ptr->V.size() << ")" << std::endl; + CSRMatrix A(n, n); + A.indptr.assign(n + 1, 0); + return A; + } + + if (src_indptr.size() < 2 || src_indices.empty()) { + std::cerr << "[build_transpose_matrix] Warning: Invalid CSR structure" << std::endl; + CSRMatrix A(n, n); + A.indptr.assign(n + 1, 0); + return A; + } + + // 转置 + CSRMatrix At(cols, rows); + At.indptr.assign(cols + 1, 0); + size_t nnz = src_indices.size(); + + for (size_t idx = 0; idx < nnz; ++idx) { + int col = src_indices[idx]; + if (col >= 0 && col < cols) { + At.indptr[col + 1]++; + } + } + for (int i = 0; i < cols; ++i) { + At.indptr[i + 1] += At.indptr[i]; + } + + At.indices.assign(nnz, 0); + At.data.assign(nnz, 0.0); + + std::vector cur_pos(At.indptr.begin(), At.indptr.end()); + + for (int r = 0; r < rows; ++r) { + int start = src_indptr[r]; + int end = (r + 1 < static_cast(src_indptr.size())) ? src_indptr[r + 1] : static_cast(nnz); + for (int p = start; p < end; ++p) { + if (p >= static_cast(src_indices.size())) break; + int c = src_indices[p]; + if (c < 0 || c >= cols) continue; + int dest = cur_pos[c]++; + At.indices[dest] = r; + At.data[dest] = (p < static_cast(src_data.size())) ? src_data[p] : 1.0; + } + } + + return At; + } catch (const std::exception& e) { + std::cerr << "[build_transpose_matrix] Exception: " << e.what() << std::endl; + const int n = static_cast(nodes.size()); + CSRMatrix A(n, n); + A.indptr.assign(n + 1, 0); + return A; + } +} + +// ARPACK: 有向图 +std::vector compute_eigenvector_arpack_directed( + const CSRMatrix& A, + int max_iter, + double tol +) { + const int n_const = A.rows; + std::vector result(n_const, 0.0); + int n = n_const; + int nev = 1; + int ncv = (n > 30) ? 30 : n; + if (ncv < nev + 2) { + ncv = std::min(nev + 2, n); + } + int ido = 0; + char bmat[2] = "I"; + char which[3] = "LR"; + std::vector resid(n); + std::vector v(static_cast(n) * ncv); + std::vector workd(3 * n); + int lworkl = 3 * ncv * ncv + 6 * ncv; + std::vector workl(lworkl); + + #pragma omp parallel for schedule(static) if(n > 1000) + for (int i = 0; i < n; i++) { + int degree = A.indptr[i + 1] - A.indptr[i]; + resid[i] = degree > 0 ? + static_cast(degree) + 1e-4 * (rand() / static_cast(RAND_MAX)) : + 1.0; + } + double norm = 0.0; + for (int i = 0; i < n; i++) { + norm += resid[i] * resid[i]; + } + norm = std::sqrt(norm); + if (norm > 1e-12) { + for (int i = 0; i < n; i++) { + resid[i] /= norm; + } + } + int iparam[11] = {0}; + iparam[0] = 1; + iparam[2] = max_iter; + iparam[3] = 1; + iparam[6] = 1; + int ipntr[14]; + int ldv = n; + int info = 0; + int iter_count = 0; + auto start_time = std::chrono::high_resolution_clock::now(); + try { + while (true) { + dnaupd_(&ido, bmat, &n, which, &nev, &tol, resid.data(), &ncv, + v.data(), &ldv, iparam, ipntr, workd.data(), workl.data(), + &lworkl, &info); + if (ido == -1 || ido == 1) { + int x_idx = ipntr[0] - 1; + int y_idx = ipntr[1] - 1; + std::vector x_vec(n), y_vec(n); + std::copy(workd.begin() + x_idx, workd.begin() + x_idx + n, x_vec.begin()); + A.multiply_inplace(x_vec, y_vec); + std::copy(y_vec.begin(), y_vec.end(), workd.begin() + y_idx); + iter_count++; + } else if (ido == 99) { + break; + } else { + throw std::runtime_error("ARPACK unexpected ido: " + std::to_string(ido)); + } + if (iter_count > max_iter * 10) { + throw std::runtime_error("ARPACK exceeded iteration limit"); + } + } + } catch (const std::exception& e) { + std::cerr << "[ARPACK-Directed] Error: " << e.what() << std::endl; + throw; + } + if (info < 0) { + std::string error_msg = "ARPACK dnaupd_ error: " + std::to_string(info); + switch(info) { + case -5: error_msg += " (LWORKL too small, need at least " + std::to_string(3*ncv*ncv + 6*ncv) + ")"; break; + } + throw std::runtime_error(error_msg); + } + int rvec = 1; + char howmny[2] = "A"; + std::vector select(ncv); + std::vector dr(nev + 1); + std::vector di(nev + 1); + std::vector z(n * (nev + 1)); + std::vector workev(3 * ncv); + int ldz = n; + double sigmar = 0.0, sigmai = 0.0; + dneupd_(&rvec, howmny, select.data(), dr.data(), di.data(), z.data(), &ldz, + &sigmar, &sigmai, workev.data(), bmat, &n, which, &nev, &tol, + resid.data(), &ncv, v.data(), &ldv, iparam, ipntr, workd.data(), + workl.data(), &lworkl, &info); + if (info != 0) { + throw std::runtime_error("ARPACK dneupd_ error: " + std::to_string(info)); + } + std::copy(z.begin(), z.begin() + n, result.begin()); + return result; +} + +// ARPACK: 无向图 +std::vector compute_eigenvector_arpack( + const CSRMatrix& A, + int max_iter, + double tol +) { + const int n_const = A.rows; + std::vector result(n_const, 0.0); + int n = n_const; + int nev = 1; + int ncv = (n > 30) ? 30 : n; + if (ncv < nev + 2) { + ncv = std::min(nev + 2, n); + } + int ido = 0; + char bmat[2] = "I"; + char which[3] = "LR"; + std::vector resid(n); + std::vector v(static_cast(n) * ncv); + std::vector workd(3 * n); + int lworkl = ncv * (ncv + 8); + std::vector workl(lworkl); + + #pragma omp parallel for schedule(static) if(n > 1000) + for (int i = 0; i < n; i++) { + int degree = A.indptr[i + 1] - A.indptr[i]; + resid[i] = degree > 0 ? + static_cast(degree) + 1e-4 * (rand() / static_cast(RAND_MAX)) : + 1.0; + } + double norm = 0.0; + for (int i = 0; i < n; i++) { + norm += resid[i] * resid[i]; + } + norm = std::sqrt(norm); + if (norm > 1e-12) { + for (int i = 0; i < n; i++) { + resid[i] /= norm; + } + } + int iparam[11] = {0}; + iparam[0] = 1; + iparam[2] = max_iter; + iparam[3] = 1; + iparam[6] = 1; + int ipntr[11]; + int ldv = n; + int info = 0; + int iter_count = 0; + auto start_time = std::chrono::high_resolution_clock::now(); + try { + while (true) { + dsaupd_(&ido, bmat, &n, which, &nev, &tol, resid.data(), &ncv, + v.data(), &ldv, iparam, ipntr, workd.data(), workl.data(), + &lworkl, &info); + if (ido == -1 || ido == 1) { + int x_idx = ipntr[0] - 1; + int y_idx = ipntr[1] - 1; + std::vector x_vec(n), y_vec(n); + std::copy(workd.begin() + x_idx, workd.begin() + x_idx + n, x_vec.begin()); + A.multiply_inplace(x_vec, y_vec); + std::copy(y_vec.begin(), y_vec.end(), workd.begin() + y_idx); + iter_count++; + } else if (ido == 99) { + break; + } else { + throw std::runtime_error("ARPACK unexpected ido: " + std::to_string(ido)); + } + if (iter_count > max_iter * 10) { + throw std::runtime_error("ARPACK exceeded iteration limit"); + } + } + } catch (const std::exception& e) { + std::cerr << "[ARPACK] Error: " << e.what() << std::endl; + throw; + } + if (info < 0) { + std::string error_msg = "ARPACK dsaupd_ error: " + std::to_string(info); + switch(info) { + case -1: error_msg += " (N must be positive)"; break; + case -2: error_msg += " (NEV must be positive)"; break; + case -3: error_msg += " (NCV-NEV >= 2 and NCV <= N not satisfied)"; break; + case -5: error_msg += " (LWORKL too small, need at least " + std::to_string(ncv * (ncv + 8)) + ")"; break; + case -8: error_msg += " (Error in LAPACK)"; break; + case -9: error_msg += " (Starting vector is zero)"; break; + case -14: error_msg += " (Did not find any eigenvalues)"; break; + } + throw std::runtime_error(error_msg); + } + int rvec = 1; + char howmny[2] = "A"; + std::vector select(ncv); + std::vector d(nev); + std::vector z(n * nev); + int ldz = n; + double sigma = 0.0; + dseupd_(&rvec, howmny, select.data(), d.data(), z.data(), &ldz, &sigma, + bmat, &n, which, &nev, &tol, resid.data(), &ncv, v.data(), &ldv, + iparam, ipntr, workd.data(), workl.data(), &lworkl, &info); + if (info != 0) { + throw std::runtime_error("ARPACK dseupd_ error: " + std::to_string(info)); + } + std::copy(z.begin(), z.begin() + n, result.begin()); + return result; +} + +#if defined(HAVE_EIGEN) && defined(HAVE_EIGEN_SPARSE_SOLVER) +std::vector compute_eigenvector_eigen_sparse( + const CSRMatrix& A, + int max_iter, + double tol +) { + const int n = A.rows; + Eigen::SparseMatrix eigen_mat = A.to_eigen(); + Eigen::VectorXd x(n); + #pragma omp parallel for schedule(static) if(n > 1000) + for (int i = 0; i < n; i++) { + int degree = A.indptr[i + 1] - A.indptr[i]; + double second_order = 0.0; + for (int j = A.indptr[i]; j < A.indptr[i + 1]; j++) { + int nei = A.indices[j]; + int nei_degree = A.indptr[nei + 1] - A.indptr[nei]; + second_order += nei_degree; + } + x(i) = degree > 0 ? + static_cast(degree) + 0.1 * second_order / (degree + 1) : + 1.0; + } + x.normalize(); + double prev_eigenvalue = 0.0; + const double rel_tol = tol * std::max(1.0, A.estimate_norm()); + for (int iter = 0; iter < max_iter; iter++) { + Eigen::VectorXd x_new = eigen_mat * x; + double norm = x_new.norm(); + if (norm < 1e-12) break; + x_new /= norm; + double eigenvalue = x_new.dot(eigen_mat * x_new) / x_new.squaredNorm(); + double diff = (x_new - x).norm(); + bool converged_vector = (diff < rel_tol); + bool converged_eigenvalue = (iter > 0) && + (std::abs(eigenvalue - prev_eigenvalue) < tol * std::abs(eigenvalue)); + if (converged_vector || converged_eigenvalue) { + x = x_new; + break; + } + x = x_new; + prev_eigenvalue = eigenvalue; + } + std::vector result(n); + #pragma omp parallel for schedule(static) if(n > 1000) + for (int i = 0; i < n; i++) { + result[i] = x(i); + } + return result; +} +#endif + +// 主入口 +py::object cpp_eigenvector_centrality( + py::object G, + py::object py_max_iter, + py::object py_tol, + py::object py_nstart, + py::object py_weight +) { + try { + Graph& graph = G.cast(); + int max_iter = py_max_iter.cast(); + double tol = py_tol.cast(); + std::string weight_key = ""; + if (!py_weight.is_none()) { + weight_key = py_weight.cast(); + } + + if (graph.node.size() == 0) { + return py::dict(); + } + + std::vector nodes; + nodes.reserve(graph.node.size()); + for (auto& node_pair : graph.node) { + nodes.push_back(node_pair.first); + } + const int n = nodes.size(); + + CSRMatrix A_transpose = build_transpose_matrix(graph, nodes, weight_key); + + std::vector isolated_nodes(n, false); + #pragma omp parallel for schedule(static) if(n > 1000) + for (int i = 0; i < n; i++) { + isolated_nodes[i] = (A_transpose.indptr[i + 1] == A_transpose.indptr[i]); + } + + std::vector centrality; + + if (py_nstart.is_none()) { + bool fast_solver_success = false; +#ifdef HAVE_ARPACK + try { + if (Graph_is_directed(G)) { + centrality = compute_eigenvector_arpack_directed(A_transpose, max_iter, tol); + } else { + centrality = compute_eigenvector_arpack(A_transpose, max_iter, tol); + } + fast_solver_success = true; + } catch (const std::exception& e) { + fast_solver_success = false; + } +#endif +#if defined(HAVE_EIGEN) && defined(HAVE_EIGEN_SPARSE_SOLVER) + if (!fast_solver_success && n >= 100 && n < 500000) { + try { + centrality = compute_eigenvector_eigen_sparse(A_transpose, max_iter, tol); + fast_solver_success = true; + } catch (const std::exception&) { + fast_solver_success = false; + } + } +#endif + if (!fast_solver_success) { + std::vector x(n, 0.0); + #pragma omp parallel for schedule(static) if(n > 1000) + for (int i = 0; i < n; i++) { + if (!isolated_nodes[i]) { + const int degree = A_transpose.indptr[i + 1] - A_transpose.indptr[i]; + double second_order = 0.0; + for (int j = A_transpose.indptr[i]; j < A_transpose.indptr[i + 1]; j++) { + int nei = A_transpose.indices[j]; + int nei_degree = A_transpose.indptr[nei + 1] - A_transpose.indptr[nei]; + second_order += nei_degree; + } + x[i] = static_cast(degree) + + 0.1 * second_order / (degree + 1) + + 1e-4 * (rand() / static_cast(RAND_MAX)); + } + } + centrality = power_iteration_chebyshev(A_transpose, max_iter, tol, x); + } + } else { + py::dict nstart = py_nstart.cast(); + std::vector x(n, 0.0); + for (size_t i = 0; i < nodes.size(); i++) { + py::object node_obj = graph.id_to_node[py::cast(nodes[i])]; + if (nstart.contains(node_obj)) { + x[i] = nstart[node_obj].cast(); + } else { + x[i] = 1.0; + } + } + bool all_zeros = true; + for (double val : x) { + if (std::abs(val) > 1e-10) { + all_zeros = false; + break; + } + } + if (all_zeros) { + throw std::runtime_error("initial vector cannot have all zero values"); + } + double sum_abs = 0.0; + #pragma omp parallel for reduction(+:sum_abs) schedule(static) if(n > 1000) + for (int i = 0; i < n; i++) { + sum_abs += std::fabs(x[i]); + } + const double inv_sum = 1.0 / sum_abs; + #pragma omp parallel for schedule(static) if(n > 1000) + for (int i = 0; i < n; i++) { + x[i] *= inv_sum; + } + centrality = power_iteration_chebyshev(A_transpose, max_iter, tol, x); + } + double sum = 0.0; + #pragma omp parallel for reduction(+:sum) schedule(static) if(n > 1000) + for (int i = 0; i < n; i++) { + sum += centrality[i]; + } + if (sum < 0.0) { + #pragma omp parallel for schedule(static) if(n > 1000) + for (int i = 0; i < n; i++) { + centrality[i] = -centrality[i]; + } + } + #pragma omp parallel for schedule(static) if(n > 1000) + for (int i = 0; i < n; i++) { + if (isolated_nodes[i]) { + centrality[i] = 0.0; + } + } + double norm = vector_norm(centrality); + if (norm > 1e-12) { + normalize_vector(centrality, norm); + } + py::dict result; + for (size_t i = 0; i < nodes.size(); i++) { + py::object node_obj = graph.id_to_node[py::cast(nodes[i])]; + result[node_obj] = centrality[i]; + } + return result; + } catch (const std::exception& e) { + throw std::runtime_error(e.what()); + } +} + +// 幂迭代 +std::vector power_iteration_chebyshev( + const CSRMatrix& A, + int max_iter, + double tol, + std::vector& x +) { + const int n = A.rows; + std::vector x_old(n, 0.0); + std::vector x_new(n, 0.0); + double norm0 = 0.0; + for (int i = 0; i < n; ++i) norm0 += x[i] * x[i]; + norm0 = std::sqrt(norm0); + if (norm0 < 1e-12) { + for (int i = 0; i < n; ++i) x_old[i] = 1.0 / std::sqrt(static_cast(n)); + } else { + for (int i = 0; i < n; ++i) x_old[i] = x[i] / norm0; + } + double lambda_old = 0.0, lambda = 0.0; + for (int iter = 0; iter < max_iter; ++iter) { + A.multiply_inplace(x_old, x_new); + double norm = 0.0; + for (int i = 0; i < n; ++i) norm += x_new[i] * x_new[i]; + norm = std::sqrt(norm); + if (norm < 1e-12) break; + double inv_norm = 1.0 / norm; + for (int i = 0; i < n; ++i) x_new[i] *= inv_norm; + std::vector Ax(n, 0.0); + A.multiply_inplace(x_new, Ax); + double num = 0.0, den = 0.0; + for (int i = 0; i < n; ++i) { + num += x_new[i] * Ax[i]; + den += x_new[i] * x_new[i]; + } + lambda = (den > 0.0) ? (num / den) : 0.0; + double diff = 0.0; + for (int i = 0; i < n; ++i) { + double d = x_new[i] - x_old[i]; + diff += d * d; + } + diff = std::sqrt(diff); + if ((iter > 0 && std::abs(lambda - lambda_old) < tol * std::abs(lambda)) || (diff < tol)) { + for (int i = 0; i < n; ++i) x[i] = x_new[i]; + return x; + } + lambda_old = lambda; + x_old.swap(x_new); + } + for (int i = 0; i < n; ++i) x[i] = x_old[i]; + return x; +} + +double vector_norm(const std::vector& x) { + double s = 0.0; + #pragma omp parallel for reduction(+:s) if(x.size() > 1000) + for (size_t i = 0; i < x.size(); ++i) s += x[i] * x[i]; + return std::sqrt(s); +} + +void normalize_vector(std::vector& x, double norm) { + const double inv_norm = 1.0 / norm; + #pragma omp parallel for schedule(static) if(x.size() > 1000) + for (size_t i = 0; i < x.size(); ++i) { + x[i] *= inv_norm; + } +} + +double vector_diff_norm(const std::vector& x1, const std::vector& x2) { + double diff = 0.0; + #pragma omp parallel for reduction(+:diff) if(x1.size() > 1000) + for (size_t i = 0; i < x1.size(); ++i) { + double d = x1[i] - x2[i]; + diff += d * d; + } + return std::sqrt(diff); +} \ No newline at end of file diff --git a/easygraph/functions/centrality/__init__.py b/easygraph/functions/centrality/__init__.py index f37e55fe..bd2fec50 100644 --- a/easygraph/functions/centrality/__init__.py +++ b/easygraph/functions/centrality/__init__.py @@ -5,4 +5,5 @@ from .flowbetweenness import * from .laplacian import * from .pagerank import * -from .katz_centrality import * \ No newline at end of file +from .katz_centrality import * +from .eigenvector import * \ No newline at end of file diff --git a/easygraph/functions/centrality/eigenvector.py b/easygraph/functions/centrality/eigenvector.py new file mode 100644 index 00000000..2d7506d0 --- /dev/null +++ b/easygraph/functions/centrality/eigenvector.py @@ -0,0 +1,155 @@ +import math +import easygraph as eg +from easygraph.utils import * +from easygraph.utils.decorators import * +from scipy import sparse +from scipy.sparse import linalg +import numpy as np +from collections import defaultdict + +__all__ = ["eigenvector_centrality"] + +@not_implemented_for("multigraph") +@hybrid("cpp_eigenvector_centrality") +def eigenvector_centrality(G, max_iter=100, tol=1.0e-6, nstart=None, weight=None): + """Calculate eigenvector centrality for nodes in the graph + + Eigenvector centrality is based on the idea that a node's importance + depends on the importance of its neighboring nodes. + Specifically, a node's centrality is proportional to the sum of + centrality values of its neighbors. + + Parameters + ---------- + G : graph object + An undirected or directed graph + + max_iter : int, optional (default=100) + Maximum number of iterations for the power method + + tol : float, optional (default=1.0e-6) + Convergence threshold; algorithm terminates when the difference + between centrality values in consecutive iterations is less than this value + + nstart : dictionary, optional (default=None) + Dictionary mapping nodes to initial centrality values + If None, the ARPACK solver is used to directly compute the eigenvector + + weight : string or None, optional (default=None) + Name of the edge attribute to be used as edge weight + If None, all edges are considered to have weight 1 + + Returns + ------- + centrality : dictionary + Dictionary mapping nodes to their eigenvector centrality values + + Raises + ------ + EasyGraphPointlessConcept + When input is an empty graph + + EasyGraphError + When the algorithm fails to converge within the specified maximum iterations + + Notes + ----- + This algorithm uses the power iteration method to find the principal eigenvector. + When nstart is not provided, the ARPACK solver is used for efficiency. + The returned centrality values are normalized. + """ + + if len(G) == 0: + raise eg.EasyGraphPointlessConcept( + "cannot compute centrality for the null graph" + ) + + if len(G) == 1: + raise eg.EasyGraphPointlessConcept( + "cannot compute eigenvector centrality for a single node graph" + ) + + + # Build node list and mapping + nodelist = list(G.nodes) + n = len(nodelist) + node_map = {node: i for i, node in enumerate(nodelist)} + + # Build weighted adjacency matrix + row, col, data = [], [], [] + for u in nodelist: + u_idx = node_map[u] + for v, attrs in G[u].items(): + if v in node_map: + v_idx = node_map[v] + w = attrs.get(weight, 1.0) if weight else 1.0 + # Build transpose matrix for centrality calculation + row.append(v_idx) + col.append(u_idx) + data.append(float(w)) + + # Create CSR format sparse matrix + A = sparse.csr_matrix((data, (row, col)), shape=(n, n)) + + # Detect and handle isolated nodes + row_sums = np.array(A.sum(axis=1)).flatten() + col_sums = np.array(A.sum(axis=0)).flatten() + isolated_nodes = np.where((row_sums == 0) & (col_sums == 0))[0] + + has_isolated = len(isolated_nodes) > 0 + isolated_indices = [] + + # Add small self-loops to isolated nodes for stability + if has_isolated: + # Store isolated node indices + isolated_indices = isolated_nodes.tolist() + + # Add small self-loop weights to isolated nodes + for idx in isolated_indices: + A[idx, idx] = 1.0e-4 # Small enough to not affect results, but maintains numerical stability + if nstart is not None: + # Use custom initial vector for power iteration + v = np.array([nstart.get(n, 1.0) for n in nodelist], dtype=float) + v = v / np.sum(np.abs(v)) + + # Power iteration method to compute principal eigenvector + v_last = np.zeros_like(v) + for _ in range(max_iter): + np.copyto(v_last, v) + v = A @ v_last # Sparse matrix multiplication + + norm = np.linalg.norm(v) + if norm < 1e-10: + v = v_last.copy() + break + v = v / norm # Normalization + + # Check convergence + if np.linalg.norm(v - v_last) < tol: + break + else: + raise eg.EasyGraphError(f"Eigenvector calculation did not converge in {max_iter} iterations") + + centrality = v + else: + # Use ARPACK solver to directly compute the principal eigenvector + eigenvalues, eigenvectors = linalg.eigs(A, k=1, which='LR', + maxiter=max_iter, tol=tol) + centrality = np.real(eigenvectors[:,0]) + + # Ensure positive results and normalize + if centrality.sum() < 0: + centrality = -centrality + + centrality = centrality / np.linalg.norm(centrality) + # Set centrality of isolated nodes to zero + if has_isolated: + for idx in isolated_indices: + centrality[idx] = 0.0 + # Renormalize if needed + if np.sum(centrality) > 0: + centrality = centrality / np.linalg.norm(centrality) + + # Return dictionary of node centrality values + return {nodelist[i]: float(centrality[i]) for i in range(n)} + diff --git a/easygraph/functions/centrality/tests/test_eigenvector.py b/easygraph/functions/centrality/tests/test_eigenvector.py new file mode 100644 index 00000000..1617b857 --- /dev/null +++ b/easygraph/functions/centrality/tests/test_eigenvector.py @@ -0,0 +1,119 @@ +import unittest +import numpy as np +import easygraph as eg +from easygraph.functions.centrality import eigenvector_centrality +from easygraph.utils.exception import EasyGraphPointlessConcept, EasyGraphNotImplemented + + +class Test_eigenvector(unittest.TestCase): + def setUp(self): + self.edges = [ + (1, 4), + (2, 4), + ("String", "Bool"), + (4, 1), + (0, 4), + (4, 256), + ((None, None), (None, None)), + ] + + self.test_graphs = [] + self.test_graphs.append(eg.classes.DiGraph(self.edges)) + + self.simple_graph = eg.Graph() + self.simple_graph.add_edges_from([(0, 1), (1, 2), (2, 3)]) + + self.directed_graph = eg.DiGraph() + self.directed_graph.add_edges_from([(0, 1), (1, 2), (2, 3)]) + + self.weighted_graph = eg.Graph() + self.weighted_graph.add_edges_from([(0, 1), (1, 2), (2, 3)]) + + for u, v, data in self.weighted_graph.edges: + data["weight"] = 2.0 + + self.disconnected_graph = eg.Graph() + self.disconnected_graph.add_edges_from([(0, 1), (2, 3)]) + + self.isolated_node_graph = eg.Graph() + self.isolated_node_graph.add_edges_from([(0, 1), (1, 2)]) + self.isolated_node_graph.add_node(3) + + self.single_node_graph = eg.Graph() + self.single_node_graph.add_node(42) + + self.mixed_nodes_graph = eg.Graph() + self.mixed_nodes_graph.add_edges_from([(1, 2), ("X", "Y"), ((1, 2), (3, 4))]) + + def test_eigenvector(self): + + for G in self.test_graphs: + result = eigenvector_centrality(G) + self.assertIsInstance(result, dict) + self.assertEqual(len(result), len(G)) + + for v in result.values(): + self.assertIsInstance(v, float) + + def test_simple_graph(self): + result = eigenvector_centrality(self.simple_graph) + self.assertEqual(len(result), len(self.simple_graph)) + + for v in result.values(): + self.assertIsInstance(v, float) + + def test_directed_graph(self): + result = eigenvector_centrality(self.directed_graph) + self.assertEqual(len(result), len(self.directed_graph)) + + def test_weighted_graph(self): + result = eigenvector_centrality(self.weighted_graph, weight="weight") + self.assertEqual(len(result), len(self.weighted_graph)) + + def test_disconnected_graph(self): + + result = eigenvector_centrality(self.disconnected_graph) + self.assertEqual(len(result), len(self.disconnected_graph)) + + def test_isolated_nodes(self): + + result = eigenvector_centrality(self.isolated_node_graph) + self.assertEqual(len(result), len(self.isolated_node_graph)) + self.assertTrue(3 in result) + self.assertGreaterEqual(result[3], 0) + + def test_single_node_graph(self): + with self.assertRaises(EasyGraphPointlessConcept): + eigenvector_centrality(self.single_node_graph) + + def test_mixed_node_types(self): + result = eigenvector_centrality(self.mixed_nodes_graph) + self.assertEqual(len(result), len(self.mixed_nodes_graph)) + + def test_max_iter_parameter(self): + result = eigenvector_centrality(self.simple_graph, max_iter=50) + self.assertEqual(len(result), len(self.simple_graph)) + + def test_tol_parameter(self): + result = eigenvector_centrality(self.simple_graph, tol=1.0e-3) + self.assertEqual(len(result), len(self.simple_graph)) + + def test_nstart_parameter(self): + nstart = {node: 1.0 for node in self.simple_graph} + result = eigenvector_centrality(self.simple_graph, nstart=nstart) + self.assertEqual(len(result), len(self.simple_graph)) + + def test_multigraph_raises(self): + G = eg.MultiGraph() + G.add_edges_from([(0, 1), (0, 1)]) + with self.assertRaises(EasyGraphNotImplemented): + eigenvector_centrality(G) + + def test_empty_graph_raises(self): + G = eg.Graph() + with self.assertRaises(EasyGraphPointlessConcept): + eigenvector_centrality(G) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file