Partitioning a Pipeline
Here’s a simple use case for applying connectivity constraints in Hierarchical Clustering. Our goal is to partition a raster image of the Canadian pipeline such that all pixels in a cluster will be connected through a sequence of adjacent members.
See here for the data source: https://ssc.ca/en/case-study/what-geographical-factors-are-associated-pipeline-incidents-involve-spills
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import deque
from scipy.spatial.distance import cdist
from sklearn.cluster import AgglomerativeClustering
= plt.imread('gdm-Pipelines_EN.png')
img = plt.subplots(figsize=(10,10))
fig, ax ; plt.imshow(img)
Let’s ignore any information colour is providing here and convert to a binary image. We’ll also crop out any unnecessary whitespace.
# convert to binary image
= img[:,:,3] > 0
pixels
# crop out whitespace
= np.argmax(pixels.max(axis=1))
bot_crop = pixels.shape[0] - np.argmax(pixels.max(axis=1)[::-1])
top_crop = np.argmax(pixels.max(axis=0))
left_crop = pixels.shape[1] - np.argmax(pixels.max(axis=0)[::-1])
right_crop = pixels[bot_crop:top_crop,left_crop:right_crop]
pixels
='binary'); plt.imshow(pixels,cmap
Now let’s convert the raster to a graph where each node represents a coloured cell and each edge indicates two cells that are adjacent by a queen’s move one cell away.
Scikit-learn’s AgglomerativeClustering requires a connected graph so we’ll need to determine all the disconnected components using breadth-first search. Then we’ll use Kruskal’s algorithm to find the minimum spanning tree that connects all components and add those edges to the adjacency matrix.
# convert nonempty cells to nodes
= np.array(np.where(pixels == 1)).T
nodes = len(nodes)
n_nodes
# create adjacency matrix by assigning edges to adjacent pixels
= cdist(nodes,nodes)
dist_matrix = 1*(dist_matrix<2)
adj_matrix
# find all disconnected components
= np.zeros(n_nodes) - 1
component_labels = 0
componentId = deque()
Q for i in range(len(nodes)):
if component_labels[i] == -1:
= componentId
component_labels[i]
Q.append(i)while len(Q) > 0:
for j in np.where(adj_matrix[Q[0]] == 1)[0]:
if component_labels[j] == -1:
= componentId
component_labels[j]
Q.append(j)
Q.popleft()+= 1 componentId
# for each pair of disconnected components find the edge w/ min distance to connect
= []
comp_edges for i in range(componentId-1):
for j in range(i+1,componentId):
= np.where(component_labels == i)[0]
i_nodes = np.where(component_labels == j)[0]
j_nodes = dist_matrix[np.ix_(i_nodes, j_nodes)]
btwn_comps_matrix = np.unravel_index(btwn_comps_matrix.argmin(),btwn_comps_matrix.shape)
min_dist_matrix_coords
comp_edges.append(
[# edge: component ids
[i,j], # distance
btwn_comps_matrix[min_dist_matrix_coords], 0]],j_nodes[min_dist_matrix_coords[1]]], # edge: node ids
[i_nodes[min_dist_matrix_coords[
]
)= sorted(comp_edges, key=lambda i: i[1])
comp_edges
# Kruskal's Algorithm - Find Min Spanning Tree to reconnect entire graph
= []
comp_connectors = np.arange(componentId)
joined_comp_labels for edge in comp_edges:
if joined_comp_labels[edge[0][0]] != joined_comp_labels[edge[0][1]]:
==joined_comp_labels[edge[0][1]])] = joined_comp_labels[edge[0][0]]
joined_comp_labels[np.where(joined_comp_labels-1])
comp_connectors.append(edge[
# connect adjacency matrix
= adj_matrix.copy()
connected_adj_matrix for edge in comp_connectors:
0], edge[1]] = 1
connected_adj_matrix[edge[1], edge[0]] = 1 connected_adj_matrix[edge[
= plt.subplots(figsize=(20,20))
fig, ax ='binary',alpha=.1)
plt.imshow(pixels,cmapfor i in range(len(comp_connectors)):
= nodes[comp_connectors[i]]
pixel_coords 1],pixel_coords[:,0],linewidth=5); plt.plot(pixel_coords[:,
It’s time to partition the pipeline.
= 74 # number of regions
n_clusters = AgglomerativeClustering(
ward =74, linkage="ward", connectivity=connected_adj_matrix
n_clusters
)= ward.fit(nodes) model
Let’s create a network from the partitioned regions. We’ll need to find which clusters are adjacent.
# create centroids for each cluster
= []
centers for i in range(n_clusters):
== i)].mean(axis=0))
centers.append(nodes[np.where(ward.labels_ = np.array(centers)
centers
# find adjacent clusters
= np.zeros((n_clusters,n_clusters))
dist_btwn_clusters for i in range(n_clusters-1):
for j in range(i+1,n_clusters):
= np.where(ward.labels_ == i)[0]
i_nodes = np.where(ward.labels_ == j)[0]
j_nodes = dist_matrix[np.ix_(i_nodes, j_nodes)].min()
dist_btwn_clusters[i,j] = dist_btwn_clusters[i, j]
dist_btwn_clusters[j, i]
= (dist_btwn_clusters < 2)
clusters_adj_matrix False) np.fill_diagonal(clusters_adj_matrix,
=(18,16))
plt.figure(figsize='binary',alpha=0)
plt.imshow(pixels,cmap1], nodes[:,0], c=ward.labels_, s=0.5, cmap='jet',alpha=0.2)
plt.scatter(nodes[:,1], centers[:,0], c='k', s=30);
plt.scatter(centers[:,
= np.where(clusters_adj_matrix)
edges for i in range(len(edges[0])):
plt.plot(0][i],1],centers[edges[1][i],1]],
[centers[edges[0][i],0],centers[edges[1][i],0]],
[centers[edges[='k'
c )
And that’s it! As a bonus the source provided some oilspill data. Let’s check where they occur. Oilspill coordinates are in latitude and longitude so we’ll have to convert it to our image’s coordinates using a mercator projection. The function below is derived from https://en.wikipedia.org/wiki/Mercator_projection#Mathematics
def convertGeoCordsToPixel(
coordinates,
pixelWidth,
pixelHeight,
longLeftmostPixel,
longRightmostPixel,
latSouthmostPixel):= coordinates[:,0]
latitudes = coordinates[:,1]
longitudes
= (pixelWidth / (longRightmostPixel - longLeftmostPixel))
pixelperLongDegree = pixelperLongDegree * 360
entireWorldWidth = entireWorldWidth / (2 * np.pi)
globeRadius
= globeRadius * np.pi / 180 * (longitudes - longLeftmostPixel)
x
= latitudes * np.pi / 180
latitudes_Rad = latSouthmostPixel * np.pi / 180
latSouthmostPixel_Rad = globeRadius / 2 * np.log((1 + np.sin(latSouthmostPixel_Rad)) / (1 - np.sin(latSouthmostPixel_Rad)))
yOffset = (yOffset + pixelHeight) - globeRadius / 2 * np.log((1 + np.sin(latitudes_Rad)) / (1 - np.sin(latitudes_Rad)))
y
return np.array([y, x]).T
= pd.read_csv('oilspills.csv')
oilspills = oilspills[['Latitude','Longitude']].values
spill_latlong
= convertGeoCordsToPixel(
spill_coords
spill_latlong,1],
pixels.shape[0],
pixels.shape[-134.9,
-48.5,
42
)
=(10,10))
plt.figure(figsize='binary', alpha=.2)
plt.imshow(pixels, cmap1], spill_coords[:,0], ec='k', alpha=.5); plt.scatter(spill_coords[:,
Looks close enough. Finally, let’s assign each spill to the closest cluster.
'cluster'] = ward.labels_[cdist(spill_coords, nodes).argmin(axis=1)]
oilspills[= oilspills['cluster'].values.tolist()
spill_labels = [spill_labels.count(i) for i in range(n_clusters)]
spills_counts
=(18,16))
plt.figure(figsize0,img.shape[1])
plt.xlim(-img.shape[0],0)
plt.ylim(1], -nodes[:,0],c=ward.labels_, s=0.5, cmap='jet',alpha=0.5,zorder=-100000000)
plt.scatter(nodes[:,1], -centers[:,0],c='k', s=25,zorder=-10000000);
plt.scatter(centers[:,for i in range(len(centers)):
1],-centers[i,0]),size=15,bbox=dict(boxstyle="circle,pad=0.3", fc='w', ec="k", lw=2),zorder=-i)
plt.annotate(spills_counts[i],(centers[i,'off'); plt.axis(