ST_ConnectedComponents
Signatures
-- Creates two tables:
-- INPUT_EDGES_NODE_CC[NODE_ID, CONNECTED_COMPONENT]
-- INPUT_EDGES_EDGE_CC[EDGE_ID, CONNECTED_COMPONENT]
ST_ConnectedComponents('INPUT_EDGES', 'o[ - eo]');
Description
Calculates the connected components (CCs, for undirected
graphs) or strongly connected components (SCCs, for directed
graphs) of the graph represented by INPUT_EDGES
.
Produces two tables (nodes and edges) containing a node or edge ID
and a connected component ID.
Variable
Meaning
INPUT_EDGES
Table containing integer columns EDGE_ID
, START_NODE
and END_NODE
, and optionally an edge orientation column eo
(required if global orientation is not undirected
)
o
Global orientation string: directed
, reversed
or undirected
eo
Edge orientation column name indicating individual edge orientations: 1
(directed), -1
(reversed) or 0
(undirected); required if global orientation is directed
or reversed
Edges in no SCC are assigned a connected component ID of
-1.
Such edges have a start nodes and end nodes in different
SCCs.
Connected component IDs are not guaranteed to be the same
after each execution of ST_ConnectedComponents
.
But of course, the vertex and edge set partitions they
represent remain consistent. In the examples below, you may have
to adjust certain requests according to how
ST_ConnectedComponents
numbers the connected
components.
Examples
SCCs (directed graph)
-- Prepare example data:
DROP TABLE IF EXISTS EDGES;
CREATE TABLE EDGES(EDGE_ID INT AUTO_INCREMENT PRIMARY KEY,
START_NODE INT,
END_NODE INT,
EDGE_ORIENTATION INT);
INSERT INTO EDGES(START_NODE, END_NODE, EDGE_ORIENTATION)
VALUES (1, 2, 1),
(2, 3, 1),
(2, 5, 1),
(2, 6, 1),
(3, 4, 1),
(3, 7, 1),
(4, 3, 1),
(4, 8, 1),
(5, 1, 1),
(5, 6, 1),
(6, 7, 1),
(7, 6, 1),
(8, 4, 1),
(8, 7, 1),
(9, 10, 1),
(10, 9, 1),
(10, 11, 1),
(12, 12, 1);
SELECT * FROM EDGES;
-- | EDGE_ID | START_NODE | END_NODE | EDGE_ORIENTATION |
-- |---------|------------|----------|------------------|
-- | 1 | 1 | 2 | 1 |
-- | 2 | 2 | 3 | 1 |
-- | 3 | 2 | 5 | 1 |
-- | 4 | 2 | 6 | 1 |
-- | 5 | 3 | 4 | 1 |
-- | 6 | 3 | 7 | 1 |
-- | 7 | 4 | 3 | 1 |
-- | 8 | 4 | 8 | 1 |
-- | 9 | 5 | 1 | 1 |
-- | 10 | 5 | 6 | 1 |
-- | 11 | 6 | 7 | 1 |
-- | 12 | 7 | 6 | 1 |
-- | 13 | 8 | 4 | 1 |
-- | 14 | 8 | 7 | 1 |
-- | 15 | 9 | 10 | 1 |
-- | 16 | 10 | 9 | 1 |
-- | 17 | 10 | 11 | 1 |
-- | 18 | 12 | 12 | 1 |
-- Do the SCC calculation and diplay the results:
CALL ST_ConnectedComponents('EDGES', 'directed - EDGE_ORIENTATION');
SELECT * FROM EDGES_NODE_CC
ORDER BY CONNECTED_COMPONENT ASC;
-- | NODE_ID | CONNECTED_COMPONENT |
-- |---------|---------------------|
-- | 12 | 1 |
-- | 9 | 2 |
-- | 10 | 2 |
-- | 11 | 3 |
-- | 5 | 4 |
-- | 2 | 4 |
-- | 1 | 4 |
-- | 8 | 5 |
-- | 3 | 5 |
-- | 4 | 5 |
-- | 6 | 6 |
-- | 7 | 6 |
SELECT * FROM EDGES_EDGE_CC
ORDER BY CONNECTED_COMPONENT ASC;
-- | EDGE_ID | CONNECTED_COMPONENT |
-- |---------|---------------------|
-- | 14 | -1 |
-- | 6 | -1 |
-- | 17 | -1 |
-- | 10 | -1 |
-- | 4 | -1 |
-- | 2 | -1 |
-- | 18 | 1 |
-- | 16 | 2 |
-- | 15 | 2 |
-- | 1 | 4 |
-- | 3 | 4 |
-- | 9 | 4 |
-- | 13 | 5 |
-- | 8 | 5 |
-- | 7 | 5 |
-- | 5 | 5 |
-- | 11 | 6 |
-- | 12 | 6 |
Counting the number of edges in each SCC
-- Count the number of edges in each SCC:
-- (We could similarly count the number of nodes in each SCC.)
DROP TABLE IF EXISTS EDGE_CC_TOTALS;
CREATE TABLE EDGE_CC_TOTALS AS
SELECT CONNECTED_COMPONENT CC,
COUNT(CONNECTED_COMPONENT) CC_COUNT
FROM EDGES_EDGE_CC
GROUP BY CC
ORDER BY CC_COUNT DESC;
SELECT * FROM EDGE_CC_TOTALS;
-- | CC | CC_COUNT |
-- |----|----------|
-- | -1 | 6 |
-- | 5 | 4 |
-- | 4 | 3 |
-- | 2 | 2 |
-- | 6 | 2 |
-- | 1 | 1 |
Selecting the largest SCC
-- Creating these indices will greatly speed up the following
-- calculations.
CREATE INDEX ON EDGES(EDGE_ID);
CREATE INDEX ON EDGES_EDGE_CC(EDGE_ID);
-- Select the largest SCC:
DROP TABLE IF EXISTS EDGES_LARGEST;
CREATE TABLE EDGES_LARGEST AS
SELECT A.*, B.CONNECTED_COMPONENT CC
FROM EDGES A, EDGES_EDGE_CC B
WHERE A.EDGE_ID=B.EDGE_ID
AND B.CONNECTED_COMPONENT=5;
SELECT * FROM EDGES_LARGEST;
-- | EDGE_ID | START_NODE | END_NODE | EDGE_ORIENTATION | CC |
-- |---------|------------|----------|------------------|----|
-- | 5 | 3 | 4 | 1 | 5 |
-- | 7 | 4 | 3 | 1 | 5 |
-- | 8 | 4 | 8 | 1 | 5 |
-- | 13 | 8 | 4 | 1 | 5 |
-- We can also select the edges which are in no SCC:
DROP TABLE IF EXISTS EDGES_NO_SCC;
CREATE TABLE EDGES_NO_SCC AS
SELECT A.*, B.CONNECTED_COMPONENT CC
FROM EDGES A, EDGES_EDGE_CC B
WHERE A.EDGE_ID=B.EDGE_ID
AND B.CONNECTED_COMPONENT=-1;
SELECT * FROM EDGES_NO_SCC;
-- | EDGE_ID | START_NODE | END_NODE | EDGE_ORIENTATION | CC |
-- |---------|------------|----------|------------------|----|
-- | 2 | 2 | 3 | 1 | -1 |
-- | 4 | 2 | 6 | 1 | -1 |
-- | 6 | 3 | 7 | 1 | -1 |
-- | 10 | 5 | 6 | 1 | -1 |
-- | 14 | 8 | 7 | 1 | -1 |
-- | 17 | 10 | 11 | 1 | -1 |
CCs (undirected graph)
-- Now we will do the same calculation, this time considering the
-- graph to be undirected:
DROP TABLE IF EXISTS EDGES_NODE_CC;
DROP TABLE IF EXISTS EDGES_EDGE_CC;
CALL ST_ConnectedComponents('EDGES', 'undirected');
-- Notice that this time, we only have three CCs, as expected.
SELECT * FROM EDGES_NODE_CC
ORDER BY CONNECTED_COMPONENT ASC;
-- | NODE_ID | CONNECTED_COMPONENT |
-- |---------|---------------------|
-- | 4 | 1 |
-- | 5 | 1 |
-- | 8 | 1 |
-- | 7 | 1 |
-- | 2 | 1 |
-- | 1 | 1 |
-- | 3 | 1 |
-- | 6 | 1 |
-- | 9 | 2 |
-- | 10 | 2 |
-- | 11 | 2 |
-- | 12 | 3 |
SELECT * FROM EDGES_EDGE_CC
ORDER BY CONNECTED_COMPONENT ASC;
-- | EDGE_ID | CONNECTED_COMPONENT |
-- |---------|---------------------|
-- | 10 | 1 |
-- | 8 | 1 |
-- | 4 | 1 |
-- | 11 | 1 |
-- | 12 | 1 |
-- | 13 | 1 |
-- | 14 | 1 |
-- | 1 | 1 |
-- | 2 | 1 |
-- | 3 | 1 |
-- | 9 | 1 |
-- | 5 | 1 |
-- | 7 | 1 |
-- | 6 | 1 |
-- | 16 | 2 |
-- | 17 | 2 |
-- | 15 | 2 |
-- | 18 | 3 |
Exercise
Will
CALL ST_ConnectedComponents('EDGES', 'directed - EDGE_ORIENTATION');
and
CALL ST_ConnectedComponents('EDGES', 'reversed - EDGE_ORIENTATION');
return the same results? Why or why not?
See also