基于MySQL实现基础图数据库的详细步骤_Mysql

一、概念

图数据库是一种用于存储和查询具有复杂关系的数据的数据库。在这种数据库中，数据被表示为节点（实体）和边（关系）。图数据库的核心优势在于能够快速地查询和处理节点之间的关系。

图数据库特点：

高效处理复杂关系：图数据库擅长处理复杂、多层级的关系，这使得它在社交网络分析、推荐系统等领域具有显著优势。
灵活的查询语言：图数据库通常提供专门的图查询语言（如 Gremlin 或 Cypher），可以直接描述节点与关系的匹配模式，使查询过程更加直观。

但并非只有专业的图数据库可以实现图的一些操作，比如：图挖掘，实际也可以通过mysql来实现。本文主要讲解如何通过mysql构建图数据存储，当然mysql构建图结构数据与专业图数据库还是有能力上的差异，比如：图算法需要自己通过sql实现、整体效率不及专业图数据库等。

二、应用场景

基于mysql实现图数据库，是通过多表关联来实现操作，因此性能和整体能力肯定不及专业图数据库。

mysql实现图存储最适合场景：

中小规模图数据（≤10万节点）
需要强事务保证的业务系统
图查询以1-3度关系为主
已有mysql基础设施且预算有限

专业图数据库场景：

大规模图数据（≥100万节点）
需要复杂图算法（社区发现等）
深度路径查询（≥4度关系）
实时图分析需求

三、实现

环境搭建

首先我们需要有mysql环境，我这里为了方便就直接通过docker搭建mysql：

# Start a disposable MySQL 8.0 server in Docker.
# Environment variable names are case-sensitive: the official image only
# honours MYSQL_ROOT_PASSWORD (lower-case is ignored and the container exits
# with "database is uninitialized and password option is not specified"),
# and the time zone is read from TZ.
# The data directory is bind-mounted so the database survives container removal.
docker run -d \
  --name mysql8 \
  --restart always \
  -p 3306:3306 \
  -e TZ=Asia/Shanghai \
  -e MYSQL_ROOT_PASSWORD=123456 \
  -v /users/ziyi2/docker-home/mysql/data:/var/lib/mysql \
  mysql:8.0

存储结构定义

图主要包含节点、边，因此我们这里选择定义两个数据表来实现。同时节点和边都具有很多属性，且为kv对，这里我们就采用mysql中的json格式存储。

-- Node table: one row per graph vertex. All vertex attributes live in a
-- single JSON document so that different node types can carry different keys.
create table if not exists node (
    node_id bigint not null auto_increment primary key,
    properties json comment '节点属性'
);

-- Edge table: one row per directed edge source_id -> target_id, with the
-- edge attributes (including its "type") stored as JSON.
-- The lookup indexes are declared inside the table definition: MySQL has no
-- "create index if not exists", so separate create index statements would
-- fail with a duplicate key name when this script is run a second time,
-- whereas "create table if not exists" makes the whole block re-runnable.
-- Deleting a node cascades to every edge that starts or ends at it.
create table if not exists edge (
    edge_id bigint not null auto_increment primary key,
    source_id bigint not null comment '源节点id',
    target_id bigint not null comment '目标节点id',
    properties json comment '边属性',
    index idx_edge_source (source_id),
    index idx_edge_target (target_id),
    foreign key(source_id) references node(node_id) on delete cascade,
    foreign key(target_id) references node(node_id) on delete cascade
);

基础功能

创建

节点创建：

-- Create three user nodes. On an empty table with the auto-increment counter
-- at 1 they receive node_id 1, 2 and 3 in the order listed; the edge insert
-- that follows in this walkthrough refers to those ids.
insert into node (properties) values
('{"type": "user", "name": "张三", "age": 28, "interests": ["篮球","音乐"]}'),
('{"type": "user", "name": "李四", "age": 32, "interests": ["电影","美食"]}'),
('{"type": "user", "name": "王五", "age": 27, "interests": ["跑步","美食"]}');

边创建：

-- Create two directed "friend" edges: node 1 -> node 3 and node 2 -> node 3.
-- The ids are hard-coded, so the referenced nodes must already exist
-- (the foreign keys on edge reject unknown ids).
insert into edge (source_id, target_id, properties) values
(1, 3, '{"type": "friend", "since": "2023-01-01"}'),
(2, 3, '{"type": "friend", "since": "2023-01-01"}');

查询

根据节点属性查询节点

-- Look a node up by the "name" key of its JSON properties.
-- json_unquote(json_extract(...)) is the long form of the ->> operator.
select
    node_id,
    properties
from node
where json_unquote(json_extract(properties, '$.name')) = '张三';

查询某个节点关联的另一个节点

-- Friends of 张三: start at the node named 张三, follow its outgoing
-- "friend" edges and return the node at the other end of each edge.
select
    friend.node_id,
    friend.properties->>'$.name' as friend_name
from node me
join edge rel on rel.source_id = me.node_id
join node friend on friend.node_id = rel.target_id
where me.properties->>'$.name' = '张三'
and rel.properties->>'$.type' = 'friend';

查询两个节点的公共节点。查询共同好友，因为张三、王五是好友，李四、王五是好友，所以张三跟李四的共同好友就是王五

-- Common friends of 张三 and 李四: a node qualifies when both users have an
-- outgoing "friend" edge that ends at it.
select shared.properties->>'$.name' as common_friend
from node first_user
join edge first_edge on first_edge.source_id = first_user.node_id
join edge second_edge on second_edge.target_id = first_edge.target_id
join node second_user on second_user.node_id = second_edge.source_id
join node shared on shared.node_id = first_edge.target_id
where first_user.properties->>'$.name' = '张三'
and second_user.properties->>'$.name' = '李四'
and first_edge.properties->>'$.type' = 'friend'
and second_edge.properties->>'$.type' = 'friend';

递归

查找某个节点关联的所有节点，类似于 Neo4j 中的 expand（展开）操作。

-- Expand outward from node 1 along outgoing edges.
-- Every result row keeps the starting node as source_id, the node reached as
-- target_id, the properties of the last edge walked, and the hop count.
with recursive node_path (source_id, target_id, properties, depth) as (
    -- Anchor: edges that leave the start node directly (1 hop).
    select e.source_id, e.target_id, e.properties, 1
    from edge e
    where e.source_id = 1

    union all

    -- Step: extend each path found so far by one more outgoing edge.
    select np.source_id, e.target_id, e.properties, np.depth + 1
    from node_path np
    join edge e on np.target_id = e.source_id
    where np.depth < 5 -- stop extending once a path is 5 hops long
)
select
    source_id,
    target_id,
    properties,
    depth
from node_path;

效果：

更新

-- State of the 张三 node before the updates (compare with the final select).
select
    node_id,
    properties
from node
where properties->>'$.name' = '张三';

-- Overwrite an existing property: set "age" to 29.
update node
set properties = json_set(properties, '$.age', 29)
where properties->>'$.name' = '张三';

-- Extend an array property: append a new interest to "interests".
update node
set properties = json_array_append(properties, '$.interests', '游泳')
where properties->>'$.name' = '张三';

-- State of the 张三 node after the updates.
select
    node_id,
    properties
from node
where properties->>'$.name' = '张三';

删除

-- Delete a relationship: remove the edge(s) going from 张三 to 王五.
-- Both endpoints are resolved by name through scalar subqueries.
delete from edge
where source_id = (
    select src.node_id from node src where src.properties->>'$.name' = '张三'
)
and target_id = (
    select dst.node_id from node dst where dst.properties->>'$.name' = '王五'
);

-- Delete a node; "on delete cascade" on edge removes its relationships too.
delete from node
where properties->>'$.name' = '张三';

下面演示删除关系过程，删除节点同理：

1.删除之前

-- Show the edge(s) from 张三 to 王五 before deleting.
select
    edge_id,
    source_id,
    target_id,
    properties
from edge
where source_id = (
    select src.node_id from node src where src.properties->>'$.name' = '张三'
)
and target_id = (
    select dst.node_id from node dst where dst.properties->>'$.name' = '王五'
);

2. 执行sql删除后

-- Delete the relationship: remove the edge(s) going from 张三 to 王五.
delete from edge
where source_id = (
    select src.node_id from node src where src.properties->>'$.name' = '张三'
)
and target_id = (
    select dst.node_id from node dst where dst.properties->>'$.name' = '王五'
);

图算法实现

1. 度中心性算法

度中心性算法（degree centrality）

介绍：度中心性是刻画节点中心性的最直接度量指标。节点的度是指一个节点连接的边的数量，一个节点的度越大就意味着这个节点的度中心性越高，该节点在网络中就越重要。对于有向图，还要分别考虑出度/入度/出入度。
计算：统计节点连接的边数量。
应用：计算某个领域的kol关键人物，头部商家、用户、up主…

数据构造：

-- Remove all existing data so that user rows are not duplicated, and reset
-- both auto-increment counters so the new nodes are numbered from 1
-- (the edge rows below use hard-coded node ids).
delete from edge;
delete from node;
alter table node auto_increment = 1;
alter table edge auto_increment = 1;

-- Create the user nodes (ids 1-7 in the order listed).
insert into node (properties) values
('{"type":"user","name":"张三","title":"科技博主"}'),
('{"type":"user","name":"李四","title":"美食达人"}'),
('{"type":"user","name":"王五","title":"旅行摄影师"}'),
('{"type":"user","name":"赵六","title":"投资专家"}'),
('{"type":"user","name":"钱七","title":"健身教练"}'),
('{"type":"user","name":"周八","title":"宠物博主"}'),
('{"type":"user","name":"吴九","title":"历史学者"}');

-- Create the "follow" edges: source_id follows target_id.
insert into edge (source_id, target_id, properties) values
-- followers of 张三 (node 1)
(2,1, '{"type":"follow","timestamp":"2023-01-10"}'),
(3,1, '{"type":"follow","timestamp":"2023-01-12"}'),
(4,1, '{"type":"follow","timestamp":"2023-01-15"}'),
(5,1, '{"type":"follow","timestamp":"2023-01-18"}'),
-- followers of 李四 (node 2)
(1,2, '{"type":"follow","timestamp":"2023-01-20"}'),
(3,2, '{"type":"follow","timestamp":"2023-01-22"}'),
(6,2, '{"type":"follow","timestamp":"2023-01-25"}'),
-- followers of 王五 (node 3)
(1,3, '{"type":"follow","timestamp":"2023-02-01"}'),
(7,3, '{"type":"follow","timestamp":"2023-02-05"}'),
-- followers of 赵六 (node 4)
(4,4, '{"type":"follow","timestamp":"2023-02-10"}'); -- self-follow (special case)

度中心性算法实现：

-- In-degree centrality: how many distinct other users follow each user.
--
-- normalized_centrality = follower_count / (N - 1), where N is the number of
-- user nodes. That ratio only stays inside the 0-1 range when a node can have
-- at most N - 1 followers, so the follower set is restricted to:
--   * "follow" edges whose source is itself a user node,
--   * excluding self-follows (source = target),
--   * counting each follower once even if the edge is stored several times.
-- nullif() yields NULL instead of dividing by zero when there is only one user.
with user_node as (
    select
        node_id,
        properties->>'$.name' as user_name,
        properties->>'$.title' as title
    from node
    where properties->>'$.type' = 'user'
),
-- Distinct (follower, followed) pairs between two different users.
follow_pair as (
    select distinct
        e.source_id,
        e.target_id
    from edge e
    join user_node follower on follower.node_id = e.source_id
    where e.properties->>'$.type' = 'follow'
    and e.source_id <> e.target_id
),
user_total as (
    select count(*) as user_count
    from user_node
)
select
    u.node_id,
    u.user_name,
    u.title,
    count(f.source_id) as follower_count,
    round(count(f.source_id) / nullif(t.user_count - 1, 0), 3) as normalized_centrality
from user_node u
cross join user_total t
left join follow_pair f on f.target_id = u.node_id
group by u.node_id, u.user_name, u.title, t.user_count
-- node_id breaks ties so equal follower counts always list in the same order
order by follower_count desc, u.node_id;

效果：

2. 相似度算法

图场景中主流的相似度算法主要包括：余弦相似度、杰卡德相似度。这里主要介绍 Jaccard（杰卡德）相似度算法。

杰卡德相似度（jaccard similarity）
介绍：节点a和节点b的杰卡德相似度定义为，节点a邻居和节点b邻居的交集节点数量除以并集节点数量。jaccard系数计算的是两个节点的邻居集合的重合程度，以此来衡量两个节点的相似度。
计算：计算两个节点邻居集合的交集数量和并集数量，然后再相除。公式：|a ∩ b| / (|a| + |b| - |a ∩ b|)
应用：共同好友推荐、电商商品推荐猜你喜欢

数据构造：

-- Clear the previous data set to avoid mixing it with this one, and reset the
-- auto-increment counters so node ids start again at 1 (the edge rows below
-- use hard-coded ids).
delete from edge;
delete from node;
alter table node auto_increment = 1;
alter table edge auto_increment = 1;
-- Create the user nodes, each with a risk_score (ids 1-6 in the order listed).
insert into node (properties) values
('{"type":"user","name":"张三","phone":"13800138000","risk_score":5,"register_time":"2023-01-01"}'),
('{"type":"user","name":"李四","phone":"13900139000","risk_score":85,"register_time":"2023-01-05"}'), -- known fraud user
('{"type":"user","name":"王五","phone":"13700137000","risk_score":92,"register_time":"2023-01-10"}'), -- known fraud user
('{"type":"user","name":"赵六","phone":"13600136000","risk_score":15,"register_time":"2023-01-15"}'),
('{"type":"user","name":"钱七","phone":"13500135000","risk_score":8,"register_time":"2023-01-20"}'),
('{"type":"user","name":"孙八","phone":"13400134000","risk_score":95,"register_time":"2023-01-25"}'); -- known fraud user

-- Create the device nodes (ids 7-10).
insert into node (properties) values
('{"type":"device","device_id":"d001","model":"iphone12","os":"ios14"}'),
('{"type":"device","device_id":"d002","model":"huaweip40","os":"android10"}'),
('{"type":"device","device_id":"d003","model":"xiaomi11","os":"android11"}'),
('{"type":"device","device_id":"d004","model":"opporeno5","os":"android11"}');

-- Create the bank-card nodes (ids 11-14).
insert into node (properties) values
('{"type":"bank_card","card_no":"622588******1234","bank":"招商银行"}'),
('{"type":"bank_card","card_no":"622848******5678","bank":"农业银行"}'),
('{"type":"bank_card","card_no":"622700******9012","bank":"建设银行"}'),
('{"type":"bank_card","card_no":"622262******3456","bank":"交通银行"}');

-- Create the ip-address nodes (ids 15-18).
insert into node (properties) values
('{"type":"ip","ip_address":"192.168.1.101","location":"广东深圳"}'),
('{"type":"ip","ip_address":"192.168.2.202","location":"浙江杭州"}'),
('{"type":"ip","ip_address":"192.168.3.303","location":"江苏南京"}'),
('{"type":"ip","ip_address":"192.168.4.404","location":"北京朝阳"}');

-- Create the relationships between users and the entities they touch.
insert into edge (source_id, target_id, properties) values
-- user -> device ("use")
(1,7, '{"type":"use","first_time":"2023-01-01"}'),  -- 张三 uses d001
(2,7, '{"type":"use","first_time":"2023-01-05"}'),  -- 李四 uses d001
(2,8, '{"type":"use","first_time":"2023-01-06"}'),  -- 李四 uses d002
(3,8, '{"type":"use","first_time":"2023-01-10"}'),  -- 王五 uses d002
(3,9, '{"type":"use","first_time":"2023-01-11"}'),  -- 王五 uses d003
(4,10,'{"type":"use","first_time":"2023-01-15"}'),  -- 赵六 uses d004
(5,9, '{"type":"use","first_time":"2023-01-20"}'),  -- 钱七 uses d003
(6,7, '{"type":"use","first_time":"2023-01-25"}'),  -- 孙八 uses d001

-- user -> bank card ("bind")
(1,11, '{"type":"bind","time":"2023-01-02"}'),  -- 张三 binds card 1
(2,11, '{"type":"bind","time":"2023-01-05"}'),  -- 李四 binds card 1
(2,12, '{"type":"bind","time":"2023-01-07"}'),  -- 李四 binds card 2
(3,12, '{"type":"bind","time":"2023-01-11"}'),  -- 王五 binds card 2
(3,13, '{"type":"bind","time":"2023-01-12"}'),  -- 王五 binds card 3
(4,14, '{"type":"bind","time":"2023-01-16"}'),  -- 赵六 binds card 4
(5,13, '{"type":"bind","time":"2023-01-21"}'),  -- 钱七 binds card 3
(6,11, '{"type":"bind","time":"2023-01-26"}'),  -- 孙八 binds card 1

-- user -> ip ("login")
(1,15, '{"type":"login","time":"2023-01-03"}'),  -- 张三 logs in from ip 1
(2,15, '{"type":"login","time":"2023-01-05"}'),  -- 李四 logs in from ip 1
(2,16, '{"type":"login","time":"2023-01-08"}'),  -- 李四 logs in from ip 2
(3,16, '{"type":"login","time":"2023-01-10"}'),  -- 王五 logs in from ip 2
(3,17, '{"type":"login","time":"2023-01-13"}'),  -- 王五 logs in from ip 3
(4,18, '{"type":"login","time":"2023-01-17"}'),  -- 赵六 logs in from ip 4
(5,17, '{"type":"login","time":"2023-01-22"}'),  -- 钱七 logs in from ip 3
(6,15, '{"type":"login","time":"2023-01-27"}');  -- 孙八 logs in from ip 1

算法实现：

jaccard相似度数学公式：|a ∩ b| / (|a| + |b| - |a ∩ b|)

-- Jaccard-based similarity between ordinary users and known fraud users.
--
-- For every (non-fraud user, fraud user) pair the query computes one Jaccard
-- coefficient per entity type (devices used, bank cards bound, ips logged in
-- from):  |a ∩ b| / (|a| + |b| - |a ∩ b|)
-- and combines them as 0.5 * device + 0.3 * card + 0.2 * ip.
--
-- The neighbour sets are kept as rows rather than JSON arrays and are
-- de-duplicated first. Set sizes taken from a raw array count repeated edges
-- (e.g. several logins from the same ip) while the intersection counts each
-- entity once, which inflates the union and understates the similarity.

-- Distinct (user, entity type, entity) triples, limited to the three
-- edge-type / node-type combinations that take part in the score.
with user_link as (
    select distinct
        e.source_id as user_id,
        t.properties->>'$.type' as entity_type,
        e.target_id as entity_id
    from edge e
    join node u on u.node_id = e.source_id
    join node t on t.node_id = e.target_id
    where u.properties->>'$.type' = 'user'
    and (
        (e.properties->>'$.type' = 'use' and t.properties->>'$.type' = 'device')
        or (e.properties->>'$.type' = 'bind' and t.properties->>'$.type' = 'bank_card')
        or (e.properties->>'$.type' = 'login' and t.properties->>'$.type' = 'ip')
    )
),
-- |a|: size of each user's neighbour set per entity type.
link_count as (
    select
        user_id,
        entity_type,
        count(*) as entity_count
    from user_link
    group by user_id, entity_type
),
-- Every user flagged as known fraud (risk_score > 80) or not.
-- A missing risk_score is treated as "not fraud".
user_flag as (
    select
        node_id as user_id,
        coalesce(cast(properties->>'$.risk_score' as unsigned) > 80, 0) as is_black
    from node
    where properties->>'$.type' = 'user'
),
-- All (ordinary user, fraud user) pairs to be scored.
user_pair as (
    select
        t.user_id as target_user,
        b.user_id as black_user
    from user_flag t
    join user_flag b on b.is_black = 1
    where t.is_black = 0
),
-- |a ∩ b|: entities of the same type linked to both users of a pair.
-- Pairs with no shared entity of a type produce no row for that type.
shared_count as (
    select
        p.target_user,
        p.black_user,
        a.entity_type,
        count(*) as shared
    from user_pair p
    join user_link a on a.user_id = p.target_user
    join user_link b on b.user_id = p.black_user
        and b.entity_type = a.entity_type
        and b.entity_id = a.entity_id
    group by p.target_user, p.black_user, a.entity_type
),
-- Jaccard coefficient per pair and entity type. The denominator is the union
-- size and is at least 1 here because shared >= 1 for every row.
jaccard as (
    select
        s.target_user,
        s.black_user,
        s.entity_type,
        s.shared * 1.0 / (ca.entity_count + cb.entity_count - s.shared) as sim
    from shared_count s
    join link_count ca on ca.user_id = s.target_user and ca.entity_type = s.entity_type
    join link_count cb on cb.user_id = s.black_user and cb.entity_type = s.entity_type
),
-- One row per pair with the three coefficients side by side; a type with no
-- overlap (or with an empty set on either side) scores 0.
similarity_calc as (
    select
        p.target_user,
        p.black_user,
        coalesce(max(case when j.entity_type = 'device' then j.sim end), 0) as device_sim,
        coalesce(max(case when j.entity_type = 'bank_card' then j.sim end), 0) as card_sim,
        coalesce(max(case when j.entity_type = 'ip' then j.sim end), 0) as ip_sim
    from user_pair p
    left join jaccard j on j.target_user = p.target_user and j.black_user = p.black_user
    group by p.target_user, p.black_user
),
-- Weighted total, computed once so the final select does not repeat the formula.
scored as (
    select
        target_user,
        black_user,
        device_sim,
        card_sim,
        ip_sim,
        device_sim * 0.5 + card_sim * 0.3 + ip_sim * 0.2 as total_sim
    from similarity_calc
)
-- Final result: the five most similar pairs with a risk label.
select
    u.properties->>'$.name' as target_user,
    u.properties->>'$.phone' as phone,
    cast(u.properties->>'$.risk_score' as unsigned) as risk_score,
    bu.properties->>'$.name' as black_user,
    round(sc.device_sim, 3) as device_similarity,
    round(sc.card_sim, 3) as card_similarity,
    round(sc.ip_sim, 3) as ip_similarity,
    round(sc.total_sim, 3) as total_similarity,
    case
        when sc.total_sim > 0.7 then '高风险'
        when sc.total_sim > 0.4 then '中风险'
        else '低风险'
    end as risk_level
from scored sc
join node u on sc.target_user = u.node_id
join node bu on sc.black_user = bu.node_id
-- node ids break ties so "limit 5" always returns the same rows
order by total_similarity desc, u.node_id, bu.node_id
limit 5;

效果：

四、项目实战

基于mysql搭建的图数据库，模拟实现好友推荐功能。

数据准备：

-- Start from an empty graph. The edge rows below use hard-coded node ids
-- 1-8, so the node auto-increment counter must restart at 1; leftover rows
-- from the earlier examples would also leave several nodes named 张三, and
-- the recommendation query resolves the target user by name.
delete from edge;
delete from node;
alter table node auto_increment = 1;
alter table edge auto_increment = 1;

-- Create the users (ids 1-8 in the order listed).
insert into node (properties) values
('{"type":"user","name":"张三","age":25,"city":"北京"}'),
('{"type":"user","name":"李四","age":28,"city":"北京"}'),
('{"type":"user","name":"王五","age":30,"city":"上海"}'),
('{"type":"user","name":"赵六","age":26,"city":"广州"}'),
('{"type":"user","name":"钱七","age":27,"city":"深圳"}'),
('{"type":"user","name":"jack","age":18,"city":"杭州"}'),
('{"type":"user","name":"tom","age":45,"city":"贵州"}'),
('{"type":"user","name":"mike","age":35,"city":"上海"}');

-- Create the directed "friend" edges (source_id -> target_id).
insert into edge (source_id, target_id, properties) values
(1,2, '{"type":"friend"}'),
(1,3, '{"type":"friend"}'),
(2,4, '{"type":"friend"}'),
(3,5, '{"type":"friend"}'),
(4,5, '{"type":"friend"}'),
(6,7, '{"type":"friend"}'),
(7,8, '{"type":"friend"}');

具体实现

-- Friend recommendation for 张三: return up to 3 users who are not already
-- friends, ranked by
--   recommendation_score = 0.6 * (number of common friends) + 0.4 * (same city ? 1 : 0)
-- Friendship is read along the stored edge direction (source -> target).

-- The user we recommend for. The scalar subqueries below expect exactly one
-- node with this name.
with target_user as (
    select
        node_id,
        properties->>'$.city' as city
    from node
    where properties->>'$.name' = '张三'
),
-- Nodes the target user is already friends with.
existing_friends as (
    select target_id
    from edge
    where source_id = (select node_id from target_user)
    and properties->>'$.type' = 'friend'
),
-- Friends of friends, with the number of distinct friends that lead to each.
-- count(distinct ...) keeps a duplicated friend edge from being counted twice.
common_friends as (
    select
        f2.target_id as candidate_id,
        count(distinct f1.target_id) as common_friend_count
    from edge f1
    join edge f2 on f1.target_id = f2.source_id
    where f1.source_id = (select node_id from target_user)
    and f1.properties->>'$.type' = 'friend'
    and f2.properties->>'$.type' = 'friend'
    and f2.target_id != (select node_id from target_user)  -- never recommend the user to themselves
    and not exists (  -- skip people who are already friends
        select 1 from existing_friends ef where ef.target_id = f2.target_id
    )
    group by f2.target_id
),
-- Nodes in the same city as the target user (excluding self and existing friends).
same_city as (
    select
        n.node_id as candidate_id,
        1 as same_city_score
    from node n
    where n.properties->>'$.city' = (select city from target_user)
    and n.node_id != (select node_id from target_user)
    and not exists (
        select 1 from existing_friends ef where ef.target_id = n.node_id
    )
),
-- Every candidate produced by either signal, once.
candidate as (
    select candidate_id from common_friends
    union
    select candidate_id from same_city
),
-- Combine both signals; a missing signal contributes 0.
final_candidates as (
    select
        c.candidate_id,
        coalesce(cf.common_friend_count, 0) as common_friends,
        coalesce(sc.same_city_score, 0) as same_city,
        coalesce(cf.common_friend_count, 0) * 0.6 +
        coalesce(sc.same_city_score, 0) * 0.4 as recommendation_score
    from candidate c
    left join common_friends cf on cf.candidate_id = c.candidate_id
    left join same_city sc on sc.candidate_id = c.candidate_id
)
select
    n.properties->>'$.name' as recommended_name,
    fc.common_friends,
    fc.same_city,
    fc.recommendation_score
from final_candidates fc
join node n on fc.candidate_id = n.node_id
-- equal scores are ordered by node id so "limit 3" is deterministic
order by fc.recommendation_score desc, fc.candidate_id
limit 3;

效果展示

可以看到最后只给张三推荐了赵六和钱七，并没有推荐tom、jack等用户。