在做接口对接时,对方提交过来的数据中存在重复数据,这时可以通过 Python 轻松提取出来。
syncdevice_2026-01-07.log
2026-01-07 11:41:33 | [{"devicemac":"ed:0c:51:c2:b2:ea","devicesn":"240103 p50162"},{"devicemac":"c0:7a:a1:6c:67:aa","devicesn":"221130p50012"},{"devicemac":"d0:d0:02:39:83:d4","devicesn":"221130p50012"}]
2026-01-07 11:41:33 | [{"devicemac":"d0:d0:02:39:83:d4","devicesn":"221130p50012"},{"devicemac":"dc:8e:33:ba:3d:6d","devicesn":"221130p50013"}]
python 代码如下:
import json
import re
from collections import defaultdict
def find_duplicate_devices_unique_mac(log_file_path):
    """Scan a sync log and return every devicesn that maps to more than one
    distinct devicemac.

    Each log line may carry a JSON array of device objects after a timestamp;
    devices are grouped by 'devicesn', and within each group records with a
    devicemac already seen for that sn are dropped.

    Args:
        log_file_path: path to the UTF-8 log file.

    Returns:
        dict mapping each duplicated devicesn to its list of unique-mac
        device dicts (only groups with more than one record are included).
    """
    devices_by_sn = defaultdict(list)
    mac_seen_by_sn = defaultdict(set)  # macs already recorded per devicesn

    with open(log_file_path, 'r', encoding='utf-8') as file:
        for line_num, line in enumerate(file, 1):
            # Extract the JSON array portion of the line, if any.
            match = re.search(r'\[.*\]', line)
            if not match:
                continue
            try:
                devices = json.loads(match.group())
            except json.JSONDecodeError as e:  # fixed: was json.jsondecodeerror (AttributeError)
                print(f"第 {line_num} 行解析 json 时出错: {e}")
                continue
            for device in devices:
                device_sn = device.get('devicesn')
                device_mac = device.get('devicemac')
                # Keep the record only the first time this mac shows up under this sn.
                if device_sn and device_mac and device_mac not in mac_seen_by_sn[device_sn]:
                    devices_by_sn[device_sn].append(device)
                    mac_seen_by_sn[device_sn].add(device_mac)

    # Only sns that still have multiple records after mac dedup count as duplicates.
    return {sn: devs for sn, devs in devices_by_sn.items() if len(devs) > 1}
def print_duplicate_devices(duplicate_devices):
    """Pretty-print the duplicate-sn report produced by
    find_duplicate_devices_unique_mac.

    Args:
        duplicate_devices: mapping of devicesn -> list of device dicts;
            an empty mapping prints a "none found" notice.
    """
    if not duplicate_devices:
        print("没有找到重复的 devicesn")
        return
    print("找到以下重复的 devicesn (已对 devicemac 去重):\n")
    for device_sn, devices in duplicate_devices.items():
        print(f"devicesn: {device_sn} (去重后出现 {len(devices)} 次)")
        print("-" * 50)
        for i, device in enumerate(devices, 1):
            print(f"第 {i} 条记录:")
            # fixed: ensure_ascii=false was a NameError; False keeps non-ASCII text readable
            print(json.dumps(device, indent=2, ensure_ascii=False))
            print()
        print("=" * 80)
# 版本2:更简洁的实现,直接输出去重结果
def find_and_print_duplicates_unique(log_file):
    """Concise variant: scan *log_file*, group devices by devicesn with
    devicemac de-duplicated, and print every sn that still has multiple
    records.

    Args:
        log_file: path to the UTF-8 log file.
    """
    import datetime  # hoisted: was re-imported inside the print loop

    unique_devices_by_sn = defaultdict(list)
    seen_mac_by_sn = defaultdict(set)

    with open(log_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Extract the JSON array portion of the line, if any.
            json_match = re.search(r'\[.*\]', line)
            if not json_match:
                continue
            try:
                devices = json.loads(json_match.group())
            except json.JSONDecodeError:  # fixed: bare except hid unrelated errors
                continue
            for device in devices:
                sn = device.get('devicesn')
                mac = device.get('devicemac')
                # First occurrence of this mac under this sn wins.
                if sn and mac and mac not in seen_mac_by_sn[sn]:
                    unique_devices_by_sn[sn].append(device)
                    seen_mac_by_sn[sn].add(mac)

    print("重复的设备sn及其数据 (已对devicemac去重):")
    print("=" * 80)
    found_duplicates = False  # fixed: 'false' is not a Python name
    for sn, devices in unique_devices_by_sn.items():
        if len(devices) > 1:
            found_duplicates = True  # fixed: 'true' is not a Python name
            print(f"\n设备sn: {sn} (去重后出现 {len(devices)} 次)")
            print("-" * 50)
            for i, device in enumerate(devices, 1):
                print(f"记录 {i}:")
                if 'productiondate' in device:
                    # productiondate looks like a millisecond epoch value — TODO confirm with data source
                    timestamp = device['productiondate'] / 1000
                    dt = datetime.datetime.fromtimestamp(timestamp)
                    # fixed: strftime directives must be %Y/%H/%M/%S ('%h' is invalid for hours)
                    device['productiondate_formatted'] = dt.strftime('%Y-%m-%d %H:%M:%S')
                print(json.dumps(device, indent=2, ensure_ascii=False))
                print()
    if not found_duplicates:
        print("没有找到重复的 devicesn (或所有重复都是相同的 devicemac)")
# 主程序
# Entry point: run both report variants against the day's sync log.
if __name__ == "__main__":
    log_file_path = "syncdevice_2026-01-07.log"
    try:
        print("=" * 80)
        print("方法1:详细版")
        print("=" * 80)
        # Detailed report: collect duplicates first, then print and summarize.
        duplicate_devices = find_duplicate_devices_unique_mac(log_file_path)
        print_duplicate_devices(duplicate_devices)
        print("\n统计信息:")
        print(f"总共有 {len(duplicate_devices)} 个重复的 devicesn")
        for device_sn, devices in duplicate_devices.items():
            print(f"  - {device_sn}: {len(devices)} 条不重复的记录")
        print("\n" + "=" * 80)
        print("方法2:简洁版")
        print("=" * 80)
        find_and_print_duplicates_unique(log_file_path)
    except FileNotFoundError:  # fixed: was lowercase 'filenotfounderror' (NameError)
        print(f"错误: 找不到文件 {log_file_path}")
    except Exception as e:  # fixed: was lowercase 'exception' (NameError)
        print(f"处理文件时出错: {e}")
输出结果:
================================================================================
方法1:详细版
================================================================================
找到以下重复的 devicesn (已对 devicemac 去重):
devicesn: 221130p50012 (去重后出现 2 次)
--------------------------------------------------
第 1 条记录:
{
"devicemac": "c0:7a:a1:6c:67:aa",
"devicesn": "221130p50012"
}
第 2 条记录:
{
"devicemac": "d0:d0:02:39:83:d4",
"devicesn": "221130p50012"
}
================================================================================
统计信息:
总共有 1 个重复的 devicesn
- 221130p50012: 2 条不重复的记录
================================================================================
方法2:简洁版
================================================================================
重复的设备sn及其数据 (已对devicemac去重):
================================================================================
设备sn: 221130p50012 (去重后出现 2 次)
--------------------------------------------------
记录 1:
{
"devicemac": "c0:7a:a1:6c:67:aa",
"devicesn": "221130p50012"
}
记录 2:
{
"devicemac": "d0:d0:02:39:83:d4",
"devicesn": "221130p50012"
}
process finished with exit code 0
到此,这篇关于利用 Python 轻松找出同步日志中重复数据的文章就介绍到这了。更多相关 Python 查找同步日志中重复数据的内容,请搜索代码网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持代码网!
发表评论