1.数据集的最终形式
不用再去翻文档找对应的故障类型,采样率等信息了,所有的信息自包含在.json文件里,15个测试例,一个测试例对应一整个.json文件。
{
"dataset": {
"name": "XJTU-SY_Bearing_Datasets",
"memo": "PHM vibration dataset - bearing",
"author": "xjtu_sy",
"release_date": "2018",
"reference of dataset": "https://gr.xjtu.edu.cn/web/yaguolei",
"DOI": "10.1109/TR.2018.2882682"
},
"test_case": {
"tag": "35Hz12kN\\Bearing1_2",
"scale_of_sensor": "g",
"saps": 25600,
"len_of_saps": 32768,
"shaft_rotation_speed_in_hz": 35,
"load_rating_in_N": 12000,
"fault_reason": [
"Outer race"
],
"sample_cnt": 161
},
"sample_data_desc": {
"bin_data.desc": "[hexStrOfHorizon, hexStrOfVertical] or [hexStrOfSample]",
"sample_data_raw.bin_data.type": "float32",
"sample_data_raw.bin_data.storageType": "hex_in_8bytes_from_float32"
},
"aux_info": {
"auto_generated_date": "2025-05-28 18:32:50",
"auto_generated_by": "twicave",
"json_fmt_version": "1.0.20250528",
"key_word": [
"float32",
"g",
"hex",
"h",
"v",
"bearing"
]
},
"sample_data_raw": [
{
"time": "2018-02-15 08:00:00",
"bin_data": [
"3E20E6A2BE15A23D3E3CCB41BDA01B86",
"3E20E6A2BE15A23D3E3CCB41BDA01B86"
]
}
]
}
1.1 额外的好处 - 占用空间更小
1.2 使用这个数据集
1.2.1 使用数据集 - 代码片段
# Minimal usage example: plot record 20 of test case 13 (horizontal channel).
import gp_dataset_xjtu_sy as xjtu_json
import matplotlib.pyplot as plt  # was missing: `plt` was used but never imported

app = xjtu_json.gp_dataset_xjtu_sy()
case, record = 13, 20
y = app.get_record_of(case, record, 'H')
y = y[0]  # a list of channel arrays is always returned (to support quadrature sampling); take the first
# was: self.get_saps_of(case_idxBase1) — `self` and `case_idxBase1` are undefined in a script
x = app.get_x_axis(y, app.get_saps_of(case))
memo = f'xjtu-sy case{case}.record{record}'  # was undefined in the original snippet
plt.plot(x, y, label=memo)
plt.xlabel('time(s)')
plt.ylabel('acc(g)')
plt.legend()
plt.show()
1.2.2 简单的采样值显示
附录A 相关源码
留神saps,最早的发布错误地将其设置为256000,应该是25600
1.使用.json数据集 - 辅助函数
import json, pymysql
from tkinter import FIRST
import gp_dataset_define
import datetime
import copy
import csv
import os
import numpy as np
import re
import struct
import glog as log
import gp_dataset_import_xjtu_sy
import matplotlib.pyplot as plt
#.json格式数据集读取函数
class gp_dataset_xjtu_sy:
    """Reader for the XJTU-SY bearing dataset converted to self-contained .json files.

    One test case corresponds to one .json file (up to 15 files); all metadata
    (fault type, sampling rate, ...) is embedded in the file itself.
    """

    def __init__(self, dataset_path=r'D:\DataSet\Vibration\xjtu_sy_bearing_2018'):
        # dataset_path: directory holding the per-case .json files (fewer than 15 is fine)
        self.path_of_dataset = dataset_path
        self.gp_json_filenames = self.get_dataset_cases()
        self.jsonCases = {}   # cache: case sequence number (str) -> parsed json object
        self.importer = gp_dataset_import_xjtu_sy.gp_dataset_importer()
        self.records = {}     # cache: 'case.record.channel' -> list of float arrays

    def get_dataset_cases(self):
        """Map each case sequence number (as a str) to its .json file path.

        Filename sample: dataset_xjtu_sy_02_35Hz12kN_Bearing1_2_raw_sample.json
        """
        arJsonFiles = {}
        for fname in os.listdir(self.path_of_dataset):
            if not fname.endswith(".json"):
                continue
            if not fname.startswith("dataset_xjtu_sy_"):
                continue
            match = re.search(r'_(\d{2})_', fname)
            if match is None:
                # skip files without a two-digit sequence number
                # (the original crashed here on match.group)
                continue
            sequence_number = int(match.group(1))  # e.g. '_02_' -> 2
            arJsonFiles[f'{sequence_number}'] = os.path.join(self.path_of_dataset, fname)
        return arJsonFiles

    def get_case(self, case_idxBase1):
        """Return the parsed json object of one test case (1-based), or {} if absent.

        Callers may then navigate the object per the gp_dataset_define schema.
        """
        chCase = f"{case_idxBase1}"
        if chCase in self.jsonCases:
            return self.jsonCases[chCase]
        if chCase in self.gp_json_filenames:
            with open(self.gp_json_filenames[chCase], 'r') as f:
                # tolerate '//' comment lines even though strict json forbids them
                content = ''.join(line for line in f if not re.match(r'\s*//', line))
            self.jsonCases[chCase] = json.loads(content)
        return self.jsonCases.get(chCase, {})

    def get_saps_of(self, case_idxBase1):
        """Shortcut: sampling rate of one test case."""
        return self.get_case(case_idxBase1)["test_case"]["saps"]

    def get_records_cnt_of(self, case_idxBase1):
        """Shortcut: total record count of one test case."""
        return self.get_case(case_idxBase1)["test_case"]["sample_cnt"]

    def get_phm_fault_reason_of(self, case_idxBase1):
        """Shortcut: fault reason list of one test case."""
        return self.get_case(case_idxBase1)["test_case"]["fault_reason"]

    def get_record_of(self, case_idxBase1, records_idxBase0, data_type="h"):
        """Return one record as a list of float32 channel arrays.

        data_type: 'h'/'H' -> horizontal channel only, 'v'/'V' -> vertical only,
        anything else -> both channels [horizontal, vertical].
        """
        channel = data_type.capitalize()
        keyOfRecords = f'{case_idxBase1}.{records_idxBase0}.{channel}'
        if keyOfRecords in self.records:
            return self.records[keyOfRecords]
        jsonRec = self.get_case(case_idxBase1)["sample_data_raw"][records_idxBase0]
        if channel == "H":
            hexRecords = [jsonRec["bin_data"][0]]
        elif channel == "V":
            hexRecords = [jsonRec["bin_data"][1]]
        else:
            hexRecords = [jsonRec["bin_data"][0], jsonRec["bin_data"][1]]
        floatAr = [self.importer.hex_to_float_array(h) for h in hexRecords]
        self.records[keyOfRecords] = floatAr
        return floatAr

    def draw_one_sample(self, case_idxBase1, records_idxBase0, memo, data_type="h"):
        """Plot one channel of one record against time in seconds."""
        y = self.get_record_of(case_idxBase1, records_idxBase0, data_type)[0]
        x = self.get_x_axis(y, self.get_saps_of(case_idxBase1))
        plt.plot(x, y, label=memo)
        plt.xlabel('time(s)')
        plt.ylabel('acc(g)')
        plt.legend()
        plt.show()

    def get_x_axis(self, sample, saps):
        """Return the time axis (seconds) for `sample` at sampling rate `saps`."""
        return [i / saps for i in range(len(sample))]

    def unit_test(self):
        """Smoke test: plot the last record of test case 11."""
        case = 11  # the original mixed the hard-coded 11 with this variable
        record = self.get_records_cnt_of(case) - 1
        memo = f'xjtu-sy case{case}.record{record}.fault reason={self.get_phm_fault_reason_of(case)}'
        self.draw_one_sample(case, record, memo)
if __name__ == "__main__":
    # Run the smoke test only when executed as a script; importing this module
    # previously opened a dataset and popped a plot window as a side effect.
    app = gp_dataset_xjtu_sy()
    app.unit_test()
2.将原始数据集转换程序
2.1 将原始分目录存放的.csv形式转换至.json格式
import json, pymysql
from tkinter import FIRST
import gp_dataset_define
import datetime
import copy
import csv
import os
import numpy as np
import re
import struct
import glog as log
class gp_dataset_importer:
    """Convert the original per-folder .csv XJTU-SY dataset into per-case .json files."""

    def __init__(self, dataset_path=r"D:\DataSet\Vibration\xjtu_sy_bearing_2018"):
        self.gp_dataset_template = gp_dataset_define.gp_dataset
        self.test_condition_desc = gp_dataset_define.test_condition_desc
        self.gp_bearing_info = gp_dataset_define.gp_bearing_info
        self.original_dataset_path = dataset_path
        # The raw records carry no timestamps; synthesize one per minute from this anchor.
        self.time_of_begin_of_sample = datetime.datetime(2018, 2, 15, 8, 0, 0)
        self.timespan_of_sample = datetime.timedelta(minutes=1)

    def get_strOf_time(self, dt):
        """Format a datetime as 'YYYY-mm-dd HH:MM:SS'."""
        return dt.strftime("%Y-%m-%d %H:%M:%S")

    def sanitize_filename(self, input_str):
        """Replace filename-unsafe characters (. , / \\) with '_'."""
        # [.,/\\] matches dot, comma, slash, backslash (the original doubled the
        # backslash escape redundantly; behavior is identical).
        return re.sub(r"[.,/\\]", "_", input_str)

    def float_array_to_hex(self, float_array, precision='single'):
        """Pack a float array into one tight uppercase hex string (no separators).

        :param float_array: input floats
        :param precision: 'single' (4-byte) or 'double' (8-byte)
        :return: concatenated uppercase hex string
        """
        fmt = '>f' if precision == 'single' else '>d'  # big-endian for byte-order stability
        return ''.join(struct.pack(fmt, num).hex().upper() for num in float_array)

    def hex_to_float_array(self, hex_str, precision='single'):
        """Unpack a tight hex string (as produced by float_array_to_hex) into floats.

        :raises ValueError: if the string length is not a multiple of one value's width
        """
        byte_length = 4 if precision == 'single' else 8
        hex_length = byte_length * 2  # two hex chars per byte
        if len(hex_str) % hex_length != 0:
            raise ValueError("Hex string length does not match precision")
        fmt = '>f' if precision == 'single' else '>d'  # big-endian
        return [
            struct.unpack(fmt, bytes.fromhex(hex_str[i:i + hex_length]))[0]
            for i in range(0, len(hex_str), hex_length)
        ]

    # Example: float2hex(26.3, 'single') -> '41D26666'
    #          float2hex(3.1415926535, 'double') -> '400921FB54442EEA'
    def float2hex(self, num, precision='single'):
        """Convert one float to an uppercase hex string (no prefix).

        :param precision: 'single' (32-bit) or 'double' (64-bit)
        :raises ValueError: on any other precision value
        """
        if precision == 'single':
            packed = struct.pack('>f', num)  # big-endian single
        elif precision == 'double':
            packed = struct.pack('>d', num)  # big-endian double
        else:
            raise ValueError("precision 必须是 'single' 或 'double'")
        return packed.hex().upper()

    # Example: hex2float('41D26666', 'single') -> 26.299999237060547 (precision loss)
    def hex2float(self, hex_str, precision='single'):
        """Convert a hex string (no prefix) back to a float.

        :param precision: 'single' or 'double'
        :raises ValueError: on any other precision value
        """
        byte_data = bytes.fromhex(hex_str)
        if precision == 'single':
            return struct.unpack('>f', byte_data)[0]
        elif precision == 'double':
            return struct.unpack('>d', byte_data)[0]
        else:
            raise ValueError("precision 必须是 'single' 或 'double'")

    def deep_merge(self, source, overrides):
        """Recursively merge dict `overrides` into dict `source` (in place); return source.

        Bug fix: the recursive call lacked `self.` and raised NameError on any
        nested dict.
        """
        for key, value in overrides.items():
            if isinstance(value, dict) and isinstance(source.get(key), dict):
                source[key] = self.deep_merge(source[key], value)
            else:
                source[key] = value
        return source

    def DealAllTestCase(self):
        """Convert every test case listed in test_condition_desc."""
        log.info(f">>>>>start xjtu-sy bearing dataset transfer...{datetime.datetime.now()}")
        for n in range(len(self.test_condition_desc["test_case"])):
            self.DealOneTestCase(n)

    def DealOneTestCase(self, snOfTestCase):
        """Convert one test case (0-based index into test_condition_desc) to a .json file."""
        if snOfTestCase < 0:
            return
        if snOfTestCase >= len(self.test_condition_desc["test_case"]):
            return
        # Each entry is {'<condition>\\<bearing>': {...params...}} — extract the
        # single key; the original referenced the undefined names `subPathOfTest`
        # and `firstKeyName` and indexed params on the wrapper dict (NameError/KeyError).
        caseEntry = self.test_condition_desc["test_case"][snOfTestCase]
        subPathOfTest = next(iter(caseEntry))   # e.g. r'35Hz12kN\Bearing1_2'
        testCase = caseEntry[subPathOfTest]
        # The .csv files of each case live in its own sub-folder named after the
        # tag — presumably matching the original dataset layout; TODO confirm.
        folder_path = os.path.join(self.original_dataset_path, subPathOfTest)
        csv_count = len([f for f in os.listdir(folder_path) if f.endswith(".csv")])
        samples = []
        timeAnchor = copy.deepcopy(self.time_of_begin_of_sample)
        lenOfRealSamples = csv_count
        # Verify the record count against the documented value.
        log.info(f'开始转换 {subPathOfTest}....{self.get_strOf_time(datetime.datetime.now())}')
        log.info(f'理论记录数:{testCase["sample_cnt"]}, 实际数据条数:{lenOfRealSamples}, 数据完整:{lenOfRealSamples==testCase["sample_cnt"]}')
        for sn in range(csv_count):
            # Source files are named 1.csv, 2.csv, ... in acquisition order.
            csvFileName = os.path.join(folder_path, f"{sn+1}.csv")
            h, v = self.LoadOneSampleOfTestCase(csvFileName)
            samples.append({
                "time": self.get_strOf_time(timeAnchor),
                "bin_data": [self.float_array_to_hex(h), self.float_array_to_hex(v)],
            })
            timeAnchor += self.timespan_of_sample
            log.info(f"{subPathOfTest}...{sn+1:04d}")
        log.info("")
        # Assemble the output json from the template plus per-case parameters.
        jsonOut = copy.deepcopy(self.gp_dataset_template)
        jsonOut["aux_info"]["auto_generated_date"] = self.get_strOf_time(datetime.datetime.now())
        jsonOut["aux_info"]["auto_generated_by"] = "twicave"
        jsonOut["test_case"]["tag"] = subPathOfTest
        jsonOut["test_case"] = self.deep_merge(jsonOut["test_case"], testCase)
        jsonOut["sample_data_raw"] = samples
        memo = self.sanitize_filename(subPathOfTest)
        with open(f'dataset_xjtu_sy_{snOfTestCase+1:02d}_{memo}_raw_sample.json', 'w') as f:
            json.dump(jsonOut, f)

    def LoadOneSampleOfTestCase(self, file_path_of_csv):
        """Read one record .csv; return ([horizontal floats], [vertical floats])."""
        horizontal_signals = []
        vertical_signals = []
        with open(file_path_of_csv, newline='') as csvfile:
            reader = csv.reader(csvfile)
            next(reader)  # skip the header row
            for row in reader:
                horizontal_signals.append(float(row[0]))
                vertical_signals.append(float(row[1]))
        return (horizontal_signals, vertical_signals)
2.2 转换过程中使用的参数
# Geometry of the LDK UER204 bearing used on the XJTU-SY test rig.
# Diameters appear to be metres (e-3 scale) and load ratings newtons (e3
# scale) — TODO confirm units against the dataset paper.
gp_bearing_info = {
"Type":"LDK UER204",
"Outer race diameter":39.80e-3,
"Inner race diameter":29.30e-3,
"Bearing mean diameter":34.55e-3,
"Ball diameter":7.92e-3,
"Number of Ball":8,
"Contact angle":0,
"Load rating(static)":6.65e3,
"Load rating(dynamic)":12.82e3
}
# Per-test-case parameters: each entry maps the case tag (sub-folder path,
# '<condition>\<bearing>') to its operating condition and documented record count.
# Fix: the original keys used single backslashes ('\B' is an invalid escape
# sequence — deprecated, SyntaxWarning in Python 3.12+); escaping them keeps
# the runtime strings byte-identical.
test_condition_desc = {"test_case":[
{"35Hz12kN\\Bearing1_1":{"shaft_rotation_speed_in_hz":35, "load_rating_in_N":12e3, "fault_reason":["Outer race"], "sample_cnt":123}},
{"35Hz12kN\\Bearing1_2":{"shaft_rotation_speed_in_hz":35, "load_rating_in_N":12e3, "fault_reason":["Outer race"], "sample_cnt":161}},
{"35Hz12kN\\Bearing1_3":{"shaft_rotation_speed_in_hz":35, "load_rating_in_N":12e3, "fault_reason":["Outer race"], "sample_cnt":158}},
{"35Hz12kN\\Bearing1_4":{"shaft_rotation_speed_in_hz":35, "load_rating_in_N":12e3, "fault_reason":["Cage"], "sample_cnt":122}},
{"35Hz12kN\\Bearing1_5":{"shaft_rotation_speed_in_hz":35, "load_rating_in_N":12e3, "fault_reason":["Inner race", "Outer race"], "sample_cnt":52}},
{"37.5Hz11kN\\Bearing2_1":{"shaft_rotation_speed_in_hz":37.5, "load_rating_in_N":11e3, "fault_reason":["Inner race"], "sample_cnt":491}},
{"37.5Hz11kN\\Bearing2_2":{"shaft_rotation_speed_in_hz":37.5, "load_rating_in_N":11e3, "fault_reason":["Outer race"], "sample_cnt":161}},
{"37.5Hz11kN\\Bearing2_3":{"shaft_rotation_speed_in_hz":37.5, "load_rating_in_N":11e3, "fault_reason":["Cage"], "sample_cnt":533}},
{"37.5Hz11kN\\Bearing2_4":{"shaft_rotation_speed_in_hz":37.5, "load_rating_in_N":11e3, "fault_reason":["Outer race"], "sample_cnt":42}},
{"37.5Hz11kN\\Bearing2_5":{"shaft_rotation_speed_in_hz":37.5, "load_rating_in_N":11e3, "fault_reason":["Outer race"], "sample_cnt":339}},
{"40Hz10kN\\Bearing3_1":{"shaft_rotation_speed_in_hz":40, "load_rating_in_N":10e3, "fault_reason":["Outer race"], "sample_cnt":2538}},
{"40Hz10kN\\Bearing3_2":{"shaft_rotation_speed_in_hz":40, "load_rating_in_N":10e3, "fault_reason":["Inner race", "Ball", "Cage","Outer race"], "sample_cnt":2496}},
{"40Hz10kN\\Bearing3_3":{"shaft_rotation_speed_in_hz":40, "load_rating_in_N":10e3, "fault_reason":["Inner race"], "sample_cnt":371}},
{"40Hz10kN\\Bearing3_4":{"shaft_rotation_speed_in_hz":40, "load_rating_in_N":10e3, "fault_reason":["Inner race"], "sample_cnt":1515}},
{"40Hz10kN\\Bearing3_5":{"shaft_rotation_speed_in_hz":40, "load_rating_in_N":10e3, "fault_reason":["Outer race"], "sample_cnt":114}}
]}
# Template for one output .json case file. A deep copy of this dict is filled
# in per test case and serialized; key order is preserved in the output json,
# so do not reorder.
gp_dataset = {
"dataset":
{
"name": "XJTU-SY_Bearing_Datasets",
"memo": "PHM vibration dataset - bearing",
"author": "xjtu_sy",
"release_date": "2018",
"reference of dataset": "https://gr.xjtu.edu.cn/web/yaguolei",
"DOI": "10.1109/TR.2018.2882682"
},
"test_case":
{
"tag":"35Hz12kN\\Bearing1_1",
"scale_of_sensor": "g",
"saps": 25600,  # sampling rate in Hz (the earliest release wrongly used 256000)
"len_of_saps": 32768,
"shaft_rotation_speed_in_hz": 35,
"load_rating_in_N": 12000.0,
"fault_reason": ["Outer race"],
"sample_cnt": 123
},
"sample_data_desc":
{
"bin_data.desc": "[hexStrOfHorizon, hexStrOfVertical] or [hexStrOfSample]",
"sample_data_raw.bin_data.type": "float32",
"sample_data_raw.bin_data.storageType": "hex_in_8bytes_from_float32"
},
"aux_info":
{
"auto_generated_date": "2025-05-28 17:49:13",
"auto_generated_by": "twicave",
"json_fmt_version": "1.0.20250528",
"key_word":["float32", "g", "hex", "h", "v", "bearing"]
},
"sample_data_raw":[{"time":"2025-05-28 15:23:00", "bin_data":[]}]
}
2.3.转换过程截屏
I0528 18:37:03.843738 6128 gp_dataset_import_xjtu_sy.py:135] 37.5Hz11kN\Bearing2_1...0489
I0528 18:37:04.172952 6128 gp_dataset_import_xjtu_sy.py:135] 37.5Hz11kN\Bearing2_1...0490
I0528 18:37:04.488946 6128 gp_dataset_import_xjtu_sy.py:135] 37.5Hz11kN\Bearing2_1...0491
I0528 18:37:04.488946 6128 gp_dataset_import_xjtu_sy.py:136]
I0528 18:37:05.193193 6128 gp_dataset_import_xjtu_sy.py:124] 开始转换 37.5Hz11kN\Bearing2_2....2025-05-28 18:37:05
I0528 18:37:05.193193 6128 gp_dataset_import_xjtu_sy.py:125] 理论记录数:161, 实际数据条数:161, 数据完整:True
I0528 18:37:05.472151 6128 gp_dataset_import_xjtu_sy.py:135] 37.5Hz11kN\Bearing2_2...0001
I0528 18:37:05.752146 6128 gp_dataset_import_xjtu_sy.py:135] 37.5Hz11kN\Bearing2_2...0002
I0528 18:37:06.029141 6128 gp_dataset_import_xjtu_sy.py:135] 37.5Hz11kN\Bearing2_2...0003
I0528 18:37:06.312358 6128 gp_dataset_import_xjtu_sy.py:135] 37.5Hz11kN\Bearing2_2...0004