# 数据可视化模拟题1-数据生成 (Data visualization mock exam 1 - data generation)

# -*- coding: utf-8 -*-
"""
医疗健康数据模拟生成脚本
生成门诊数据和住院数据，并注入脏数据用于数据清洗练习
"""

import numpy as np
import pandas as pd
import os

# Seed NumPy's global random generator so repeated runs produce the same data
np.random.seed(42)

# Directory that the generated CSV files are written into
OUTPUT_DIR = "./第一套_医疗健康数据分析"

# ============================================================
# 第一部分：生成门诊数据 hospital_outpatient.csv
# ============================================================

def generate_outpatient_data(n=12000):
    """Build the simulated outpatient visit table, dirty records included.

    All randomness comes from NumPy's global ``np.random`` state, so the
    result is reproducible only when the caller seeds that state first.

    Args:
        n: Number of base visit records to simulate.

    Returns:
        pandas.DataFrame with the columns ``visit_id``, ``visit_date``
        (stored as strings), ``department``, ``patient_age``,
        ``patient_gender``, ``doctor_name``, ``diagnosis`` and ``cost``.
        It holds ``n + int(n * 0.01)`` rows because duplicated rows are
        appended at the end.
    """
    # --- Reference tables ---
    doctors = ["张伟", "李明", "王芳", "刘洋", "陈静",
               "赵磊", "孙丽", "周强", "吴敏", "郑涛"]

    # Diagnoses that may be drawn for each department
    dept_diagnoses = {
        "内科": ["上呼吸道感染", "高血压", "糖尿病", "胃炎", "支气管炎", "冠心病", "贫血", "甲状腺功能异常"],
        "外科": ["阑尾炎", "胆结石", "疝气", "骨折", "软组织损伤", "静脉曲张", "肠梗阻", "甲状腺结节"],
        "儿科": ["小儿感冒", "手足口病", "小儿腹泻", "过敏性紫癜", "小儿肺炎", "腮腺炎", "水痘", "百日咳"],
        "妇产科": ["月经不调", "子宫肌瘤", "盆腔炎", "先兆流产", "异位妊娠", "卵巢囊肿", "宫颈炎", "妊娠期糖尿病"],
        "急诊科": ["急性阑尾炎", "急性心肌梗死", "脑卒中", "急性中毒", "创伤", "急性哮喘", "消化道出血", "休克"]
    }
    departments = list(dept_diagnoses)

    # Non-canonical department spellings used as dirty values; they are
    # paired positionally with `departments`.
    dept_aliases = ["内科室", "外科室", "儿科门诊", "妇产科室", "急诊"]
    alias_of = dict(zip(departments, dept_aliases))

    # (low, high) bounds of the uniform fee distribution per department
    cost_ranges = {
        "内科": (50, 500),
        "外科": (100, 2000),
        "儿科": (30, 400),
        "妇产科": (80, 1500),
        "急诊科": (100, 3000)
    }

    # Relative visit volume per calendar month (winter months weigh most)
    month_weights = {
        1: 1.3,
        2: 1.2,
        3: 1.0,
        4: 0.9,
        5: 0.9,
        6: 0.95,
        7: 0.95,
        8: 0.9,
        9: 0.95,
        10: 1.0,
        11: 1.05,
        12: 1.3
    }
    # Same weights as an array indexed by month number (slot 0 is unused)
    weight_lookup = np.array([month_weights.get(m, 1.0) for m in range(13)])

    # Department probabilities, ordered like `departments`
    winter_probs = [0.25, 0.15, 0.15, 0.15, 0.30]   # emergency share rises
    summer_probs = [0.20, 0.15, 0.25, 0.15, 0.25]   # paediatrics share rises
    default_probs = [0.25, 0.20, 0.18, 0.17, 0.20]
    season_probs = {
        12: winter_probs, 1: winter_probs, 2: winter_probs,
        6: summer_probs, 7: summer_probs, 8: summer_probs,
    }

    # --- Visit dates: 2018-01-01 .. 2023-12-31, rejection-sampled by month ---
    first_day = np.datetime64("2018-01-01")
    last_day = np.datetime64("2023-12-31")
    span_days = int((last_day - first_day) / np.timedelta64(1, "D"))

    dates = []
    while len(dates) < n:
        batch_size = max(n - len(dates), 10000)
        offsets = np.random.randint(0, span_days + 1, size=batch_size)
        candidates = first_day + offsets.astype("timedelta64[D]")
        batch_weights = weight_lookup[pd.to_datetime(candidates).month.to_numpy()]
        # Keep each candidate with probability weight / largest weight in batch
        keep = np.random.random(batch_size) < batch_weights / batch_weights.max()
        dates.extend(candidates[keep].tolist())
    dates = dates[:n]

    # --- Remaining clean columns (one pass per column, in this order) ---
    dept_list = [
        np.random.choice(departments, p=season_probs.get(day.month, default_probs))
        for day in dates
    ]

    # Normal(45, 15) clipped to [0, 90], then truncated to integers
    ages = np.clip(np.random.normal(45, 15, n), 0, 90).astype(int)
    genders = np.random.choice(["男", "女"], size=n)

    doctor_list = []
    for _ in range(n):
        doctor_list.append(np.random.choice(doctors))

    diagnoses = []
    for dept in dept_list:
        diagnoses.append(np.random.choice(dept_diagnoses[dept]))

    raw_costs = []
    for dept in dept_list:
        low, high = cost_ranges[dept]
        raw_costs.append(np.random.uniform(low, high))
    costs = np.round(np.array(raw_costs), 2)

    # IDs look like V<year><6-digit running number>
    visit_ids = [f"V{day.year}{seq:06d}" for seq, day in enumerate(dates, start=1)]

    df = pd.DataFrame({
        "visit_id": visit_ids,
        "visit_date": dates,
        "department": dept_list,
        "patient_age": ages,
        "patient_gender": genders,
        "doctor_name": doctor_list,
        "diagnosis": diagnoses,
        "cost": costs
    })

    # --- Dirty data injection ---

    # 1. ~5% of visit_date values are rewritten as YYYY/MM/DD or YYYYMMDD
    date_dirty_mask = np.random.random(n) < 0.05
    for row in np.flatnonzero(date_dirty_mask):
        stamp = pd.Timestamp(df.loc[row, "visit_date"])
        sep = "/" if np.random.random() < 0.5 else ""
        df.loc[row, "visit_date"] = f"{stamp.year}{sep}{stamp.month:02d}{sep}{stamp.day:02d}"

    # 2. ~3% of cost values become NaN
    cost_nan_mask = np.random.random(n) < 0.03
    df.loc[cost_nan_mask, "cost"] = np.nan

    # 3. int(n * 0.02) of the non-NaN cost values are made negative
    negate_rows = np.zeros(n, dtype=bool)
    still_valid = np.flatnonzero(~cost_nan_mask)
    negate_rows[np.random.choice(still_valid, size=int(n * 0.02), replace=False)] = True
    df.loc[negate_rows, "cost"] = -df.loc[negate_rows, "cost"].abs()

    # 4. ~2% of department values become NaN or an alias spelling
    dept_dirty_mask = np.random.random(n) < 0.02
    for row in np.flatnonzero(dept_dirty_mask):
        blank_it = np.random.random() < 0.5
        df.loc[row, "department"] = np.nan if blank_it else alias_of[dept_list[row]]

    # 5. int(n * 0.01) randomly chosen rows are appended again as duplicates
    dup_positions = np.random.choice(n, size=int(n * 0.01), replace=False)
    df = pd.concat([df, df.iloc[dup_positions].copy()], ignore_index=True)

    # Store dates as text so the malformed formats survive unchanged
    df["visit_date"] = df["visit_date"].astype(str)

    return df

# ============================================================
# 第二部分：生成住院数据 hospital_inpatient.csv
# ============================================================

def generate_inpatient_data(n=6000):
    """Build the simulated inpatient stay table, dirty records included.

    All randomness comes from NumPy's global ``np.random`` state, so the
    result is reproducible only when the caller seeds that state first.

    Args:
        n: Number of inpatient records to simulate.

    Returns:
        pandas.DataFrame with ``n`` rows and the columns ``patient_id``,
        ``admission_date`` and ``discharge_date`` (both stored as strings),
        ``department``, ``patient_age``, ``patient_gender``, ``stay_days``,
        ``total_cost``, ``surgery`` and ``diagnosis``.
    """
    # --- Reference tables ---
    departments = ["内科", "外科", "骨科", "心内科", "神经内科"]

    # Diagnoses that may be drawn for each department
    dept_diagnoses = {
        "内科": ["肺炎", "慢性阻塞性肺疾病", "肝硬化", "肾功能不全", "糖尿病酮症酸中毒", "胃溃疡", "系统性红斑狼疮"],
        "外科": ["胆囊炎", "肠梗阻", "胃穿孔", "甲状腺癌", "乳腺癌", "肺癌", "结肠癌"],
        "骨科": ["股骨颈骨折", "腰椎间盘突出", "膝关节损伤", "肩周炎", "骨质疏松性骨折", "颈椎病"],
        "心内科": ["急性心肌梗死", "心力衰竭", "心房颤动", "高血压危象", "冠状动脉粥样硬化性心脏病", "心肌炎"],
        "神经内科": ["脑梗死", "脑出血", "帕金森病", "癫痫", "脑膜炎", "多发性硬化"]
    }

    # Fixed admission fee and per-day fee for each department
    dept_cost_params = {
        "内科":     {"base": 3000,  "daily": 500},
        "外科":     {"base": 8000,  "daily": 800},
        "骨科":     {"base": 10000, "daily": 600},
        "心内科":   {"base": 12000, "daily": 1000},
        "神经内科": {"base": 8000,  "daily": 700}
    }

    # Probability that a stay in the department includes surgery
    surgery_probs = {
        "内科": 0.1,
        "外科": 0.7,
        "骨科": 0.65,
        "心内科": 0.3,
        "神经内科": 0.15
    }

    # --- Clean columns ---

    # Admission dates are uniform over 2022-01-01 .. 2023-12-31
    first_day = np.datetime64("2022-01-01")
    last_day = np.datetime64("2023-12-31")
    span_days = int((last_day - first_day) / np.timedelta64(1, "D"))
    admission_offsets = np.random.randint(0, span_days + 1, size=n)
    admission_dates = first_day + admission_offsets.astype("timedelta64[D]")

    # Stays last 1..30 days; discharge date = admission date + stay length
    stay_days = np.random.randint(1, 31, size=n)
    discharge_dates = admission_dates + stay_days.astype("timedelta64[D]")

    dept_list = np.random.choice(departments, size=n)

    # Normal(55, 15) clipped to [18, 95], then truncated to integers
    ages = np.clip(np.random.normal(55, 15, n), 18, 95).astype(int)
    genders = np.random.choice(["男", "女"], size=n)

    surgery = [
        "是" if np.random.random() < surgery_probs[dept] else "否"
        for dept in dept_list
    ]

    diagnoses = []
    for dept in dept_list:
        diagnoses.append(np.random.choice(dept_diagnoses[dept]))

    # Cost = base fee + daily fee * days + uniform noise in [-500, 1000)
    cost_values = []
    for dept, days in zip(dept_list, stay_days):
        fees = dept_cost_params[dept]
        cost_values.append(fees["base"] + fees["daily"] * days + np.random.uniform(-500, 1000))
    total_costs = np.array(cost_values)
    # Stays with surgery get an extra uniform [5000, 20000) surcharge
    for pos, had_surgery in enumerate(surgery):
        if had_surgery == "是":
            total_costs[pos] += np.random.uniform(5000, 20000)
    total_costs = np.round(total_costs, 2)

    # IDs look like P<admission year><6-digit running number>
    admission_years = pd.to_datetime(admission_dates).year
    patient_ids = [
        f"P{year}{seq:06d}" for seq, year in enumerate(admission_years, start=1)
    ]

    df = pd.DataFrame({
        "patient_id": patient_ids,
        "admission_date": admission_dates,
        "discharge_date": discharge_dates,
        "department": dept_list,
        "patient_age": ages,
        "patient_gender": genders,
        "stay_days": stay_days,
        "total_cost": total_costs,
        "surgery": surgery,
        "diagnosis": diagnoses
    })

    # --- Dirty data injection ---

    # 1. ~3% of ages become implausible: 0, or a value in 151..199
    age_dirty_mask = np.random.random(n) < 0.03
    for row in np.flatnonzero(age_dirty_mask):
        use_zero = np.random.random() < 0.5
        df.loc[row, "patient_age"] = 0 if use_zero else np.random.randint(151, 200)

    # 2. ~4% of total_cost values become NaN
    cost_nan_mask = np.random.random(n) < 0.04
    df.loc[cost_nan_mask, "total_cost"] = np.nan

    # 3. int(n * 0.02) of the non-NaN costs are replaced by 999999
    still_valid = np.flatnonzero(~cost_nan_mask)
    extreme_rows = np.random.choice(still_valid, size=int(n * 0.02), replace=False)
    df.loc[extreme_rows, "total_cost"] = 999999.0

    # 4. ~1% of rows get admission and discharge dates swapped, and
    #    stay_days recomputed from the swapped pair (which makes it negative)
    date_swap_mask = np.random.random(n) < 0.01
    for row in np.flatnonzero(date_swap_mask):
        admitted = df.loc[row, "admission_date"]
        discharged = df.loc[row, "discharge_date"]
        df.loc[row, "admission_date"] = discharged
        df.loc[row, "discharge_date"] = admitted
        df.loc[row, "stay_days"] = int((admitted - discharged) / np.timedelta64(1, "D"))

    # Store both date columns as text
    for column in ("admission_date", "discharge_date"):
        df[column] = df[column].astype(str)

    return df

# ============================================================
# 第三部分：主函数 - 生成并保存数据
# ============================================================

def main():
    """Generate both datasets, save them as CSV and print a dirty-data report.

    Creates ``OUTPUT_DIR`` if needed, writes ``hospital_outpatient.csv`` and
    ``hospital_inpatient.csv`` into it (UTF-8 with BOM, no index column), and
    then prints how many rows of each injected anomaly type the in-memory
    frames contain, next to the percentage that was aimed for.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    banner = "=" * 60

    def save_csv(frame, label, filename):
        # Write one dataset to OUTPUT_DIR and echo its path and row count.
        target = os.path.join(OUTPUT_DIR, filename)
        frame.to_csv(target, index=False, encoding="utf-8-sig")
        print(f"{label}数据已保存至: {target}")
        print(f"  总行数: {len(frame)}")

    def report(label, count, total, expected_pct):
        # Print one report line: absolute count, share of rows, target share.
        print(f"  {label}: {count}条 ({count / total * 100:.2f}%) [预期约{expected_pct}%]")

    def is_nonstandard_date(value):
        # True unless the text is 10 characters long with '-' at offsets 4 and 7.
        text = str(value)
        return len(text) != 10 or text[4] != "-" or text[7] != "-"

    print(banner)
    print("开始生成医疗健康模拟数据...")
    print(banner)

    # --- Outpatient data ---
    print("\n正在生成门诊数据（约12000条）...")
    df_outpatient = generate_outpatient_data(n=12000)
    save_csv(df_outpatient, "门诊", "hospital_outpatient.csv")

    # --- Inpatient data ---
    print("\n正在生成住院数据（约6000条）...")
    df_inpatient = generate_inpatient_data(n=6000)
    save_csv(df_inpatient, "住院", "hospital_inpatient.csv")

    # --- Dirty-data report ---
    print("\n" + banner)
    print("脏数据验证报告")
    print(banner)

    print("\n【门诊数据 hospital_outpatient.csv】")
    total_out = len(df_outpatient)
    out_cost = df_outpatient["cost"]
    valid_depts = {"内科", "外科", "儿科", "妇产科", "急诊科"}

    report("visit_date格式异常",
           df_outpatient["visit_date"].apply(is_nonstandard_date).sum(), total_out, 5)
    report("cost缺失值(NaN)", out_cost.isna().sum(), total_out, 3)
    report("cost负数", (out_cost < 0).sum(), total_out, 2)
    # NaN is not a member of valid_depts, so missing departments count too
    report("department空值或别名",
           (~df_outpatient["department"].isin(valid_depts)).sum(), total_out, 2)
    report("重复visit_id", df_outpatient["visit_id"].duplicated().sum(), total_out, 1)

    print("\n【住院数据 hospital_inpatient.csv】")
    total_in = len(df_inpatient)
    in_age = df_inpatient["patient_age"]
    in_cost = df_inpatient["total_cost"]
    admitted = pd.to_datetime(df_inpatient["admission_date"], errors="coerce")
    discharged = pd.to_datetime(df_inpatient["discharge_date"], errors="coerce")

    report("patient_age异常值(0或>150)", ((in_age == 0) | (in_age > 150)).sum(), total_in, 3)
    report("total_cost缺失值(NaN)", in_cost.isna().sum(), total_in, 4)
    report("total_cost极端值(999999)", (in_cost == 999999.0).sum(), total_in, 2)
    report("出院日期早于入院日期", (discharged < admitted).sum(), total_in, 1)

    print("\n" + banner)
    print("数据生成完成！")
    print(banner)

# Run the generator only when this file is executed as a script
if __name__ == "__main__":
    main()
# 楚怡杯 (Chuyi Cup)