数据清洗模块模拟训练题1-数据生成

1. generate_mock_data
generate_mock_data

"""
模拟数据生成脚本
为Python程序开发竞赛数据清洗模块生成4套模拟数据
"""
import pandas as pd
import numpy as np
import random
import os

# Directory that receives every generated CSV file; created at import time
# if it does not exist yet.
OUTPUT_DIR = "in"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Seed both the stdlib and NumPy generators with the same fixed value so
# repeated runs draw the same random sequence.
_SEED = 42
random.seed(_SEED)
np.random.seed(_SEED)

# ============================================================
# 模拟题一：电商订单数据
# ============================================================
def generate_order_data():
    """Generate the mock e-commerce order dataset (with dirty data) as CSV.

    For days 1-28 of every month of 2023, between 3 and 8 orders are
    created. The clean table is then deliberately corrupted with missing
    values, malformed dates, out-of-range numbers and duplicated rows, and
    finally shuffled. The result is written to ``order_data.csv`` inside
    ``OUTPUT_DIR`` (encoded as utf-8-sig) and a summary line is printed.

    Returns:
        pandas.DataFrame: The shuffled, corrupted order table that was
        written to disk.
    """
    categories = {
        "手机数码": ["iPhone 14 Pro", "华为Mate50", "小米13", "OPPO Find X6", "vivo X90"],
        "电脑办公": ["联想小新Pro16", "MacBook Air M2", "戴尔灵越15", "华为MateBook D16"],
        "家用电器": ["美的空调", "格力冰箱", "海尔洗衣机", "苏泊尔电饭煲", "九阳豆浆机"],
        "服装鞋帽": ["Nike Air Max", "Adidas Ultraboost", "优衣库羽绒服", "李宁运动鞋"],
        "食品饮料": ["三只松鼠坚果", "蒙牛纯牛奶", "农夫山泉", "百事可乐", "良品铺子零食"],
        "美妆个护": ["兰蔻小黑瓶", "雅诗兰黛眼霜", "欧莱雅面膜", "资生堂洗面奶"],
        "图书文具": ["Python编程从入门到精通", "数据结构与算法", "晨光中性笔套装"],
        "家居家装": ["宜家书架", "全友家居沙发", "罗莱家纺四件套"],
    }
    # Built once: the category list does not change between orders.
    category_names = list(categories)

    pay_methods = ["微信支付", "支付宝", "银行卡支付", "京东白条", "花呗"]
    # "已完成" is listed three times so it is drawn more often than the rest.
    statuses = ["已完成", "已完成", "已完成", "待发货", "已取消", "已退款", "待收货"]
    cities = ["湖南省长沙市岳麓区", "湖南省长沙市天心区", "湖南省株洲市天元区",
              "广东省深圳市南山区", "北京市朝阳区", "上海市浦东新区",
              "浙江省杭州市西湖区", "湖北省武汉市洪山区", "四川省成都市武侯区"]

    weekdays = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"]

    records = []
    order_id = 1

    for month in range(1, 13):
        # Only days 1-28 are generated, so every clean date exists in
        # every month.
        for day in range(1, 29):
            n_orders = random.randint(3, 8)
            for _ in range(n_orders):
                cat = random.choice(category_names)
                product = random.choice(categories[cat])
                qty = random.randint(1, 10)
                price = round(random.uniform(5, 15000), 2)
                total = round(qty * price, 2)
                date_str = f"2023年{month:02d}月{day:02d}日"
                # The weekday label is drawn at random; it is not derived
                # from the calendar date.
                weekday = weekdays[random.randint(0, 6)]
                order_time = f"{date_str} {weekday}"
                user_id = f"U{random.randint(10000, 99999)}"
                order_no = f"DD2023{month:02d}{day:02d}-{order_id:04d}"
                pay = random.choice(pay_methods)
                status = random.choice(statuses)
                address = f"{random.choice(cities)}XX路{random.randint(1,200)}号"

                records.append([order_no, order_time, user_id, product, cat,
                                qty, price, total, pay, status, address])
                order_id += 1

    df = pd.DataFrame(records, columns=[
        "订单编号", "下单时间", "用户ID", "商品名称", "商品类别",
        "购买数量", "单价(元)", "订单金额(元)", "支付方式", "订单状态", "收货地址"
    ])

    # ===== Inject dirty data =====

    # 1. Missing values: one cell blanked per draw, with draws equal to 5%
    #    of the row count (the same cell may be hit more than once).
    n_rows = len(df)
    for _ in range(int(n_rows * 0.05)):
        idx = random.randint(0, n_rows - 1)
        col = random.choice(["用户ID", "商品名称", "支付方式", "收货地址"])
        df.at[idx, col] = np.nan

    # 2. Malformed order timestamps.
    for _ in range(15):
        idx = random.randint(0, n_rows - 1)
        df.at[idx, "下单时间"] = random.choice([
            "2023年02月30日 星期五",  # a date that does not exist
            "2023年十三月01日 星期一",  # a month that does not exist
            "日期格式错误",
            "2023/01/15",
            "N/A",
        ])

    # 3. Outliers: negative or implausibly large quantities, negative money.
    for _ in range(10):
        idx = random.randint(0, n_rows - 1)
        field = random.choice(["购买数量", "单价(元)", "订单金额(元)"])
        if field == "购买数量":
            df.at[idx, field] = random.choice([-1, -5, -10, 150, 200, 500])
        else:
            df.at[idx, field] = random.choice([-99.99, -1000, -5000])

    # 4. Duplicate orders: copy 20 distinct original rows in one concat so
    #    the frame is not rebuilt once per duplicated row and column dtypes
    #    are carried over unchanged.
    dup_indices = random.sample(range(n_rows), 20)
    df = pd.concat([df, df.iloc[dup_indices]], ignore_index=True)

    # 5. Shuffle the row order (fixed random_state for reproducibility).
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    filepath = os.path.join(OUTPUT_DIR, "order_data.csv")
    df.to_csv(filepath, index=False, encoding="utf-8-sig")
    print(f"[✓] 模拟题一数据已生成: {filepath} ({len(df)} 条记录)")
    return df

# ============================================================
# 模拟题二：学生成绩数据
# ============================================================
def generate_student_score_data():
    """Generate the mock student score dataset (with dirty data) as CSV.

    Every class receives 30-50 students and every student 2-5 exam records.
    Each record carries a usual, midterm and final score; the overall score
    is ``usual * 0.2 + midterm * 0.3 + final * 0.5`` rounded to one decimal,
    and the level is derived from it (>=90, >=80, >=70, >=60, otherwise
    failing). The clean table is then corrupted with missing values,
    malformed dates, out-of-range scores and duplicated rows, shuffled, and
    written to ``student_score.csv`` inside ``OUTPUT_DIR`` (utf-8-sig).

    Returns:
        pandas.DataFrame: The shuffled, corrupted score table that was
        written to disk.
    """
    colleges = {
        "信息工程学院": {
            "软件技术": ["软工2101班", "软工2102班", "大数据2101班"],
            "计算机网络技术": ["网络2101班", "网络2102班"],
        },
        "机电工程学院": {
            "机电一体化": ["机电2101班", "机电2102班"],
            "工业机器人": ["机器人2101班"],
        },
        "经济管理学院": {
            "电子商务": ["电商2101班", "电商2102班"],
            "会计": ["会计2101班"],
        },
        "建筑工程学院": {
            "建筑工程技术": ["建工2101班"],
            "工程造价": ["造价2101班"],
        },
    }

    subjects = ["Python程序开发", "数据结构与算法", "数据库原理", "Web前端开发",
                "计算机网络", "Java程序设计", "高等数学", "大学英语"]
    weekdays = ["周一", "周二", "周三", "周四", "周五", "周六", "周日"]
    exam_dates = [
        "2023/01/10", "2023/03/15", "2023/06/20", "2023/06/21",
        "2023/09/12", "2023/12/25", "2024/01/08", "2024/03/20"
    ]

    surnames = ["张", "李", "王", "刘", "陈", "杨", "黄", "赵", "周", "吴",
                "徐", "孙", "胡", "朱", "高", "林", "何", "郭", "马", "罗"]
    given_names = ["伟", "芳", "娜", "秀英", "敏", "静", "丽", "强", "磊", "军",
                   "洋", "勇", "艳", "杰", "娟", "涛", "明", "超", "秀兰", "霞"]

    records = []
    # Student numbers are handed out sequentially across all classes.
    student_id = 20210101001

    for college, majors in colleges.items():
        for major, classes in majors.items():
            for cls in classes:
                n_students = random.randint(30, 50)
                for _student in range(n_students):
                    name = random.choice(surnames) + random.choice(given_names)
                    for _exam in range(random.randint(2, 5)):
                        # Subjects are drawn independently, so one student
                        # may receive the same subject more than once.
                        subject = random.choice(subjects)
                        date = random.choice(exam_dates)
                        # The weekday label is drawn at random; it is not
                        # derived from the exam date.
                        weekday = weekdays[random.randint(0, 6)]
                        exam_date = f"{date} {weekday}"

                        usual = random.randint(40, 100)
                        midterm = random.randint(30, 100)
                        final = random.randint(25, 100)
                        total = round(usual * 0.2 + midterm * 0.3 + final * 0.5, 1)

                        if total >= 90:
                            level = "优秀"
                        elif total >= 80:
                            level = "良好"
                        elif total >= 70:
                            level = "中等"
                        elif total >= 60:
                            level = "及格"
                        else:
                            level = "不及格"

                        records.append([
                            str(student_id), name, college, major, cls,
                            exam_date, subject, usual, midterm, final, total, level
                        ])
                    student_id += 1

    df = pd.DataFrame(records, columns=[
        "学号", "姓名", "学院", "专业", "班级",
        "考试日期", "科目名称", "平时成绩", "期中成绩", "期末成绩", "总评成绩", "等级"
    ])

    # ===== Inject dirty data =====
    n_rows = len(df)

    # 1. Missing values: one cell blanked per draw, with draws equal to 4%
    #    of the row count.
    for _ in range(int(n_rows * 0.04)):
        idx = random.randint(0, n_rows - 1)
        col = random.choice(["姓名", "专业", "平时成绩", "期中成绩", "期末成绩"])
        df.loc[idx, col] = pd.NA

    # 2. Malformed exam dates.
    for _ in range(12):
        idx = random.randint(0, n_rows - 1)
        df.at[idx, "考试日期"] = random.choice([
            "2023/02/30 周四",  # a date that does not exist
            "2023/13/01 周一",  # a month that does not exist
            "日期错误",
            "2023-06-20",
            "未知",
        ])

    # 3. Outliers: scores below 0 or above 100. The level column is left
    #    untouched, so it no longer matches a corrupted overall score.
    for _ in range(15):
        idx = random.randint(0, n_rows - 1)
        field = random.choice(["平时成绩", "期中成绩", "期末成绩", "总评成绩"])
        if field == "总评成绩":
            df.at[idx, field] = random.choice([-10, -5.5, -20])
        else:
            df.at[idx, field] = random.choice([-5, -10, 105, 120, 150, -20])

    # 4. Duplicate records: copy 25 distinct original rows in one concat so
    #    the frame is not rebuilt once per duplicated row and column dtypes
    #    are carried over unchanged.
    dup_indices = random.sample(range(n_rows), 25)
    df = pd.concat([df, df.iloc[dup_indices]], ignore_index=True)

    # 5. Shuffle the row order (fixed random_state for reproducibility).
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    filepath = os.path.join(OUTPUT_DIR, "student_score.csv")
    df.to_csv(filepath, index=False, encoding="utf-8-sig")
    print(f"[✓] 模拟题二数据已生成: {filepath} ({len(df)} 条记录)")
    return df

# ============================================================
# 练习题三：多文件销售数据
# ============================================================
def generate_multi_file_sales_data():
    """Write six monthly supermarket sales CSV files (January-June 2023).

    One file named ``sales_2023_<MM>.csv`` is written to ``OUTPUT_DIR`` per
    month. Roughly 10% of the days are skipped; every remaining day gets
    5-20 transactions. Each month's table then receives missing values,
    negative outliers and (when it has more than five rows) five duplicated
    rows before it is saved as utf-8-sig and a summary line is printed.

    Returns:
        dict: Maps each month number string ("01".."06") to the number of
        rows written to that month's file.
    """
    months_info = [
        ("01", "一月"), ("02", "二月"), ("03", "三月"),
        ("04", "四月"), ("05", "五月"), ("06", "六月"),
    ]

    stores = [
        ("S001", "长沙岳麓店"), ("S002", "长沙天心店"), ("S003", "株洲中心店"),
        ("S004", "湘潭雨湖店"), ("S005", "衡阳蒸湘店"),
    ]

    product_categories = {
        "食品饮料": ["可口可乐330ml", "百事可乐500ml", "农夫山泉550ml", "蒙牛纯牛奶250ml",
                     "康师傅方便面", "三只松鼠坚果", "良品铺子零食", "旺旺雪饼"],
        "日用百货": ["维达抽纸", "蓝月亮洗衣液", "舒肤佳香皂", "佳洁士牙膏",
                     "清风卷纸", "立白洗洁精"],
        "生鲜水果": ["红富士苹果", "海南香蕉", "广西砂糖橘", "山东大葱",
                     "本地白菜", "新鲜草莓"],
        "酒水茶饮": ["青岛啤酒500ml", "长城干红", "西湖龙井", "金典有机奶"],
    }
    category_names = list(product_categories.keys())

    # Day counts for January..June of 2023, indexed by month number - 1.
    month_lengths = [31, 28, 31, 30, 31, 30]
    column_names = [
        "日期", "门店编号", "门店名称", "商品类别", "商品名称",
        "销售数量", "销售金额", "月份"
    ]

    row_counts = {}
    for month_num, month_name in months_info:
        rows = []
        last_day = month_lengths[int(month_num) - 1]

        for day in range(1, last_day + 1):
            # About one day in ten has no sales data at all.
            if random.random() < 0.1:
                continue
            sale_date = f"2023-{month_num}-{day:02d}"

            for _ in range(random.randint(5, 20)):
                store_id, store_name = random.choice(stores)
                category = random.choice(category_names)
                item = random.choice(product_categories[category])
                quantity = random.randint(1, 30)
                unit_price = round(random.uniform(2, 200), 2)
                # Only the line total is stored; the unit price is not a column.
                line_total = round(quantity * unit_price, 2)

                rows.append([sale_date, store_id, store_name, category, item,
                             quantity, line_total, month_name])

        frame = pd.DataFrame(rows, columns=column_names)

        # Inject dirty data (skipped entirely for an empty month).
        row_total = len(frame)
        if row_total > 0:
            # Missing values: at least one, otherwise 3% of the rows.
            missing_draws = max(1, int(row_total * 0.03))
            for _ in range(missing_draws):
                target_row = random.randint(0, row_total - 1)
                target_col = random.choice(["商品名称", "销售数量", "销售金额"])
                frame.at[target_row, target_col] = pd.NA

            # Negative outliers: at least one, otherwise 2% of the rows,
            # split evenly between quantity and amount.
            outlier_draws = max(1, int(row_total * 0.02))
            for _ in range(outlier_draws):
                target_row = random.randint(0, row_total - 1)
                if random.random() < 0.5:
                    target_col, bad_values = "销售数量", [-5, -10, -1]
                else:
                    target_col, bad_values = "销售金额", [-50.5, -100, -20]
                frame.at[target_row, target_col] = random.choice(bad_values)

            # Duplicate rows: five distinct rows are appended again.
            if row_total > 5:
                picked = random.sample(range(row_total), 5)
                frame = pd.concat([frame, frame.iloc[picked]], ignore_index=True)

        filepath = os.path.join(OUTPUT_DIR, f"sales_2023_{month_num}.csv")
        frame.to_csv(filepath, index=False, encoding="utf-8-sig")
        row_counts[month_num] = len(frame)
        print(f"[✓] 练习题三数据已生成: {filepath} ({len(frame)} 条记录)")

    return row_counts

# ============================================================
# 练习题四：中文数字数据
# ============================================================
def _section_to_chinese(section):
    """Spell a value in 1..9999 with 千/百/十 place markers."""
    digits = "零一二三四五六七八九"
    place_units = ("千", "百", "十", "")

    text = ""
    gap = False  # a zero digit was skipped after some non-zero digit
    for digit_char, unit in zip(f"{section:04d}", place_units):
        digit = int(digit_char)
        if digit == 0:
            gap = bool(text)
            continue
        if gap:
            # One "零" stands in for any run of interior zeros.
            text += "零"
            gap = False
        text += digits[digit] + unit
    return text


def chinese_num(n):
    """Convert a non-negative integer to its Chinese numeral spelling.

    The number is converted in groups of four digits, each followed by its
    group marker (万, 亿, 万亿), so the marker is kept even when the lowest
    digit of a group is zero (100000 -> "十万"). A leading "一十" is
    shortened to "十" (12 -> "十二").

    Args:
        n: Non-negative integer (any value whose ``str()`` is a plain
            decimal integer, e.g. a NumPy integer, is accepted).

    Returns:
        str: The Chinese numeral, e.g. ``"一千零五"`` for 1005.

    Raises:
        ValueError: If ``n`` is negative, is not an integer, or needs more
            than 16 digits.
    """
    if n == 0:
        return "零"

    # Parse via the decimal string so integer-like values are accepted and
    # non-integers such as 1.5 raise ValueError.
    value = int(str(n))
    if value < 0:
        raise ValueError("chinese_num expects a non-negative integer")

    group_units = ("", "万", "亿", "万亿")
    groups = []  # four-digit groups, least significant first
    while value:
        value, low = divmod(value, 10000)
        groups.append(low)
    if len(groups) > len(group_units):
        raise ValueError("chinese_num supports at most 16 digits")

    result = ""
    skipped_group = False
    for index in range(len(groups) - 1, -1, -1):
        group = groups[index]
        if group == 0:
            skipped_group = True
            continue
        # A "零" is needed when a whole group was skipped or when this
        # group does not fill its thousands place (10001 -> "一万零一").
        if result and (skipped_group or group < 1000):
            result += "零"
        skipped_group = False
        result += _section_to_chinese(group) + group_units[index]

    # Colloquial short form for a leading ten: "一十二" -> "十二".
    if result.startswith("一十"):
        result = result[1:]
    return result

def generate_chinese_number_data():
    """Generate the project dataset whose amounts are spelled in Chinese.

    One row is produced per project: an id (R001, R002, ...), the project
    name, and the contract amount, actual spend, duration and head count,
    each rendered through ``chinese_num`` with a unit suffix. Two cells are
    then blanked at random, the table is written to
    ``chinese_number_data.csv`` inside ``OUTPUT_DIR`` (utf-8-sig), and a
    summary line is printed.

    Returns:
        pandas.DataFrame: The table that was written to disk.
    """
    projects = [
        "城市道路改造工程", "智慧校园建设", "图书馆数字化升级",
        "校园网络改造", "实训基地建设", "体育馆翻新工程",
        "学生宿舍楼建设", "教学楼节能改造", "校园安防系统升级",
        "实验室设备采购", "绿化景观工程", "供水管网改造",
    ]
    # Candidate values; money amounts are in units of 万元.
    contract_options = [50, 80, 100, 120, 150, 200, 250, 300, 350, 500, 800, 1000, 1500, 2000]
    duration_options = [30, 45, 60, 90, 120, 150, 180, 200, 240, 365]
    headcount_options = [8, 10, 12, 15, 20, 25, 30, 35, 40, 45, 50, 60, 80, 100]

    rows = []
    for seq, project_name in enumerate(projects, start=1):
        contract_amount = random.choice(contract_options)
        # Actual spend falls between 60% and 105% of the contract amount.
        spend_low = int(contract_amount * 0.6)
        spend_high = int(contract_amount * 1.05)
        actual_spend = random.randint(spend_low, spend_high)
        duration = random.choice(duration_options)
        headcount = random.choice(headcount_options)

        rows.append([
            f"R{seq:03d}",
            project_name,
            chinese_num(contract_amount) + "万元",
            chinese_num(actual_spend) + "万元",
            chinese_num(duration) + "天",
            chinese_num(headcount) + "人",
        ])

    column_names = [
        "编号", "项目名称", "合同金额(中文)", "实际支出(中文)", "工期(中文)", "参与人数(中文)"
    ]
    table = pd.DataFrame(rows, columns=column_names)

    # Blank out two cells (the same cell may be picked twice).
    row_total = len(table)
    for _ in range(2):
        target_row = random.randint(0, row_total - 1)
        target_col = random.choice(["合同金额(中文)", "工期(中文)"])
        table.at[target_row, target_col] = np.nan

    filepath = os.path.join(OUTPUT_DIR, "chinese_number_data.csv")
    table.to_csv(filepath, index=False, encoding="utf-8-sig")
    print(f"[✓] 练习题四数据已生成: {filepath} ({len(table)} 条记录)")
    return table

# ============================================================
# 主函数
# ============================================================
if __name__ == "__main__":
    # Script entry point: run every generator in order, separated by blank
    # lines and framed by a banner.
    banner = "=" * 60
    print(banner)
    print("Python程序开发竞赛 - 数据清洗模块模拟数据生成")
    print(banner)

    generators = (
        generate_order_data,
        generate_student_score_data,
        generate_multi_file_sales_data,
        generate_chinese_number_data,
    )
    for generate in generators:
        print()
        generate()

    print()
    print(banner)
    print("所有模拟数据生成完毕！")
    print(f"数据保存目录: {OUTPUT_DIR}")
    print(banner)
楚怡杯

generate_mock_data