1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412
| """ 模拟数据生成脚本 为Python程序开发竞赛数据清洗模块生成4套模拟数据 """ import pandas as pd import numpy as np import random import os
OUTPUT_DIR = "in" os.makedirs(OUTPUT_DIR, exist_ok=True)
random.seed(42) np.random.seed(42)
# ============================================================ # 模拟题一:电商订单数据 # ============================================================ def generate_order_data(): """生成电商订单模拟数据(含脏数据)""" categories = { "手机数码": ["iPhone 14 Pro", "华为Mate50", "小米13", "OPPO Find X6", "vivo X90"], "电脑办公": ["联想小新Pro16", "MacBook Air M2", "戴尔灵越15", "华为MateBook D16"], "家用电器": ["美的空调", "格力冰箱", "海尔洗衣机", "苏泊尔电饭煲", "九阳豆浆机"], "服装鞋帽": ["Nike Air Max", "Adidas Ultraboost", "优衣库羽绒服", "李宁运动鞋"], "食品饮料": ["三只松鼠坚果", "蒙牛纯牛奶", "农夫山泉", "百事可乐", "良品铺子零食"], "美妆个护": ["兰蔻小黑瓶", "雅诗兰黛眼霜", "欧莱雅面膜", "资生堂洗面奶"], "图书文具": ["Python编程从入门到精通", "数据结构与算法", "晨光中性笔套装"], "家居家装": ["宜家书架", "全友家居沙发", "罗莱家纺四件套"], }
pay_methods = ["微信支付", "支付宝", "银行卡支付", "京东白条", "花呗"] statuses = ["已完成", "已完成", "已完成", "待发货", "已取消", "已退款", "待收货"] cities = ["湖南省长沙市岳麓区", "湖南省长沙市天心区", "湖南省株洲市天元区", "广东省深圳市南山区", "北京市朝阳区", "上海市浦东新区", "浙江省杭州市西湖区", "湖北省武汉市洪山区", "四川省成都市武侯区"]
weekdays = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"]
records = [] order_id = 1
for month in range(1, 13): for day in range(1, 29): n_orders = random.randint(3, 8) for _ in range(n_orders): cat = random.choice(list(categories.keys())) product = random.choice(categories[cat]) qty = random.randint(1, 10) price = round(random.uniform(5, 15000), 2) total = round(qty * price, 2) date_str = f"2023年{month:02d}月{day:02d}日" weekday = weekdays[random.randint(0, 6)] order_time = f"{date_str} {weekday}" user_id = f"U{random.randint(10000, 99999)}" order_no = f"DD2023{month:02d}{day:02d}-{order_id:04d}" pay = random.choice(pay_methods) status = random.choice(statuses) address = f"{random.choice(cities)}XX路{random.randint(1,200)}号"
records.append([order_no, order_time, user_id, product, cat, qty, price, total, pay, status, address]) order_id += 1
df = pd.DataFrame(records, columns=[ "订单编号", "下单时间", "用户ID", "商品名称", "商品类别", "购买数量", "单价(元)", "订单金额(元)", "支付方式", "订单状态", "收货地址" ])
# ===== 注入脏数据 =====
# 1. 注入缺失值(约5%的行) n_rows = len(df) for _ in range(int(n_rows * 0.05)): idx = random.randint(0, n_rows - 1) col = random.choice(["用户ID", "商品名称", "支付方式", "收货地址"]) df.at[idx, col] = np.nan
# 2. 注入日期格式错误 for _ in range(15): idx = random.randint(0, n_rows - 1) df.at[idx, "下单时间"] = random.choice([ "2023年02月30日 星期五", # 不存在的日期 "2023年十三月01日 星期一", # 不存在的月份 "日期格式错误", "2023/01/15", "N/A", ])
# 3. 注入异常值(负数、超大值) for _ in range(10): idx = random.randint(0, n_rows - 1) field = random.choice(["购买数量", "单价(元)", "订单金额(元)"]) if field == "购买数量": df.at[idx, field] = random.choice([-1, -5, -10, 150, 200, 500]) else: df.at[idx, field] = random.choice([-99.99, -1000, -5000])
# 4. 注入重复订单 dup_indices = random.sample(range(n_rows), 20) for idx in dup_indices: dup_row = df.iloc[idx].copy() df = pd.concat([df, pd.DataFrame([dup_row])], ignore_index=True)
# 5. 打乱数据顺序 df = df.sample(frac=1, random_state=42).reset_index(drop=True)
filepath = os.path.join(OUTPUT_DIR, "order_data.csv") df.to_csv(filepath, index=False, encoding="utf-8-sig") print(f"[✓] 模拟题一数据已生成: {filepath} ({len(df)} 条记录)") return df
# ============================================================ # 模拟题二:学生成绩数据 # ============================================================ def generate_student_score_data(): """生成学生成绩模拟数据(含脏数据)""" colleges = { "信息工程学院": { "软件技术": ["软工2101班", "软工2102班", "大数据2101班"], "计算机网络技术": ["网络2101班", "网络2102班"], }, "机电工程学院": { "机电一体化": ["机电2101班", "机电2102班"], "工业机器人": ["机器人2101班"], }, "经济管理学院": { "电子商务": ["电商2101班", "电商2102班"], "会计": ["会计2101班"], }, "建筑工程学院": { "建筑工程技术": ["建工2101班"], "工程造价": ["造价2101班"], }, }
subjects = ["Python程序开发", "数据结构与算法", "数据库原理", "Web前端开发", "计算机网络", "Java程序设计", "高等数学", "大学英语"] weekdays = ["周一", "周二", "周三", "周四", "周五", "周六", "周日"] exam_dates = [ "2023/01/10", "2023/03/15", "2023/06/20", "2023/06/21", "2023/09/12", "2023/12/25", "2024/01/08", "2024/03/20" ] levels = ["优秀", "良好", "良好", "中等", "中等", "及格", "不及格"]
surnames = ["张", "李", "王", "刘", "陈", "杨", "黄", "赵", "周", "吴", "徐", "孙", "胡", "朱", "高", "林", "何", "郭", "马", "罗"] given_names = ["伟", "芳", "娜", "秀英", "敏", "静", "丽", "强", "磊", "军", "洋", "勇", "艳", "杰", "娟", "涛", "明", "超", "秀兰", "霞"]
records = [] student_id = 20210101001
for college, majors in colleges.items(): for major, classes in majors.items(): for cls in classes: n_students = random.randint(30, 50) for _ in range(n_students): name = random.choice(surnames) + random.choice(given_names) for _ in range(random.randint(2, 5)): subject = random.choice(subjects) date = random.choice(exam_dates) weekday = weekdays[random.randint(0, 6)] exam_date = f"{date} {weekday}"
usual = random.randint(40, 100) midterm = random.randint(30, 100) final = random.randint(25, 100) total = round(usual * 0.2 + midterm * 0.3 + final * 0.5, 1)
if total >= 90: level = "优秀" elif total >= 80: level = "良好" elif total >= 70: level = "中等" elif total >= 60: level = "及格" else: level = "不及格"
records.append([ str(student_id), name, college, major, cls, exam_date, subject, usual, midterm, final, total, level ]) student_id += 1
df = pd.DataFrame(records, columns=[ "学号", "姓名", "学院", "专业", "班级", "考试日期", "科目名称", "平时成绩", "期中成绩", "期末成绩", "总评成绩", "等级" ])
# ===== 注入脏数据 ===== n_rows = len(df)
# 1. 缺失值 for _ in range(int(n_rows * 0.04)): idx = random.randint(0, n_rows - 1) col = random.choice(["姓名", "专业", "平时成绩", "期中成绩", "期末成绩"]) df.loc[idx, col] = pd.NA
# 2. 日期格式错误 for _ in range(12): idx = random.randint(0, n_rows - 1) df.at[idx, "考试日期"] = random.choice([ "2023/02/30 周四", # 不存在的日期 "2023/13/01 周一", # 不存在的月份 "日期错误", "2023-06-20", "未知", ])
# 3. 异常值 for _ in range(15): idx = random.randint(0, n_rows - 1) field = random.choice(["平时成绩", "期中成绩", "期末成绩", "总评成绩"]) if field == "总评成绩": df.at[idx, field] = random.choice([-10, -5.5, -20]) else: df.at[idx, field] = random.choice([-5, -10, 105, 120, 150, -20])
# 4. 重复记录 dup_indices = random.sample(range(n_rows), 25) for idx in dup_indices: dup_row = df.iloc[idx].copy() df = pd.concat([df, pd.DataFrame([dup_row])], ignore_index=True)
# 5. 打乱顺序 df = df.sample(frac=1, random_state=42).reset_index(drop=True)
filepath = os.path.join(OUTPUT_DIR, "student_score.csv") df.to_csv(filepath, index=False, encoding="utf-8-sig") print(f"[✓] 模拟题二数据已生成: {filepath} ({len(df)} 条记录)") return df
# ============================================================ # 练习题三:多文件销售数据 # ============================================================ def generate_multi_file_sales_data(): """生成6个月的超市销售数据文件""" months_info = [ ("01", "一月"), ("02", "二月"), ("03", "三月"), ("04", "四月"), ("05", "五月"), ("06", "六月"), ]
stores = [ ("S001", "长沙岳麓店"), ("S002", "长沙天心店"), ("S003", "株洲中心店"), ("S004", "湘潭雨湖店"), ("S005", "衡阳蒸湘店"), ]
product_categories = { "食品饮料": ["可口可乐330ml", "百事可乐500ml", "农夫山泉550ml", "蒙牛纯牛奶250ml", "康师傅方便面", "三只松鼠坚果", "良品铺子零食", "旺旺雪饼"], "日用百货": ["维达抽纸", "蓝月亮洗衣液", "舒肤佳香皂", "佳洁士牙膏", "清风卷纸", "立白洗洁精"], "生鲜水果": ["红富士苹果", "海南香蕉", "广西砂糖橘", "山东大葱", "本地白菜", "新鲜草莓"], "酒水茶饮": ["青岛啤酒500ml", "长城干红", "西湖龙井", "金典有机奶"], }
records_all = {} for month_num, month_name in months_info: records = [] days_in_month = [31, 28, 31, 30, 31, 30][int(month_num) - 1]
for day in range(1, days_in_month + 1): if random.random() < 0.1: # 10%的天数没有数据 continue date_str = f"2023-{month_num}-{day:02d}" n_transactions = random.randint(5, 20)
for _ in range(n_transactions): store_id, store_name = random.choice(stores) cat = random.choice(list(product_categories.keys())) product = random.choice(product_categories[cat]) qty = random.randint(1, 30) price = round(random.uniform(2, 200), 2) amount = round(qty * price, 2)
records.append([date_str, store_id, store_name, cat, product, qty, amount, month_name])
df = pd.DataFrame(records, columns=[ "日期", "门店编号", "门店名称", "商品类别", "商品名称", "销售数量", "销售金额", "月份" ])
# 注入脏数据 n = len(df) if n > 0: # 缺失值 for _ in range(max(1, int(n * 0.03))): idx = random.randint(0, n - 1) col = random.choice(["商品名称", "销售数量", "销售金额"]) df.at[idx, col] = pd.NA
# 异常值 for _ in range(max(1, int(n * 0.02))): idx = random.randint(0, n - 1) if random.random() < 0.5: df.at[idx, "销售数量"] = random.choice([-5, -10, -1]) else: df.at[idx, "销售金额"] = random.choice([-50.5, -100, -20])
# 重复行 if n > 5: dup_indices = random.sample(range(n), min(5, n)) dup_rows = df.iloc[dup_indices] df = pd.concat([df, dup_rows], ignore_index=True)
filepath = os.path.join(OUTPUT_DIR, f"sales_2023_{month_num}.csv") df.to_csv(filepath, index=False, encoding="utf-8-sig") records_all[month_num] = len(df) print(f"[✓] 练习题三数据已生成: {filepath} ({len(df)} 条记录)")
return records_all
# ============================================================ # 练习题四:中文数字数据 # ============================================================ def chinese_num(n): """将阿拉伯数字转为中文数字(用于生成数据)""" if n == 0: return "零" units = ["", "十", "百", "千", "万", "十", "百", "千", "亿"] digits = ["零", "一", "二", "三", "四", "五", "六", "七", "八", "九"]
result = "" s = str(n) length = len(s) for i, ch in enumerate(s): d = int(ch) pos = length - i - 1 if d == 0: if result and result[-1] != "零": result += "零" else: result += digits[d] + units[pos] # 清理多余的零 result = result.replace("零零", "零") if result.endswith("零"): result = result[:-1] # 处理"一十"开头的简写 if result.startswith("一十"): result = result[1:] # "一十二" -> "十二" return result
def generate_chinese_number_data(): """生成含中文数字的数据""" projects = [ "城市道路改造工程", "智慧校园建设", "图书馆数字化升级", "校园网络改造", "实训基地建设", "体育馆翻新工程", "学生宿舍楼建设", "教学楼节能改造", "校园安防系统升级", "实验室设备采购", "绿化景观工程", "供水管网改造", ]
records = [] for i, project in enumerate(projects): # 生成各种范围的金额(万元) contract = random.choice([50, 80, 100, 120, 150, 200, 250, 300, 350, 500, 800, 1000, 1500, 2000]) actual = random.randint(int(contract * 0.6), int(contract * 1.05)) days = random.choice([30, 45, 60, 90, 120, 150, 180, 200, 240, 365]) people = random.choice([8, 10, 12, 15, 20, 25, 30, 35, 40, 45, 50, 60, 80, 100])
contract_cn = chinese_num(contract) + "万元" actual_cn = chinese_num(actual) + "万元" days_cn = chinese_num(days) + "天" people_cn = chinese_num(people) + "人"
records.append([ f"R{i+1:03d}", project, contract_cn, actual_cn, days_cn, people_cn ])
df = pd.DataFrame(records, columns=[ "编号", "项目名称", "合同金额(中文)", "实际支出(中文)", "工期(中文)", "参与人数(中文)" ])
# 注入少量缺失值 n = len(df) for _ in range(2): idx = random.randint(0, n - 1) col = random.choice(["合同金额(中文)", "工期(中文)"]) df.at[idx, col] = np.nan
filepath = os.path.join(OUTPUT_DIR, "chinese_number_data.csv") df.to_csv(filepath, index=False, encoding="utf-8-sig") print(f"[✓] 练习题四数据已生成: {filepath} ({len(df)} 条记录)") return df
# ============================================================ # 主函数 # ============================================================ if __name__ == "__main__": print("=" * 60) print("Python程序开发竞赛 - 数据清洗模块模拟数据生成") print("=" * 60) print()
generate_order_data() print() generate_student_score_data() print() generate_multi_file_sales_data() print() generate_chinese_number_data()
print() print("=" * 60) print("所有模拟数据生成完毕!") print(f"数据保存目录: {OUTPUT_DIR}") print("=" * 60)
|