数据清洗模块模拟训练题1-数据生成

  • ~19.10K 字
  1. 1. generate_mock_data

generate_mock_data

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
"""
模拟数据生成脚本
为Python程序开发竞赛数据清洗模块生成4套模拟数据
"""
import pandas as pd
import numpy as np
import random
import os

OUTPUT_DIR = "in"
os.makedirs(OUTPUT_DIR, exist_ok=True)

random.seed(42)
np.random.seed(42)

# ============================================================
# 模拟题一:电商订单数据
# ============================================================
def generate_order_data():
"""生成电商订单模拟数据(含脏数据)"""
categories = {
"手机数码": ["iPhone 14 Pro", "华为Mate50", "小米13", "OPPO Find X6", "vivo X90"],
"电脑办公": ["联想小新Pro16", "MacBook Air M2", "戴尔灵越15", "华为MateBook D16"],
"家用电器": ["美的空调", "格力冰箱", "海尔洗衣机", "苏泊尔电饭煲", "九阳豆浆机"],
"服装鞋帽": ["Nike Air Max", "Adidas Ultraboost", "优衣库羽绒服", "李宁运动鞋"],
"食品饮料": ["三只松鼠坚果", "蒙牛纯牛奶", "农夫山泉", "百事可乐", "良品铺子零食"],
"美妆个护": ["兰蔻小黑瓶", "雅诗兰黛眼霜", "欧莱雅面膜", "资生堂洗面奶"],
"图书文具": ["Python编程从入门到精通", "数据结构与算法", "晨光中性笔套装"],
"家居家装": ["宜家书架", "全友家居沙发", "罗莱家纺四件套"],
}

pay_methods = ["微信支付", "支付宝", "银行卡支付", "京东白条", "花呗"]
statuses = ["已完成", "已完成", "已完成", "待发货", "已取消", "已退款", "待收货"]
cities = ["湖南省长沙市岳麓区", "湖南省长沙市天心区", "湖南省株洲市天元区",
"广东省深圳市南山区", "北京市朝阳区", "上海市浦东新区",
"浙江省杭州市西湖区", "湖北省武汉市洪山区", "四川省成都市武侯区"]

weekdays = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"]

records = []
order_id = 1

for month in range(1, 13):
for day in range(1, 29):
n_orders = random.randint(3, 8)
for _ in range(n_orders):
cat = random.choice(list(categories.keys()))
product = random.choice(categories[cat])
qty = random.randint(1, 10)
price = round(random.uniform(5, 15000), 2)
total = round(qty * price, 2)
date_str = f"2023年{month:02d}月{day:02d}日"
weekday = weekdays[random.randint(0, 6)]
order_time = f"{date_str} {weekday}"
user_id = f"U{random.randint(10000, 99999)}"
order_no = f"DD2023{month:02d}{day:02d}-{order_id:04d}"
pay = random.choice(pay_methods)
status = random.choice(statuses)
address = f"{random.choice(cities)}XX路{random.randint(1,200)}号"

records.append([order_no, order_time, user_id, product, cat,
qty, price, total, pay, status, address])
order_id += 1

df = pd.DataFrame(records, columns=[
"订单编号", "下单时间", "用户ID", "商品名称", "商品类别",
"购买数量", "单价(元)", "订单金额(元)", "支付方式", "订单状态", "收货地址"
])

# ===== 注入脏数据 =====

# 1. 注入缺失值(约5%的行)
n_rows = len(df)
for _ in range(int(n_rows * 0.05)):
idx = random.randint(0, n_rows - 1)
col = random.choice(["用户ID", "商品名称", "支付方式", "收货地址"])
df.at[idx, col] = np.nan

# 2. 注入日期格式错误
for _ in range(15):
idx = random.randint(0, n_rows - 1)
df.at[idx, "下单时间"] = random.choice([
"2023年02月30日 星期五", # 不存在的日期
"2023年十三月01日 星期一", # 不存在的月份
"日期格式错误",
"2023/01/15",
"N/A",
])

# 3. 注入异常值(负数、超大值)
for _ in range(10):
idx = random.randint(0, n_rows - 1)
field = random.choice(["购买数量", "单价(元)", "订单金额(元)"])
if field == "购买数量":
df.at[idx, field] = random.choice([-1, -5, -10, 150, 200, 500])
else:
df.at[idx, field] = random.choice([-99.99, -1000, -5000])

# 4. 注入重复订单
dup_indices = random.sample(range(n_rows), 20)
for idx in dup_indices:
dup_row = df.iloc[idx].copy()
df = pd.concat([df, pd.DataFrame([dup_row])], ignore_index=True)

# 5. 打乱数据顺序
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

filepath = os.path.join(OUTPUT_DIR, "order_data.csv")
df.to_csv(filepath, index=False, encoding="utf-8-sig")
print(f"[✓] 模拟题一数据已生成: {filepath} ({len(df)} 条记录)")
return df

# ============================================================
# 模拟题二:学生成绩数据
# ============================================================
def generate_student_score_data():
"""生成学生成绩模拟数据(含脏数据)"""
colleges = {
"信息工程学院": {
"软件技术": ["软工2101班", "软工2102班", "大数据2101班"],
"计算机网络技术": ["网络2101班", "网络2102班"],
},
"机电工程学院": {
"机电一体化": ["机电2101班", "机电2102班"],
"工业机器人": ["机器人2101班"],
},
"经济管理学院": {
"电子商务": ["电商2101班", "电商2102班"],
"会计": ["会计2101班"],
},
"建筑工程学院": {
"建筑工程技术": ["建工2101班"],
"工程造价": ["造价2101班"],
},
}

subjects = ["Python程序开发", "数据结构与算法", "数据库原理", "Web前端开发",
"计算机网络", "Java程序设计", "高等数学", "大学英语"]
weekdays = ["周一", "周二", "周三", "周四", "周五", "周六", "周日"]
exam_dates = [
"2023/01/10", "2023/03/15", "2023/06/20", "2023/06/21",
"2023/09/12", "2023/12/25", "2024/01/08", "2024/03/20"
]
levels = ["优秀", "良好", "良好", "中等", "中等", "及格", "不及格"]

surnames = ["张", "李", "王", "刘", "陈", "杨", "黄", "赵", "周", "吴",
"徐", "孙", "胡", "朱", "高", "林", "何", "郭", "马", "罗"]
given_names = ["伟", "芳", "娜", "秀英", "敏", "静", "丽", "强", "磊", "军",
"洋", "勇", "艳", "杰", "娟", "涛", "明", "超", "秀兰", "霞"]

records = []
student_id = 20210101001

for college, majors in colleges.items():
for major, classes in majors.items():
for cls in classes:
n_students = random.randint(30, 50)
for _ in range(n_students):
name = random.choice(surnames) + random.choice(given_names)
for _ in range(random.randint(2, 5)):
subject = random.choice(subjects)
date = random.choice(exam_dates)
weekday = weekdays[random.randint(0, 6)]
exam_date = f"{date} {weekday}"

usual = random.randint(40, 100)
midterm = random.randint(30, 100)
final = random.randint(25, 100)
total = round(usual * 0.2 + midterm * 0.3 + final * 0.5, 1)

if total >= 90:
level = "优秀"
elif total >= 80:
level = "良好"
elif total >= 70:
level = "中等"
elif total >= 60:
level = "及格"
else:
level = "不及格"

records.append([
str(student_id), name, college, major, cls,
exam_date, subject, usual, midterm, final, total, level
])
student_id += 1

df = pd.DataFrame(records, columns=[
"学号", "姓名", "学院", "专业", "班级",
"考试日期", "科目名称", "平时成绩", "期中成绩", "期末成绩", "总评成绩", "等级"
])

# ===== 注入脏数据 =====
n_rows = len(df)

# 1. 缺失值
for _ in range(int(n_rows * 0.04)):
idx = random.randint(0, n_rows - 1)
col = random.choice(["姓名", "专业", "平时成绩", "期中成绩", "期末成绩"])
df.loc[idx, col] = pd.NA

# 2. 日期格式错误
for _ in range(12):
idx = random.randint(0, n_rows - 1)
df.at[idx, "考试日期"] = random.choice([
"2023/02/30 周四", # 不存在的日期
"2023/13/01 周一", # 不存在的月份
"日期错误",
"2023-06-20",
"未知",
])

# 3. 异常值
for _ in range(15):
idx = random.randint(0, n_rows - 1)
field = random.choice(["平时成绩", "期中成绩", "期末成绩", "总评成绩"])
if field == "总评成绩":
df.at[idx, field] = random.choice([-10, -5.5, -20])
else:
df.at[idx, field] = random.choice([-5, -10, 105, 120, 150, -20])

# 4. 重复记录
dup_indices = random.sample(range(n_rows), 25)
for idx in dup_indices:
dup_row = df.iloc[idx].copy()
df = pd.concat([df, pd.DataFrame([dup_row])], ignore_index=True)

# 5. 打乱顺序
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

filepath = os.path.join(OUTPUT_DIR, "student_score.csv")
df.to_csv(filepath, index=False, encoding="utf-8-sig")
print(f"[✓] 模拟题二数据已生成: {filepath} ({len(df)} 条记录)")
return df

# ============================================================
# 练习题三:多文件销售数据
# ============================================================
def generate_multi_file_sales_data():
"""生成6个月的超市销售数据文件"""
months_info = [
("01", "一月"), ("02", "二月"), ("03", "三月"),
("04", "四月"), ("05", "五月"), ("06", "六月"),
]

stores = [
("S001", "长沙岳麓店"), ("S002", "长沙天心店"), ("S003", "株洲中心店"),
("S004", "湘潭雨湖店"), ("S005", "衡阳蒸湘店"),
]

product_categories = {
"食品饮料": ["可口可乐330ml", "百事可乐500ml", "农夫山泉550ml", "蒙牛纯牛奶250ml",
"康师傅方便面", "三只松鼠坚果", "良品铺子零食", "旺旺雪饼"],
"日用百货": ["维达抽纸", "蓝月亮洗衣液", "舒肤佳香皂", "佳洁士牙膏",
"清风卷纸", "立白洗洁精"],
"生鲜水果": ["红富士苹果", "海南香蕉", "广西砂糖橘", "山东大葱",
"本地白菜", "新鲜草莓"],
"酒水茶饮": ["青岛啤酒500ml", "长城干红", "西湖龙井", "金典有机奶"],
}

records_all = {}
for month_num, month_name in months_info:
records = []
days_in_month = [31, 28, 31, 30, 31, 30][int(month_num) - 1]

for day in range(1, days_in_month + 1):
if random.random() < 0.1: # 10%的天数没有数据
continue
date_str = f"2023-{month_num}-{day:02d}"
n_transactions = random.randint(5, 20)

for _ in range(n_transactions):
store_id, store_name = random.choice(stores)
cat = random.choice(list(product_categories.keys()))
product = random.choice(product_categories[cat])
qty = random.randint(1, 30)
price = round(random.uniform(2, 200), 2)
amount = round(qty * price, 2)

records.append([date_str, store_id, store_name, cat, product,
qty, amount, month_name])

df = pd.DataFrame(records, columns=[
"日期", "门店编号", "门店名称", "商品类别", "商品名称",
"销售数量", "销售金额", "月份"
])

# 注入脏数据
n = len(df)
if n > 0:
# 缺失值
for _ in range(max(1, int(n * 0.03))):
idx = random.randint(0, n - 1)
col = random.choice(["商品名称", "销售数量", "销售金额"])
df.at[idx, col] = pd.NA

# 异常值
for _ in range(max(1, int(n * 0.02))):
idx = random.randint(0, n - 1)
if random.random() < 0.5:
df.at[idx, "销售数量"] = random.choice([-5, -10, -1])
else:
df.at[idx, "销售金额"] = random.choice([-50.5, -100, -20])

# 重复行
if n > 5:
dup_indices = random.sample(range(n), min(5, n))
dup_rows = df.iloc[dup_indices]
df = pd.concat([df, dup_rows], ignore_index=True)

filepath = os.path.join(OUTPUT_DIR, f"sales_2023_{month_num}.csv")
df.to_csv(filepath, index=False, encoding="utf-8-sig")
records_all[month_num] = len(df)
print(f"[✓] 练习题三数据已生成: {filepath} ({len(df)} 条记录)")

return records_all

# ============================================================
# 练习题四:中文数字数据
# ============================================================
def chinese_num(n):
"""将阿拉伯数字转为中文数字(用于生成数据)"""
if n == 0:
return "零"
units = ["", "十", "百", "千", "万", "十", "百", "千", "亿"]
digits = ["零", "一", "二", "三", "四", "五", "六", "七", "八", "九"]

result = ""
s = str(n)
length = len(s)
for i, ch in enumerate(s):
d = int(ch)
pos = length - i - 1
if d == 0:
if result and result[-1] != "零":
result += "零"
else:
result += digits[d] + units[pos]
# 清理多余的零
result = result.replace("零零", "零")
if result.endswith("零"):
result = result[:-1]
# 处理"一十"开头的简写
if result.startswith("一十"):
result = result[1:] # "一十二" -> "十二"
return result

def generate_chinese_number_data():
"""生成含中文数字的数据"""
projects = [
"城市道路改造工程", "智慧校园建设", "图书馆数字化升级",
"校园网络改造", "实训基地建设", "体育馆翻新工程",
"学生宿舍楼建设", "教学楼节能改造", "校园安防系统升级",
"实验室设备采购", "绿化景观工程", "供水管网改造",
]

records = []
for i, project in enumerate(projects):
# 生成各种范围的金额(万元)
contract = random.choice([50, 80, 100, 120, 150, 200, 250, 300, 350, 500, 800, 1000, 1500, 2000])
actual = random.randint(int(contract * 0.6), int(contract * 1.05))
days = random.choice([30, 45, 60, 90, 120, 150, 180, 200, 240, 365])
people = random.choice([8, 10, 12, 15, 20, 25, 30, 35, 40, 45, 50, 60, 80, 100])

contract_cn = chinese_num(contract) + "万元"
actual_cn = chinese_num(actual) + "万元"
days_cn = chinese_num(days) + "天"
people_cn = chinese_num(people) + "人"

records.append([
f"R{i+1:03d}", project, contract_cn, actual_cn, days_cn, people_cn
])

df = pd.DataFrame(records, columns=[
"编号", "项目名称", "合同金额(中文)", "实际支出(中文)", "工期(中文)", "参与人数(中文)"
])

# 注入少量缺失值
n = len(df)
for _ in range(2):
idx = random.randint(0, n - 1)
col = random.choice(["合同金额(中文)", "工期(中文)"])
df.at[idx, col] = np.nan

filepath = os.path.join(OUTPUT_DIR, "chinese_number_data.csv")
df.to_csv(filepath, index=False, encoding="utf-8-sig")
print(f"[✓] 练习题四数据已生成: {filepath} ({len(df)} 条记录)")
return df

# ============================================================
# 主函数
# ============================================================
if __name__ == "__main__":
print("=" * 60)
print("Python程序开发竞赛 - 数据清洗模块模拟数据生成")
print("=" * 60)
print()

generate_order_data()
print()
generate_student_score_data()
print()
generate_multi_file_sales_data()
print()
generate_chinese_number_data()

print()
print("=" * 60)
print("所有模拟数据生成完毕!")
print(f"数据保存目录: {OUTPUT_DIR}")
print("=" * 60)

分享