Skip to content

Commit

Permalink
doc: 更新数据统计方法
Browse files Browse the repository at this point in the history
  • Loading branch information
YDX-2147483647 committed Sep 2, 2024
1 parent 02f29e9 commit f3b18f0
Showing 1 changed file with 56 additions and 18 deletions.
74 changes: 56 additions & 18 deletions doc/agg.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,43 @@ $ poetry install --with agg

## 流程

`just shell`中运行以下内容,定稿所有超期答卷。
### 从服务器导出

```shell
在`just shell`中运行以下内容(运行`just shell`,然后粘贴进去),定稿所有超期答卷。

```python
from quiz.models import DraftResponse
from quiz.views import continue_or_finalize

for r in DraftResponse.objects.all():
print(continue_or_finalize(r))
```

利用`just manage dumpdata`和`just manage loaddata`,将数据复制到本地。(数据量大,而服务器资源有限)
利用`just manage dumpdata`导出数据,然后下载到本地。(数据量大,而服务器资源有限)

```shell
$ just manage dumpdata quiz contenttypes auth --format jsonl --output db.jsonl
```

> [!NOTE]
>
> - 这里只导出了`quiz`相关数据,文件会小几 MB。
> - JSON lines(`*.jsonl`)格式每行一条记录,方便检查。
### 在本地统计

清空本地数据库再创建空表,然后导入下载下来的数据。

```shell
$ rm ./contest/db.sqlite3 && just manage migrate
$ just manage loaddata db.jsonl --verbosity 3
```

再在`just shell`中运行以下内容,导出分数为`scores.csv`。

```python
"""导出分数"""

from collections import deque
from pathlib import Path

Expand All @@ -40,35 +61,46 @@ for s in track(
):
data.append((s.user.username, s.user.last_name, str(s.final_score())))

#! EDIT HERE: scores.csv 的导出路径
Path("scores.csv").write_text("\n".join(map(",".join, data)), encoding="utf-8")
```

然后进一步计算每连情况。

```python
"""计算每连情况"""

from pathlib import Path

import polars as pl

# 1. Load data and verify

scores_path = next(Path(__file__).parent.glob("scores*.csv"))
#! EDIT HERE: scores.csv 的路径,由上一步导出
scores_path = next(Path.cwd().glob("scores*.csv"))
print(f"分析“{scores_path}”。")

scores = pl.scan_csv(
scores_path,
schema={"id": pl.Utf8, "name": pl.Utf8, "score": pl.Int16},
)

people = pl.read_excel(
Path("D:/大学/Clubs/NetPioneer_2022_2023/技术保障中心/国防知识竞赛/连队人员信息0830.xlsx"),
read_csv_options={
"schema": {"营团": pl.Utf8, "连队": pl.Int64, "id": pl.Utf8, "name": pl.Utf8}
},
).join(scores.collect(), on="id", how="outer")
people = (
pl.read_excel(
#! EDIT HERE: 学生名单的路径,由学工部提供
Path("2024-08-20军训学生分连队.xlsx"),
read_csv_options={
#! EDIT HERE: 按顺序记录名单格式,多余的列可在后面 drop
"schema": {"序号": pl.Utf8, "id": pl.Utf8, "name": pl.Utf8, "连队": pl.Utf8}
},
)
.drop("序号")
# 解析 "1连" → 1
.with_columns(pl.col("连队").str.strip_suffix("连").cast(pl.Int16))
.join(scores.collect(), on="id", how="outer")
)

print("名单没有的同学:", people.filter(pl.col("name").is_null()))
print("名单没有的同学:", people.filter(pl.col("name").is_null()).sort("id"))

inconsistent = people.filter(
pl.col("name").is_not_null()
Expand All @@ -81,7 +113,7 @@ people = (
people.lazy()
.with_columns(pl.col("name").fill_null(pl.col("name_right")))
.drop("name_right")
.filter(pl.col("营团").is_not_null() & pl.col("连队").is_not_null())
.filter(pl.col("连队").is_not_null())
.collect()
)

Expand All @@ -91,17 +123,23 @@ print("名单中同学:", people.describe())

q = (
people.lazy()
.group_by("营团", "连队")
#! EDIT HERE: 若分了营团,可考虑改为 group_by("营团", "连队") 和 sort("营团", "连队")
.group_by("连队")
.agg(
pl.count().alias("总人数"),
pl.col("score").fill_null(0).mean().alias("平均分"),
(pl.col("score") > 0).sum().alias("答题人数"),
pl.count().alias("应参与人数"),
(pl.col("score") > 0).sum().alias("实际参与人数"),
#! EDIT HERE: 需要统计的数据
(pl.col("score").fill_null(0) > 0).mean().alias("参与率"),
pl.col("score").fill_null(0).sum().alias("总得分"),
pl.col("score").fill_null(0).mean().alias("均分"),
pl.col("score").filter(pl.col("score") > 0).mean().alias("有成绩学生平均分"),
)
.with_columns((pl.col("答题人数") / pl.col("总人数")).alias("答题比例"))
.sort("营团", "连队")
.sort("连队")
.with_columns(pl.col("连队").cast(pl.Utf8) + "连")
)
df = q.collect()
print("各连情况:", df.describe())
#! EDIT HERE: 统计结果的导出路径
agg_path = scores_path.with_name(f"{scores_path.stem}-agg.xlsx")
df.write_excel(agg_path, column_formats={"参与率": "0.00%"})
print(f"已保存到“{agg_path}”。")
Expand Down

0 comments on commit f3b18f0

Please sign in to comment.