From f3b18f0e83d0aae6e228c04f25d487e11279b4ed Mon Sep 17 00:00:00 2001 From: "Y.D.X." <73375426+YDX-2147483647@users.noreply.github.com> Date: Mon, 2 Sep 2024 14:42:20 +0800 Subject: [PATCH] =?UTF-8?q?doc:=20=E6=9B=B4=E6=96=B0=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E7=BB=9F=E8=AE=A1=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/agg.md | 74 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 18 deletions(-) diff --git a/doc/agg.md b/doc/agg.md index 1704b19..15d7961 100644 --- a/doc/agg.md +++ b/doc/agg.md @@ -6,9 +6,11 @@ $ poetry install --with agg ## 流程 -在`just shell`中运行以下内容,定稿所有超期答卷。 +### 从服务器导出 -```shell +在`just shell`中运行以下内容(运行`just shell`,然后粘贴进去),定稿所有超期答卷。 + +```python from quiz.models import DraftResponse from quiz.views import continue_or_finalize @@ -16,12 +18,31 @@ for r in DraftResponse.objects.all(): print(continue_or_finalize(r)) ``` -利用`just manage dumpdata`和`just manage loaddata`,将数据复制到本地。(数据量大,而服务器资源有限) +利用`just manage dumpdata`导出数据,然后下载到本地。(数据量大,而服务器资源有限) + +```shell +$ just manage dumpdata quiz contenttypes auth --format jsonl --output db.jsonl +``` + +> [!NOTE] +> +> - 这里只导出了`quiz`相关数据,文件会小几 MB。 +> - JSON lines(`*.jsonl`)格式每行一条记录,方便检查。 + +### 在本地统计 + +清空本地数据库再创建空表,然后导入下载下来的数据。 + +```shell +$ rm ./contest/db.sqlite3 && just manage migrate +$ just manage loaddata db.jsonl --verbosity 3 +``` 再在`just shell`中运行以下内容,导出分数为`scores.csv`。 ```python """导出分数""" + from collections import deque from pathlib import Path @@ -40,6 +61,7 @@ for s in track( ): data.append((s.user.username, s.user.last_name, str(s.final_score()))) +#! EDIT HERE: scores.csv 的导出路径 Path("scores.csv").write_text("\n".join(map(",".join, data)), encoding="utf-8") ``` @@ -47,13 +69,15 @@ Path("scores.csv").write_text("\n".join(map(",".join, data)), encoding="utf-8") ```python """计算每连情况""" + from pathlib import Path import polars as pl # 1. 
Load data and verify -scores_path = next(Path(__file__).parent.glob("scores*.csv")) +#! EDIT HERE: scores.csv 的路径,由上一步导出 +scores_path = next(Path.cwd().glob("scores*.csv")) print(f"分析“{scores_path}”。") scores = pl.scan_csv( @@ -61,14 +85,22 @@ scores = pl.scan_csv( schema={"id": pl.Utf8, "name": pl.Utf8, "score": pl.Int16}, ) -people = pl.read_excel( - Path("D:/大学/Clubs/NetPioneer_2022_2023/技术保障中心/国防知识竞赛/连队人员信息0830.xlsx"), - read_csv_options={ - "schema": {"营团": pl.Utf8, "连队": pl.Int64, "id": pl.Utf8, "name": pl.Utf8} - }, -).join(scores.collect(), on="id", how="outer") +people = ( + pl.read_excel( + #! EDIT HERE: 学生名单的路径,由学工部提供 + Path("2024-08-20军训学生分连队.xlsx"), + read_csv_options={ + #! EDIT HERE: 按顺序记录名单格式,多余的列可在后面 drop + "schema": {"序号": pl.Utf8, "id": pl.Utf8, "name": pl.Utf8, "连队": pl.Utf8} + }, + ) + .drop("序号") + # 解析 "1连" → 1 + .with_columns(pl.col("连队").str.strip_suffix("连").cast(pl.Int16)) + .join(scores.collect(), on="id", how="outer") +) -print("名单没有的同学:", people.filter(pl.col("name").is_null())) +print("名单没有的同学:", people.filter(pl.col("name").is_null()).sort("id")) inconsistent = people.filter( pl.col("name").is_not_null() @@ -81,7 +113,7 @@ people = ( people.lazy() .with_columns(pl.col("name").fill_null(pl.col("name_right"))) .drop("name_right") - .filter(pl.col("营团").is_not_null() & pl.col("连队").is_not_null()) + .filter(pl.col("连队").is_not_null()) .collect() ) @@ -91,17 +123,23 @@ print("名单中同学:", people.describe()) q = ( people.lazy() - .group_by("营团", "连队") + #! EDIT HERE: 若分了营团,可考虑改为 group_by("营团", "连队") 和 sort("营团", "连队") + .group_by("连队") .agg( - pl.count().alias("总人数"), - pl.col("score").fill_null(0).mean().alias("平均分"), - (pl.col("score") > 0).sum().alias("答题人数"), + pl.count().alias("应参与人数"), + (pl.col("score") > 0).sum().alias("实际参与人数"), + #! 
EDIT HERE: 需要统计的数据 + (pl.col("score").fill_null(0) > 0).mean().alias("参与率"), + pl.col("score").fill_null(0).sum().alias("总得分"), + pl.col("score").fill_null(0).mean().alias("均分"), + pl.col("score").filter(pl.col("score") > 0).mean().alias("有成绩学生平均分"), ) - .with_columns((pl.col("答题人数") / pl.col("总人数")).alias("答题比例")) - .sort("营团", "连队") + .sort("连队") + .with_columns(pl.col("连队").cast(pl.Utf8) + "连") ) df = q.collect() print("各连情况:", df.describe()) +#! EDIT HERE: 统计结果的导出路径 agg_path = scores_path.with_name(f"{scores_path.stem}-agg.xlsx") -df.write_excel(agg_path, column_formats={"答题比例": "0.00%"}) +df.write_excel(agg_path, column_formats={"参与率": "0.00%"}) print(f"已保存到“{agg_path}”。")