Skip to content

Commit

Permalink
refactor: simplify parsing logic in reply chains
Browse files Browse the repository at this point in the history
  • Loading branch information
KarelZe committed Feb 3, 2024
1 parent 37866e1 commit 86d8da0
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 57 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ jobs:
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\jane_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "jane_doe_v_1_4_00_11161.json"
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\john_doe_v_1_4_00_11161\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "john_doe_v_1_4_00_11161.json"
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\karelze_v_23306_3309_2530_1346\IndexedDB\https_teams.live.com_0.indexeddb.leveldb" -o "karelze_v_23306_3309_2530_1346.json"
.\dist\ms_teams_parser.exe -f ".\forensicsim-data\mboufahja_v_23231_413_2355_7555\IndexedDB\https_teams.microsoft.com_0.indexeddb.leveldb" -o "mboufahja_v_23231_413_2355_7555.json"
- name: Upload results📲
uses: actions/upload-artifact@v4
with:
Expand All @@ -34,7 +33,6 @@ jobs:
jane_doe_v_1_4_00_11161.json
john_doe_v_1_4_00_11161.json
karelze_v_23306_3309_2530_1346.json
mboufahja_v_23231_413_2355_7555.json
retention-days: 1
- name: Test calling script 🖱️
run: |
Expand All @@ -43,8 +41,7 @@ jobs:
python tools/dump_localstorage.py --help
python tools/dump_sessionstorage.py --help
- name: Calculate diff 👽
run: |
run: |
git diff --no-index --word-diff .\forensicsim-data\expected-result\jane_doe_v_1_4_00_11161.json jane_doe_v_1_4_00_11161.json
git diff --no-index --word-diff .\forensicsim-data\expected-result\john_doe_v_1_4_00_11161.json john_doe_v_1_4_00_11161.json
git diff --no-index --word-diff .\forensicsim-data\expected-result\karelze_v_23306_3309_2530_1346.json karelze_v_23306_3309_2530_1346.json
git diff --no-index --word-diff .\forensicsim-data\expected-result\mboufahja_v_23231_413_2355_7555.json mboufahja_v_23231_413_2355_7555.json
85 changes: 34 additions & 51 deletions src/forensicsim/parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import logging
import warnings
from dataclasses import dataclass, field
from datetime import datetime
Expand Down Expand Up @@ -189,88 +190,79 @@ def _parse_people(people: list[dict], version: str) -> set[Contact]:
and p.get("mri") is not None
and version in ("v1", "v2")
):
p |= p.get("value", {})
parsed_people.add(Contact.from_json(json.dumps(p)))
parsed_people.add(Contact.from_dict(p | p.get("value", {})))
else:
print("Teams Version is unknown. Can not extract records of type people.")
logging.warning(
"Teams Version is unknown. Can not extract records of type people."
)
return parsed_people


def _parse_buddies(buddies: list[dict], version: str) -> set[Contact]:
parsed_buddies = set()

for b in buddies:
# Skip empty records
b_value = b.get("value", {})
# Fetch relevant data
if b_value and version in ("v1", "v2"):
buddies_of_b = b_value.get("buddies", [])
for b_of_b in buddies_of_b:
parsed_buddies.add(Contact.from_json(json.dumps(b_of_b)))
parsed_buddies.add(Contact.from_dict(b_of_b))
else:
print("Teams Version is unknown. Can not extract records of type buddies.")
logging.warning(
"Teams Version is unknown. Can not extract records of type buddies."
)
return parsed_buddies


# Conversations can contain multiple artefacts
# -> If type:Meeting then it's a meeting
def _parse_conversations(conversations: list[dict], version: str) -> set[Meeting]:
cleaned_conversations = set()

for c in conversations:
if c.get("value") is not None and version in ("v1", "v2"):
if c.get("value", {}).get("type", "") == "Meeting" and "meeting" in c.get(
"value", {}
).get("threadProperties", {}):
c |= c.get("value", {})
c |= {"cached_deduplication_key": c.get("id")}
cleaned_conversations.add(Meeting.from_json(json.dumps(c)))
value = c.get("value", {})
thread_properties = value.get("threadProperties", {})
# Conversations can contain multiple artefacts. Filter only for meetings.
if version in ("v1", "v2") and "meeting" in thread_properties:
c |= value
c |= {"cached_deduplication_key": c.get("id")}
cleaned_conversations.add(Meeting.from_dict(c))
else:
print("Teams Version is unknown. Can not extract records of type meeting.")
logging.warning(
"Teams Version is unknown. Can not extract records of type meeting."
)
return cleaned_conversations


def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]:
cleaned_reply_chains = set()

for rc in reply_chains:
rc_value = rc.get("value", {})

# Skip empty records
if rc["value"] is None:
if not rc_value:
continue

# Fetch relevant data
rc |= rc.get("value", {})
rc |= {"origin_file": rc.get("origin_file")}

rc |= rc_value
message_dict = {}
if version == "v1":
message_dict = rc.get("value", {}).get("messages", {})
message_dict = rc_value.get("messages", {})
elif version == "v2":
message_dict = rc.get("value", {}).get("messageMap", {})
message_dict = rc_value.get("messageMap", {})
else:
print(
logging.warning(
"Teams Version is unknown. Can not extract records of type reply_chains."
)
continue

for k in message_dict:
md = message_dict[k]
if (
md.get("messagetype", "") == "RichText/Html"
or md.get("messagetype", "") == "Text"
or md.get("messageType", "") == "RichText/Html"
or md.get("messageType", "") == "Text"
):
if version == "v1":
rc |= {"cached_deduplication_key": md.get("cachedDeduplicationKey")}
rc |= {"clientmessageid": md.get("clientmessageid")}
rc |= {"composetime": md.get("composetime")}
rc |= {"contenttype": md.get("contenttype")}
rc |= {"created_time": md.get("createdTime")}
rc |= {"is_from_me": md.get("isFromMe")}
rc |= {"messagetype": md.get("messagetype")}
rc |= {"messageKind": md.get("messageKind")}
rc |= {"original_arrival_time": md.get("originalarrivaltime")}

elif version == "v2":
if md.get("messagetype", "") in ("RichText/Html", "Text") or md.get(
"messageType"
) in ("RichText/Html", "Text"):
rc |= md
# map to teams 1.x keys
if version == "v2":
rc |= {"cached_deduplication_key": md.get("dedupeKey")}
rc |= {"clientmessageid": md.get("clientMessageId")}
# set to clientArrivalTime, as the compose time is no longer present in v2
Expand All @@ -280,15 +272,6 @@ def _parse_reply_chains(reply_chains: list[dict], version: str) -> set[Message]:
rc |= {"created_time": md.get("clientArrivalTime")}
rc |= {"is_from_me": md.get("isSentByCurrentUser")}
rc |= {"messagetype": md.get("messageType")}
rc |= {"original_arrival_time": md.get("originalArrivalTime")}

# Similar across versions
rc |= {"creator": md.get("creator")}
rc |= {"conversation_id": md.get("conversationId")}
rc |= {"content": md.get("content")}
rc |= {"client_arrival_time": md.get("clientArrivalTime")}
rc |= {"version": md.get("version")}
rc |= {"properties": md.get("properties")}

cleaned_reply_chains.add(Message.from_dict(rc))

Expand Down
2 changes: 1 addition & 1 deletion tools/Forensicsim_Parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@
# Factory that defines the name and details of the module and allows Autopsy
# to create instances of the modules that will do the analysis.
class ForensicIMIngestModuleFactory(IngestModuleFactoryAdapter):
def __init__(self) -> None:
def __init__(self):
self.settings = None

moduleName = "Microsoft Teams Parser"
Expand Down

0 comments on commit 86d8da0

Please sign in to comment.