BNCparse

Quirin Würschinger, LMU Munich

[email protected]

Documentation: https://wuqui.github.io/bncparse/

Please visit the above website, as GitHub cannot render everything properly in the version below.

Data overview

The diagram below (best viewed on the documentation website) gives an overview of all the data that is currently available. Variables that go beyond what is included in the downloadable version of the BNC are marked with a + prefix.

Load packages

Package requirements are stored in requirements.yml.
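The requirements file itself is not reproduced here. Judging from the code below, the notebook assumes at least the following imports (my reconstruction, not part of the original):

import os
from collections import defaultdict
from pathlib import Path

import pandas as pd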

Variables

For development, I use a small subset of the corpus in data/test that contains only the first 10 texts.

testing = True

if testing:
    path_bnc = Path('../data/test/bnc-2014-spoken')
    assert path_bnc.exists()
    texts_n = 10
    tokens_n = 94_659
else:
    path_bnc = Path('../data/bnc-2014-spoken')
    assert path_bnc.exists()
    texts_n = 1251
    tokens_n = 11_422_615
path_corpus = Path(path_bnc / 'spoken' / 'tagged')
path_corpus_untagged = Path(path_bnc / 'spoken' / 'untagged')
path_metadata = Path(path_bnc / 'spoken' / 'metadata')
fp_meta_speakers = Path('../data/bnc-2014-spoken/spoken/metadata/bnc2014spoken-speakerdata.tsv')
fp_meta_speakers_fields = Path('../data/bnc-2014-spoken/spoken/metadata/metadata-fields-speaker.txt')
fp_meta_texts = Path('../data/bnc-2014-spoken/spoken/metadata/bnc2014spoken-textdata.tsv')
fp_meta_texts_fields = Path('../data/bnc-2014-spoken/spoken/metadata/metadata-fields-text.txt')
assert path_corpus.exists()
assert path_corpus_untagged.exists()
assert path_metadata.exists()
assert fp_meta_speakers.exists()
assert fp_meta_speakers_fields.exists()
assert fp_meta_texts.exists()
assert fp_meta_texts_fields.exists()

Load and parse XML

path_texts = list(path_corpus.glob('*.xml'))
assert len(path_texts) == texts_n

get_xml (f_path)
texts = [get_xml(path) for path in path_texts]
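The source of get_xml is not included in this README. A minimal sketch of what it could look like, assuming it simply parses each file with xml.etree.ElementTree and returns the root element:

from xml.etree import ElementTree as ET

def get_xml(f_path):
    # hypothetical stand-in for the actual implementation:
    # parse one corpus file and return its root element
    return ET.parse(f_path).getroot()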

Texts

meta_texts_head = pd.read_csv(
    fp_meta_texts_fields,
    delimiter='\t',
    skiprows=1,
    index_col=0
)
meta_texts = pd.read_csv(
    fp_meta_texts, 
    delimiter='\t', 
    names=meta_texts_head['XML tag'],
    index_col=0
)

Add number of tokens per text

texts_tokens = []

for text in texts:
    text_d = {}
    text_d['text_id'] = text.get('id')
    text_d['text_toks_n'] = 0
    for tok in text.iter('w'):
        text_d['text_toks_n'] += 1
    texts_tokens.append(text_d)
texts_tokens = pd.DataFrame(texts_tokens)
texts_tokens
# reset index and call it text_id
meta_texts_merge = meta_texts.reset_index().rename(columns={'index': 'text_id'})
meta_texts = pd.merge(
    left=meta_texts_merge,
    right=texts_tokens,
    on='text_id'
)
meta_texts
if not testing:
    meta_texts.to_csv('../out/texts.csv', index=False)

Utterances

utterances = []

for text in texts:
    for u in text.findall('u'):
        u_d = {}
        u_d['text_id'] = text.get('id')
        u_d['u_n'] = u.get('n')
        u_d['u_who'] = u.get('who')
        u_d['u_trans'] = u.get('trans')
        u_d['u_whoConfidence'] = u.get('whoConfidence')
        u_d['u_toks_n'] = len(list(u.iter('w')))
        utterances.append(u_d)
utterances = pd.DataFrame(utterances)
utterances
|         | text_id | u_n | u_who | u_trans    | u_whoConfidence | u_toks_n |
|---------|---------|-----|-------|------------|-----------------|----------|
| 0       | SN64    | 1   | S0590 | nonoverlap | high            | 18       |
| 1       | SN64    | 2   | S0588 | nonoverlap | high            | 0        |
| 2       | SN64    | 3   | S0590 | nonoverlap | high            | 1        |
| 3       | SN64    | 4   | S0588 | nonoverlap | high            | 9        |
| 4       | SN64    | 5   | S0589 | overlap    | high            | 7        |
| …       | …       | …   | …     | …          | …               | …        |
| 1248105 | SMHY    | 261 | S0037 | overlap    | high            | 9        |
| 1248106 | SMHY    | 262 | S0115 | nonoverlap | high            | 2        |
| 1248107 | SMHY    | 263 | S0037 | nonoverlap | high            | 6        |
| 1248108 | SMHY    | 264 | S0115 | nonoverlap | high            | 29       |
| 1248109 | SMHY    | 265 | S0037 | nonoverlap | high            | 1        |

1248110 rows × 6 columns

if not testing:
    utterances.to_csv('../out/utterances.csv', index=False)

Create utterance table for annotating

For this, I use the untagged version of the corpus in the directory spoken/untagged/.

path_texts_untag = list(path_corpus_untagged.glob('*.xml'))
texts_untag = [get_xml(fp) for fp in path_texts_untag]

Limit to texts that have the word ‘request’ in the conv_type field of the header preamble.

texts_untag_requests = []

for text in texts_untag:
    header = text.find('header')
    if header is None:
        continue
    conv_type = header.find('conv_type')
    if conv_type is None or conv_type.text is None:
        continue
    if 'request' in conv_type.text:
        texts_untag_requests.append(text)
print(
    f'all texts: {len(texts_untag)}',
    f'texts with requests: {len(texts_untag_requests)}',
    sep='\n'
)
all texts: 1251
texts with requests: 154
utterances_requests = []

for text in texts_untag_requests:
    for u in text.iter('u'):
        u_d = {}
        u_d['text_id'] = text.get('id')
        u_d['u_n'] = u.get('n')
        u_d['u_who'] = u.get('who')
        u_d['text'] = u.text
        utterances_requests.append(u_d)
utterances_requests = pd.DataFrame(utterances_requests)
utterances_requests
|        | text_id | u_n  | u_who | text                                               |
|--------|---------|------|-------|----------------------------------------------------|
| 0      | SQ2W    | 1    | S0439 | we have to like move out of this house             |
| 1      | SQ2W    | 2    | S0441 | no                                                 |
| 2      | SQ2W    | 3    | S0439 | no no not move out of the house we have this v...  |
| 3      | SQ2W    | 4    | S0441 | None                                               |
| 4      | SQ2W    | 5    | S0439 | oh er are you here this weekend?                   |
| …      | …       | …    | …     | …                                                  |
| 204613 | SJSC    | 1166 | S0439 | mm                                                 |
| 204614 | SJSC    | 1167 | S0440 | but the pension at the moment                      |
| 204615 | SJSC    | 1168 | S0439 | we're not                                          |
| 204616 | SJSC    | 1169 | S0440 | None                                               |
| 204617 | SJSC    | 1170 | S0440 | ah bonjour Paris calling Paris calling             |

204618 rows × 4 columns

filter out utterances without text

utterances_requests = utterances_requests[utterances_requests['text'].notna()]

randomize rows

utterances_requests = utterances_requests.sample(frac=1).reset_index(drop=True)
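If the shuffle needs to be reproducible across runs, a fixed seed can be passed to sample (an optional tweak, not part of the original pipeline):

# e.g. with an arbitrary seed of 42
utterances_requests = utterances_requests.sample(frac=1, random_state=42).reset_index(drop=True)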

select first 50,000 rows

utterances_requests = utterances_requests.iloc[:50_000]

write out to out/utterances_requests_50k.csv

if not testing:
    utterances_requests.to_csv(
        '../out/utterances_requests_50k.csv', index=False)

Speakers

meta_speakers_head = pd.read_csv(
    fp_meta_speakers_fields,
    delimiter='\t',
    skiprows=1,
    index_col=0
)
meta_speakers = pd.read_csv(
    fp_meta_speakers, 
    delimiter='\t', 
    names=meta_speakers_head['XML tag'],
    index_col=0
)
meta_speakers

Add number of tokens per speaker

speakers_toks = defaultdict(int)

for text in texts:
    for u in text.iter('u'):
        who = u.get('who')
        n_words = len([w for w in u.iter('w')])
        speakers_toks[who] += n_words
speaker_toks = pd.DataFrame(list(speakers_toks.items()), columns=['who', 'speaker_toks_n'])
speaker_toks.sort_values(by='speaker_toks_n', ascending=False).head(10)
meta_speakers_merge = meta_speakers.reset_index().rename(columns={'index': 'who'})
meta_speakers = pd.merge(
    left=meta_speakers_merge,
    right=speaker_toks,
    on='who'
)
meta_speakers

Write out

if not testing:
    meta_speakers.to_csv('../out/speakers.csv', index=False)

Tokens

In addition to the metadata present in the corpus, I’ve added the following columns:

  • w_idx: token position (‘index’) in the given utterance, starting at 1
  • w_L1: preceding token
  • w_R1: subsequent token
tokens = []

for text in texts:
    tok_d = {}
    tok_d['text_id'] = text.get('id')

    for u in text.findall('u'):
        tok_d['u_n'] = u.get('n')

        u_toks = list(u.iter('w'))
        for i, w in enumerate(u_toks):
            tok_d['w_pos'] = w.get('pos')
            tok_d['w_lemma'] = w.get('lemma')
            tok_d['w_class'] = w.get('class')
            tok_d['w_usas'] = w.get('usas')
            tok_d['w_text'] = w.text
            tok_d['w_idx'] = i + 1
            tok_d['w_L1'] = u_toks[i-1].text if i > 0 else '<s>'
            tok_d['w_R1'] = u_toks[i+1].text if i < len(u_toks) - 1 else '</s>'

            tokens.append(tok_d.copy())
tokens = pd.DataFrame(tokens)
tokens.head(20)
assert len(tokens) == tokens_n

I export the full token table to tokens.csv.

if not testing:
    tokens.to_csv('../out/tokens.csv', index=False)

I also export a smaller version for use in spreadsheet software. This version contains the first 50,000 tokens in the corpus and is stored in tokens_small.csv.

if not testing:
    (tokens
     .head(50_000)
     .to_csv('../out/tokens_small.csv', index=False))

Merge tokens with metadata

tokens.info()

+ utterance information

toks_utt = pd.merge(
    tokens,
    utterances,
    on = ['text_id', 'u_n']
)
toks_utt.info()

+ text information

toks_utt_text = pd.merge(
    toks_utt,
    meta_texts,
    on = 'text_id'
)
toks_utt_text.info()

+ speaker information

toks_utt_text_speakers = pd.merge(
    toks_utt_text,
    meta_speakers,
    left_on = 'u_who',
    right_on = 'who'
)
toks_utt_text_speakers.info()
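Note that pd.merge defaults to an inner join, so tokens whose u_who value has no entry in the speaker metadata are silently dropped at this step. An optional sanity check (my addition, not part of the original pipeline):

# speakers that appear in the token table but not in the speaker metadata
missing_speakers = set(toks_utt_text['u_who']) - set(meta_speakers['who'])
print(f'speakers without metadata: {len(missing_speakers)}')
print(f'rows dropped in the speaker merge: {len(toks_utt_text) - len(toks_utt_text_speakers)}')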

Write out

if not testing:
    toks_utt_text_speakers.to_csv('../out/tokens-plus-meta.csv', index=False)
    print(f'number of rows: {len(toks_utt_text_speakers)}')
    print(f'file size: {os.path.getsize("../out/tokens-plus-meta.csv") / 1_000_000:.2f} MB')

I also write out a small version containing the first 50,000 rows for use in spreadsheet software:

if not testing:
    toks_utt_text_speakers.iloc[:50_000].to_csv(
        '../out/tokens-plus-meta_small.csv', index=False)
    print(f'number of rows: {len(toks_utt_text_speakers.iloc[:50_000])}')
    print(f'file size: {os.path.getsize("../out/tokens-plus-meta_small.csv") / 1_000_000:.2f} MB')