-
Notifications
You must be signed in to change notification settings - Fork 1
/
run_streamlit.py
640 lines (523 loc) · 31.8 KB
/
run_streamlit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
#!/usr/bin/env python3
"""
To run:
streamlit run ./run_streamlit.py
"""
from datetime import datetime
import numpy as np
import pandas as pd
# import math
from src import util
from src import filter_util
from src.helpers import io
from src import constants
from src import html_util
import streamlit as st
# from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode, JsCode
import streamlit.components.v1 as components
# import requests
# import webbrowser
from PIL import Image
import yaml
# Module-level store populated at app startup by streamlit_app():
#   INFO["constants"] -- constant tables returned by load_constants()
#   INFO["data"]      -- dataset-summary DataFrame returned by load_data()
INFO = {}
@st.cache_data
def load_constants():
    """Read every constants table once; Streamlit caches the result across reruns."""
    return io.read_all_constants()
@st.cache_data
def load_data():
    """Load all dataset summaries, annotate license criteria, and return a DataFrame.

    Missing values are replaced with "" so downstream display code never sees NaN.
    NOTE(review): reads INFO["constants"], so load_constants() must have been
    stored there first (streamlit_app does this) -- verify on any new call site.
    """
    summaries = io.read_data_summary_json("data_summaries/")
    summaries = filter_util.map_license_criteria(summaries, INFO["constants"])
    return pd.DataFrame(summaries).fillna("")
# def render_tweet(tweet_url):
# api = "https://publish.twitter.com/oembed?url={}".format(tweet_url)
# response = requests.get(api)
# html_result = response.json()["html"]
# st.text(html_result)
# components.html(html_result, height= 360, scrolling=True)
def insert_main_viz():
    """Embed the main p5.js dataset visualization.

    Builds an HTML snippet that loads p5.js from a CDN, inlines the dataset
    JSON as a global ``JSONDATA``, appends the sketch source, and renders the
    whole thing in a Streamlit HTML component.
    """
    # Fix: the original called open() twice without ever closing the handles;
    # context managers release them deterministically.
    with open("static/ds_viz1.json", 'r', encoding='utf-8') as f:
        viz_json = f.read()
    with open("static/sketch.js", 'r', encoding='utf-8') as f:
        sketch_js = f.read()
    sketch = '<script src="https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.6.0/p5.js"></script>'
    sketch += '<script src="https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.6.0/addons/p5.sound.min.js"></script>'
    sketch += '<script>'
    sketch += "const JSONDATA = " + viz_json + "\n"
    sketch += sketch_js
    sketch += '</script>'
    components.html(sketch, height=800, scrolling=True)
def custom_metric(caption, score, delta=None):
    """Render a large green score, an optional delta line, and a gray caption.

    Markdown-based stand-in for ``st.metric`` so the value can be colored.
    A falsy ``delta`` (None, "", 0) is simply not rendered.
    """
    st.markdown(f"## :green[{score}]")
    if delta:
        st.markdown(f" {delta}")
    st.markdown(f":gray[{caption}]")
def display_metrics(metrics, df_metadata):
    """Show summary counts for the current selection against collection totals.

    ``metrics`` describes the filtered selection; ``df_metadata`` the full,
    unfiltered collection, rendered as the "/ total" delta of each metric.
    """
    # Each cell: (caption, metrics key, mode).
    #   "count"  -> len() of both the selection and the total
    #   "scalar" -> raw values on both sides
    #   "plain"  -> selection value only, no total
    layout = (
        (("Collections", "collections", "count"),
         ("Datasets", "datasets", "count"),
         ("Dialogs", "dialogs", "scalar")),
        (("Languages", "languages", "count"),
         ("Task Categories", "task_categories", "count"),
         ("Topics", "topics", "count")),
        (("Text Domains", "domains", "count"),
         ("Text Sources", "sources", "count"),
         ("% Synthetic Text", "synthetic_pct", "plain")),
    )
    for column, cells in zip(st.columns(3), layout):
        with column:
            for caption, key, mode in cells:
                if mode == "count":
                    custom_metric(caption, len(metrics[key]), delta=f"/ {len(df_metadata[key])}")
                elif mode == "scalar":
                    custom_metric(caption, metrics[key], delta=f"/ {df_metadata[key]}")
                else:
                    custom_metric(caption, metrics[key])
def insert_metric_container(title, key, metrics):
    """Draw a captioned Altair bar chart for one metric distribution."""
    with st.container():
        st.caption(title)
        chart = util.plot_altair_barchart(metrics[key])
        st.altair_chart(chart, use_container_width=True, theme="streamlit")
def add_instructions():
    """Render the page header: title, project intro, legal disclaimer,
    repository/paper links, logo, and the expandable form instructions.

    Fix: user-facing grammar in the Share-Alike instruction
    ("allows other to" -> "allows others to").
    """
    st.title("Data Provenance Explorer")

    col1, col2 = st.columns([0.75, 0.25], gap="medium")
    with col1:
        intro_sents = """
The Data Provenance Initiative is a large-scale audit of AI datasets used to train large language models. As a first step, we've traced 1800+ popular,
text-to-text finetuning datasets from origin to creation, cataloging their data sources, licenses, creators, and other metadata, for researchers to explore
using this tool.
"""
        follow_sents = "The purpose of this work is to improve transparency, documentation, and informed use of datasets in AI. "
        st.write(" ".join([intro_sents, follow_sents]))
        st.write("You can download this data (with filters) directly from the [Data Provenance Collection](https://github.com/Data-Provenance-Initiative/Data-Provenance-Collection).")
        # NOTE(review): the address below looks redacted by a web scrape
        # ("[email protected]") -- restore the real contact address.
        st.write("If you wish to contribute or discuss, please feel free to contact the organizers at [[email protected]](mailto:[email protected]).")
        st.write("""
NB: It is important to note we collect *self-reported licenses*, from the papers and repositories that released these datasets, and categorize them according to our best efforts,
as a volunteer research and transparency initiative. The information provided by any of our works and any outputs of the Data Provenance Initiative :red[do **NOT**, and are **NOT**
intended to, constitute legal advice]; instead, all information, content, and materials are for general informational purposes only.
""")

        col1a, col1b, col1c = st.columns([0.16, 0.16, 0.68], gap="small")
        with col1a:
            st.link_button("Data Repository", 'https://github.com/Data-Provenance-Initiative/Data-Provenance-Collection', type="primary")
        with col1b:
            st.link_button("Paper", 'https://www.dataprovenance.org/paper.pdf', type="primary")

    with col2:
        image = Image.open('dpi.png')
        st.image(image)

    st.subheader("Instructions")
    form_instructions = """
1. **Select the source from where the license information should be retrieved.** The options are `DataProvenance (ours)`, `HuggingFace`, and `GitHub`.
2. **Select from the licensed data use cases**. The options range from least to most strict:
`Commercial`, `Unspecified`, `Non-Commercial`, `Academic-Only`.
    * `Commercial` will select only the data with licenses explicitly permitting commercial use.
    * `Unspecified` includes Commercial plus datasets with no license found attached, which may suggest the curator does not prohibit commercial use.
    * `Non-Commercial` includes Commercial and Unspecified datasets plus those licensed for non-commercial use.
    * `Academic-Only` will select all available datasets, including those that restrict to only academic uses.
Note that these categories reflect the *self-reported* licenses attached to datasets, and assume fair use of any data they are derived from (e.g. scraped from the web).
3. Select whether to include datasets with **Attribution requirements in their licenses**.
4. Select whether to include datasets with **`Share-Alike` requirements in their licenses**.
Share-Alike means a copyright left license, that allows others to re-use, re-mix, and modify works, but requires that derivative work is distributed under the same terms and conditions.
5. Select whether to ignore the [OpenAI Terms of Use](https://openai.com/policies/terms-of-use) as a Non-Commercial restriction, and include datasets that are at least partially **generated by
OpenAI** (inputs, outputs, or both). While the OpenAI terms state you cannot ``use output from the Services to develop models that compete with OpenAI'', there is debate as to their enforceability
and applicability to third parties who did not generate this data themselves. See our Legal Discussion section in the paper for more discussion on these terms.
6. Select to use GitHub license information if not available ("undefined") for our Data Provenance source
(i.e. In cases where we do not have complete information about a data source, we have the option to use license information from GitHub as a fallback. This means that if license information is unavailable,
we can use the information (if available) from GitHub to fill in the gaps and ensure that we have complete and accurate license information for our Data Provenance source.).
7. **Select Language Families** to include.
8. **Select Task Categories** to include.
9. **Select Time of Collection**. By default it includes all datasets.
10. **Select the Text Domains** to include.
11. **Select the allowed text sources**.
12. **Select whether to include datasets with model generated data**.
Finally, Submit Selection when ready!
"""
    with st.expander("Expand for Instructions!"):
        st.write(form_instructions)
def streamlit_app():
    """Top-level page renderer for the Data Provenance Explorer.

    Flow: load cached constants + data, render the intro/instructions,
    collect filter criteria in a form, and — once submitted — render five
    tabs of summaries and visualizations over the filtered datasets.

    Fixes relative to the previous revision:
      * the downloadable YAML config crashed with AttributeError whenever a
        non-default time range was chosen (``.strftime`` called on strings);
      * ``extract_infos`` no longer assumes dict-valued metadata cells;
      * minor user-facing typos corrected.
    """
    st.set_page_config(page_title="Data Provenance Explorer", layout="wide")

    INFO["constants"] = load_constants()
    INFO["data"] = load_data()
    # Totals over the *unfiltered* collection; shown as "/ total" deltas.
    df_metadata = util.compute_metrics(INFO["data"], INFO["constants"])

    add_instructions()

    st.markdown("""Select the preferred criteria for your datasets.""")

    with st.form("data_selection"):
        col1, col2, col3 = st.columns([1, 1, 1], gap="medium")
        with col1:
            licensesource_multiselect = st.multiselect(
                'Select the license source to select a dataset',
                ["DataProvenance", "HuggingFace", "GitHub"],
                ["DataProvenance"])
            license_multiselect = st.select_slider(
                'Select the datasets licensed for these use cases',
                options=constants.LICENSE_USE_CLASSES,
                value="Academic-Only")
            license_attribution = st.toggle('Include Datasets w/ Attribution Requirements', value=True)
            license_sharealike = st.toggle('Include Datasets w/ Share Alike Requirements', value=True)
            openai_license_override = st.toggle('Always include datasets w/ OpenAI-generated data. (I.e. See `instructions` above for details.)', value=False)
            dpi_license_override = st.toggle('Use GitHub Licence information if DataProvenance is undefined. (I.e. See `instructions` above for details.)', value=False)
        with col3:
            taskcats_multiselect = st.multiselect(
                'Select the task categories to cover in your datasets',
                ["All"] + list(INFO["constants"]["TASK_GROUPS"].keys()),
                ["All"])
            domain_multiselect = st.multiselect(
                'Select the domain types to cover in your datasets',
                ["All"] + list(INFO["constants"]["DOMAIN_GROUPS"].keys()),
                ["All"])
            text_sources = st.multiselect(
                'Select the text sources to cover in your datasets',
                ["All"] + io.read_txt('src/configs/limited_text_sources.txt'),
                ["All"])
            model_generated = st.toggle('Include only datasets that are not generated by a model', value=True)
        with col2:
            language_multiselect = st.multiselect(
                'Select the languages to cover in your datasets',
                ["All"] + list(INFO["constants"]["LANGUAGE_GROUPS"].keys()),
                ["All"])
            time_range_selection = st.slider(
                "Select data release time constraints",
                value=(datetime(2000, 1, 1), datetime(2023, 12, 1)))
        st.divider()
        # Every form must have a submit button.
        submitted = st.form_submit_button("Submit Selection")

    if submitted:
        start_time = time_range_selection[0].strftime('%Y-%m-%d')
        end_time = time_range_selection[1].strftime('%Y-%m-%d')
        # Slider endpoints mean "no constraint": map them to None so datasets
        # without a release time are still included.
        if start_time == "2000-01-01":
            start_time = None
        if end_time == "2023-12-01":
            end_time = None

        filtered_df = filter_util.apply_filters(
            df=INFO["data"],
            all_constants=INFO["constants"],
            selected_collection=None,
            selected_licenses=None,  # Select all licenses.
            selected_license_sources=licensesource_multiselect,
            selected_license_use=license_multiselect,
            openai_license_override=openai_license_override,
            selected_license_attribution=str(int(license_attribution)),
            selected_license_sharealike=str(int(license_sharealike)),
            selected_languages=language_multiselect,
            selected_task_categories=taskcats_multiselect,
            selected_domains=domain_multiselect,
            selected_start_time=start_time,
            selected_end_time=end_time,
            dpi_undefined_license_override=int(dpi_license_override),
            no_synthetic_data=model_generated,
            text_source_allow_list=text_sources
        )

        def format_datetime(value):
            # Normalize timestamps for the JS components / YAML dump; every
            # other cell value passes through untouched.
            if isinstance(value, pd.Timestamp):
                return value.strftime('%Y-%m-%d')
            return value

        # NOTE: DataFrame.applymap is deprecated in pandas >= 2.1 in favor of
        # DataFrame.map; kept for compatibility with older pandas versions.
        formatted_df = filtered_df.applymap(format_datetime)
        filtered_data_summary = {row["Unique Dataset Identifier"]: row for row in formatted_df.to_dict(orient='records')}

        # Build a downloadable YAML config mirroring the selection so the
        # same filters can be replayed against the collection repository.
        config_data = {
            "collection": None,
            "license_use": license_multiselect.lower(),
            "licenses": None,
            "license_sources": licensesource_multiselect,
            "dpi-undefined-license-override": int(dpi_license_override),
            "openai-license-override": int(openai_license_override),
            "license_attribution": int(license_attribution),
            "license_sharealike": int(license_sharealike),
            "model-generated": int(model_generated),
            "languages": [str(language) for language in language_multiselect if language != "All"],
            "tasks": [str(task) for task in taskcats_multiselect if task != "All"],
            "domains": [str(domain) for domain in domain_multiselect if domain != "All"],
            "text-sources": [str(text_source) for text_source in text_sources if text_source != "All"],
            # BUGFIX: start_time/end_time are already '%Y-%m-%d' strings (or
            # None) at this point; the previous code called .strftime on them,
            # which raised AttributeError for any non-default time range.
            "start-time": start_time,
            "end-time": end_time,
            "data-limit": 0,
            "output-format": "messages",
            "savedir": "data/",
        }
        config_str = yaml.dump(config_data, default_flow_style=None, sort_keys=False)
        timestep = datetime.now().strftime("%Y%m%d%H%M%S")
        st.download_button(
            label="Download Configuration File",
            data=config_str,
            file_name=f"{timestep}_config_DPI.yaml",
            mime="application/x-yaml"
        )

    tab1, tab2, tab3, tab4, tab5 = st.tabs([
        "Data Summary",
        ":rainbow[Global Representation] :earth_africa:",
        "Text Characteristics :test_tube:",
        "Data Licenses :vertical_traffic_light:",
        "Inspect Individual Datasets :mag:"])

    with tab1:
        if not submitted:
            st.write("When you're ready, fill out your data filtering criteria on the left, and click Submit!\n\n")
        elif submitted:
            # `metrics` is reused by tab4 below; both render only when submitted.
            metrics = util.compute_metrics(filtered_df, INFO["constants"])
            st.subheader('General Properties of your collection')
            st.write(r"""
Given your selection, see the quantity of data (collections, datasets, dialogs), the characteristics of the data (languages, tasks, topics),
and the sources of data covered (sources, domains, \% synthetically generated by models).
""")
            st.markdown('#')
            display_metrics(metrics, df_metadata)
            with st.container():
                st.header('Summary of Data Collections')
                table = util.prep_collection_table(filtered_df, INFO["data"], metrics)
                html_util.setup_table(table)

    with tab2:
        st.header(":rainbow[Global Representation] :earth_africa:")
        tab2_intro = """This section explores the representation of text datasets internationally.
These datasets contain a wide distribution of languages, and are created by many organizations and institutions.
We measure both the representation across countries in which these languages are spoken, as well as "who creates these datasets"?
"""
        st.write(tab2_intro)
        if submitted:
            st.subheader("Language Representation by Country")
            st.write(r"""
First we visualize the language coverage per country, according to the spoken languages and their representation in the Data Provenance Collection.
We compute a score $S_k$ for each country $k$, parametrized by $p_{kl}$, the percentage of people in country $k$ that speak language $l$, and $w_{li}$
which is a binary indicator that is 1 if dataset $i \in D$ contains language $l$ and 0 otherwise.
""")
            st.latex(r'''
S_k = \sum_{l \in L} \left( p_{kl} \times \sum_{i \in D} w_{li} \right)
''')
            html_util.compose_html_component(
                filtered_data_summary,
                "language-map.js", {
                    "world": "html/countries-50m.json",
                    "countryCodes": "html/country-codes.json",
                    "langCodes": "html/language-codes.json",
                    "countryCodeToLangCodes": "html/country-code-to-language-codes.json",
                })
            st.write("NB: While many global south countries have large English speaking populations, it may still not mean they are well represented by English text from Western/European origins.")
            st.subheader("Dataset Creator Representation by Country")
            st.write("Here we visualize the density of organizations that package/create these datasets for machine learning, in contrast to the above.")
            st.write("This may help answer 'who owns the data?'")
            html_util.compose_html_component(
                filtered_data_summary,
                "creator-map.js", {
                    "world": "html/countries-50m.json",
                    "countryToCreator": "html/constants/creator_groups_by_country.json",
                })
            st.subheader("Dataset Creator Proportions")
            st.write("Here we count the contributions of organizations to dataset creation.")
            html_util.compose_html_component(
                filtered_data_summary,
                "creator-sunburst.js", {
                    "CREATOR_GROUPS": "html/constants/creator_groups.json",
                }, 1600)

    with tab3:
        st.header("Text Characteristics :test_tube:")
        st.write("This section covers various characteristics of the text in the datasets.")
        if submitted:
            st.subheader("Text Length Metrics x License Category")
            st.write("Text-to-text datasets are formatted as an input-target pair.")
            st.write("Here each point is a dataset, showing its input text length (in characters), target text length (in characters), and license category.")
            html_util.compose_html_component(
                filtered_data_summary, "text-metrics-licenses.js", {})
            st.subheader("Text Length Metrics x Regular/Synthetic Text")
            st.write("New text-to-text datasets are often synthetically generated by large models like GPT-4.")
            st.write("""
Here each point is a dataset, showing its input text length (in characters), target text length (in characters), and whether it is synthetically generated,
or manually/human created.
""")
            html_util.compose_html_component(
                filtered_data_summary, "text-metrics-synthetic.js", {})
            st.subheader("Task Category Distribution")
            st.write("Here we measure the variety and distribution of tasks that the datasets represent -- i.e. what they're teaching a model to do.")
            html_util.compose_html_component(
                filtered_data_summary,
                "tasks-sunburst.js", {
                    "TASK_GROUPS": "html/constants/task_groups.json",
                }, 1200)
            st.subheader("Text Source Domains")
            st.write("""
Many datasets are originally scraped from the web or other sources. For the data you've selected, we cluster the original sources by Domain,
quantify them and show the top 5 sources per domain.
""")
            html_util.compose_html_component(
                filtered_data_summary,
                "source-tree.js", {
                    "DOMAIN_GROUPS": "html/constants/domain_groups.json",
                }, 2400)

    with tab4:
        st.header("Data Licenses :vertical_traffic_light:")
        st.write("This section explores the *self-reported* data licenses by the creators of each dataset.")
        tab4_intro = """
Note a few important limitations:
* The legal status of data licenses is not always clear and may be different by jurisdiction.
* Despite our best efforts, omissions or mistakes are possible.
* This transparency initiative is **not** intended as legal advice, and bears no responsibility on how the *self-reported* licenses are used.
"""
        st.markdown(tab4_intro)
        if submitted:
            st.subheader("License Distribution")
            st.write("Here we see the license distribution of those collected by the Data Provenance Initiative.")
            insert_metric_container("License Distribution", "licenses", metrics)

    with tab5:
        st.header("Inspect Individual Datasets :mag:")
        with st.form("data_explorer"):
            dataset_select = st.selectbox(
                'Select the dataset in this collection to inspect',
                list(set(INFO["data"]["Unique Dataset Identifier"])))
            submitted2 = st.form_submit_button("Submit Selection")

        if submitted2:
            tab2_selected_df = INFO["data"][INFO["data"]["Unique Dataset Identifier"] == dataset_select]
            tab2_metrics = util.compute_metrics(tab2_selected_df, INFO["constants"])
            display_metrics(tab2_metrics, df_metadata)

            with st.container():
                dataset_info_keys = [
                    "Unique Dataset Identifier",
                    "Paper Title",
                    "Dataset URL",
                    "Hugging Face URL",
                ]
                data_characteristics_info_keys = [
                    "Format", "Languages", "Task Categories",
                    ("Inferred Metadata", "Text Topics")
                ]
                data_provenance_info_keys = ["Creators", "Text Sources", "Licenses"]

                def extract_infos(df, key, numerical=False):
                    # Pull one column (or a sub-key of a dict-valued column) and
                    # collapse it: mean for numerics, a set of license names for
                    # "Licenses", otherwise a de-duplicated list of values.
                    if isinstance(key, tuple):
                        cells = df[key[0]].tolist()
                        # Robustness: load_data's fillna("") can leave "" where a
                        # metadata dict is missing; skip non-dict cells instead of
                        # raising AttributeError on str.get.
                        entries = [cell.get(key[1], []) if isinstance(cell, dict) else [] for cell in cells]
                    else:
                        entries = df[key].tolist()
                    if not entries:
                        return []
                    elif numerical:
                        # NOTE(review): np.mean([]) yields nan with a warning if
                        # every entry is falsy — confirm upstream data always has
                        # text metrics for at least one row.
                        return np.mean([x for x in entries if x])
                    elif key == "Licenses":
                        return set([x["License"] for xs in entries for x in xs if x and x["License"]])
                    elif isinstance(entries[0], list):
                        return list(set([x for xs in entries if xs for x in xs if x]))
                    else:
                        return list(set([x for x in entries if x]))

                def format_markdown_entry(dset_info, info_key):
                    # Render "<key>: <value>"; lists/sets become markdown bullets.
                    # Falsy values render nothing at all.
                    if dset_info:
                        info_key = info_key if isinstance(info_key, str) else info_key[1]
                        markdown_txt = dset_info
                        if isinstance(dset_info, list) or isinstance(dset_info, set):
                            markdown_txt = "\n* " + "\n* ".join([str(x) for x in dset_info])
                        st.markdown(f"{info_key}: {markdown_txt}")

                if dataset_select != "All":
                    st.subheader("Dataset Information")
                    for info_key in dataset_info_keys:
                        dset_info = extract_infos(tab2_selected_df, info_key)
                        if len(dset_info):
                            format_markdown_entry(dset_info[0], info_key)
                    st.subheader("Data Characteristics")
                    for info_key in data_characteristics_info_keys:
                        dset_info = extract_infos(tab2_selected_df, info_key)
                        format_markdown_entry(dset_info, info_key)
                    st.subheader("Data Statistics")
                    dset_info = extract_infos(tab2_selected_df, ("Text Metrics", "Num Dialogs"), numerical=True)
                    format_markdown_entry(round(dset_info, 0), "Num Exs")
                    dset_infos = [extract_infos(tab2_selected_df, info_key, numerical=True) for info_key in [
                        ("Text Metrics", "Min Inputs Length"),
                        ("Text Metrics", "Mean Inputs Length"),
                        ("Text Metrics", "Max Inputs Length")]]
                    format_markdown_entry(" | ".join([str(round(x, 1)) for x in dset_infos]), "Input Length (characters) [Minimum | Mean | Maximum]")
                    dset_infos = [extract_infos(tab2_selected_df, info_key, numerical=True) for info_key in [
                        ("Text Metrics", "Min Targets Length"),
                        ("Text Metrics", "Mean Targets Length"),
                        ("Text Metrics", "Max Targets Length")]]
                    format_markdown_entry(" | ".join([str(round(x, 1)) for x in dset_infos]), "Target Length (characters) [Minimum | Mean | Maximum]")
                    st.subheader("Data Provenance")
                    for info_key in data_provenance_info_keys:
                        dset_info = extract_infos(tab2_selected_df, info_key)
                        format_markdown_entry(dset_info, info_key)
# ## SIDEBAR STARTS HERE
# with st.sidebar:
# st.markdown("""Select the preferred criteria for your datasets.""")
# with st.form("data_selection"):
# # st.write("Select the acceptable license values for constituent datasets")
# license_multiselect = st.select_slider(
# 'Select the datasets licensed for these use cases',
# options=constants.LICENSE_USE_CLASSES,
# value="Academic-Only")
# license_attribution = st.toggle('Exclude Datasets w/ Attribution Requirements', value=False)
# license_sharealike = st.toggle('Exclude Datasets w/ Share Alike Requirements', value=False)
# openai_license_override = st.toggle('Include Datasets w/ OpenAI-generated data', value=False)
# # with data_select_cols[1]:
# language_multiselect = st.multiselect(
# 'Select the languages to cover in your datasets',
# ["All"] + list(INFO["constants"]["LANGUAGE_GROUPS"].keys()),
# ["All"])
# # with data_select_cols[2]:
# taskcats_multiselect = st.multiselect(
# 'Select the task categories to cover in your datasets',
# ["All"] + list(INFO["constants"]["TASK_GROUPS"].keys()),
# ["All"])
# with st.expander("More advanced criteria"):
# # format_multiselect = st.multiselect(
# # 'Select the format types to cover in your datasets',
# # ["All"] + INFO["constants"]["FORMATS"],
# # ["All"])
# domain_multiselect = st.multiselect(
# 'Select the domain types to cover in your datasets',
# ["All"] + list(INFO["constants"]["DOMAIN_GROUPS"].keys()),
# # ["All", "Books", "Code", "Wiki", "News", "Biomedical", "Legal", "Web", "Math+Science"],
# ["All"])
# time_range_selection = st.slider(
# "Select data release time constraints",
# value=(datetime(2000, 1, 1), datetime(2023, 12, 1)))
# # Every form must have a submit button.
# submitted = st.form_submit_button("Submit Selection")
# ### SIDEBAR ENDS HERE
if __name__ == "__main__":
    # Launched via: streamlit run ./run_streamlit.py (see module docstring).
    streamlit_app()