-
Notifications
You must be signed in to change notification settings - Fork 0
/
representative_subset_data.py
36 lines (28 loc) · 1.56 KB
/
representative_subset_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# Adjusts the display width of Jupyter notebooks to 100% of the container width for better visualization
import pandas as pd
import numpy as np
# Load the spreadsheet
global_G = pd.read_csv('lineage_AY_43.tsv', sep='\t')
# Reads a tab-separated values (TSV) file into a DataFrame named 'global_G'
# View the header of the DataFrame
global_G.columns
# Displays the column names of the DataFrame
# Apply filters to select rows with high coverage
global_High = (global_G['Is high coverage?'] == True)
# Creates a boolean Series where True indicates high coverage
global_High = global_G.iloc[np.where(global_High == True)[0]]
# Selects rows from 'global_G' where the coverage is high
# Split the 'Location' column into two separate columns
global_High["Location"] = global_High["Location"].str.split("/", n=1, expand=True)
# Splits the 'Location' column at the first occurrence of '/' into two columns
# Group by 'Location' and sample a fraction of rows
frac = global_High.groupby('Location', group_keys=False).apply(pd.DataFrame.sample, frac=.02, random_state=1)
# Groups the DataFrame by 'Location', samples 2% of rows from each group, and ensures reproducibility with a fixed random seed
# View the quantity of locations
frac.groupby('Location').size()
# Displays the size of each group in the sampled DataFrame
# Save the result to a TSV file
frac.to_csv('result_AY43.tsv', sep='\t', index=False)
# Saves the sampled DataFrame to a TSV file without including row indices