Skip to content

Commit

Permalink
update phytools vignette
Browse files Browse the repository at this point in the history
  • Loading branch information
sdgamboa committed Sep 25, 2023
1 parent 27322be commit 3d330d8
Showing 1 changed file with 59 additions and 34 deletions.
93 changes: 59 additions & 34 deletions vignettes/articles/phytools.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ select_cols <- c(
'Attribute_type', 'Attribute_group'
)
valid_ranks <- c('genus', 'species', 'strain')
## It's worth checking the ranks
phys_data <- phys[[1]] |>
as_tibble() |>
filter(Attribute_value == TRUE) |> # only for discrete traits with multiple states
Expand Down Expand Up @@ -87,6 +90,22 @@ set_with_ids <- phys_data |>
) |>
ungroup() |>
distinct()
w <- NULL
withCallingHandlers(
warning = function(cnd) w <<- cnd,
set_with_ids$Rank <- taxizedb::taxid2rank(set_with_ids$NCBI_ID, db = 'ncbi')
)
taxids_with_missing_rank <- unlist(
strsplit(sub('^.*: (.*$)', '\\1', w$message), ', ')
)
no_longer_missing_ranks <- unlist(
taxize::tax_rank(taxids_with_missing_rank, db = 'ncbi')
)
for (i in seq_along(no_longer_missing_ranks)) {
pos <- set_with_ids$NCBI_ID == names(no_longer_missing_ranks)[i]
set_with_ids$Rank[pos] <- no_longer_missing_ranks[i]
}
dim(set_with_ids)
```

Expand All @@ -104,6 +123,21 @@ if (nrow(set_with_ids) > 0) {
set_without_ids <- set_without_ids |>
filter(!NCBI_ID %in% unique(set_with_ids$NCBI_ID))
}
w2 <- NULL
withCallingHandlers(
warning = function(cnd) w2 <<- cnd,
set_without_ids$Rank <- taxizedb::taxid2rank(set_without_ids$NCBI_ID, db = 'ncbi')
)
taxids_with_missing_rank2 <- unlist(
strsplit(sub('^.*: (.*$)', '\\1', w2$message), ', ')
)
no_longer_missing_ranks2 <- unlist(
taxize::tax_rank(taxids_with_missing_rank2, db = 'ncbi')
)
for (i in seq_along(no_longer_missing_ranks2)) {
pos2 <- set_without_ids$NCBI_ID == names(no_longer_missing_ranks2)[i]
set_without_ids$Rank[pos2] <- no_longer_missing_ranks2[i]
}
dim(set_without_ids)
```

Expand Down Expand Up @@ -150,7 +184,7 @@ ncbi_tree_nodes <- unname(ncbi_tree$Get('name'))
print(ncbi_tree, limit = 10)
```

### Phylogenecti tree
### Phylogenectic tree

```{r}
ltp <- ltp()
Expand Down Expand Up @@ -210,16 +244,13 @@ ncbi_tree$Do(function(node) {
ncbi_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$s__562$attribute_tbl
```



```{r}
ncbi_attr_gst <- ncbi_tree$Get(
'attribute_tbl', filterFun = function(node) grepl('^[gst]__', node$name)
)
mean(!map_lgl(ncbi_attr_gst, ~ all(is.na(.x))))
```


Removing annotations just to compare with result after propagation

```{r}
Expand Down Expand Up @@ -306,18 +337,18 @@ mean(!map_lgl(ncbi_attr_gst, ~ all(is.na(.x))))


Child taxa of E. coli with unobserved annotations can inherit these
values directly:
values with a slight penalty:

```{r}
## Inheritance - pre-order
inh1 <- function(node) {
inh1 <- function(node, adjF = 0.1) {
if (node$isRoot)
return(NULL)
if (is.null(node$parent$attribute_tbl))
return(NULL)
if (is.null(node$attribute_tbl) && grepl('^[st]__', node$name)) {
df <- node$parent$attribute_tbl
n <- nrow(tbl)
n <- nrow(df)
df <- df |>
dplyr::mutate(
target_scores = rep(1 / n, n),
Expand Down Expand Up @@ -351,55 +382,49 @@ ncbi_attr_gst <- ncbi_tree$Get(
mean(!map_lgl(ncbi_attr_gst, ~ all(is.na(.x))))
```

The step above actually covered must of the genus, species, and strains,
but this is one of the datasets with the most annotations, so it's expected
that most of the taxa would already be covered.

A formal ASR method would be nice, however.










Get new attributes:

```{r}
new_tip_annotations <- new_data_tree_nodes |>
new_attributes <- ncbi_attr_gst |>
discard(~ all(is.na(.x))) |>
bind_rows() |>
arrange(NCBI_ID, Attribute) |>
mutate(taxid = sub('^\\w__', '', NCBI_ID))
mean(tip_data$taxid %in% new_tip_annotations$taxid)
filter(!NCBI_ID %in% phys_data_ready$NCBI_ID) |>
mutate(taxid = sub('^\\w__', '', NCBI_ID)) |>
bind_rows(phys_data_ready)
table(new_attributes$Evidence)
```

Total number of tips with annotations is about 80%
Now almost 80% of the tips have annoations:

```{r}
tip <- left_join(
mpa_tip_data, new_data, by = 'taxid', relationship = 'many-to-many'
)
tip |>
filter(!is.na(Attribute)) |>
dim()
mean(tip_data$taxid %in% new_attributes$taxid)
```


```{r}
table(tip$Evidence)
new_tip_data <- left_join(tip_data, new_attributes, by = 'taxid')
table(new_tip_data$Evidence, useNA = 'always')
```


Let's create an input matrix for ASR with the phylogenetic tree

```{r}
mpa_tree <- ltp()
m1 <- tip |>
m1 <- new_tip_data |>
select(tip_label, Attribute, Score) |>
filter(!is.na(Attribute)) |>
# complete(NCB_ID, Attribute, fill = list(Score = 0)) |>
pivot_wider(
names_from = Attribute, values_from = 'Score', values_fill = 0
) |>
tibble::column_to_rownames(var = 'tip_label') |>
as.matrix()
names_from = 'Attribute', values_from = 'Score'
)
# tibble::column_to_rownames(var = 'tip_label') |>
# as.matrix()
no_annotated_tips <- tip |>
filter(!tip_label %in% rownames(m1)) |>
Expand Down

0 comments on commit 3d330d8

Please sign in to comment.