-
Notifications
You must be signed in to change notification settings - Fork 0
/
bmc_article.tex
948 lines (794 loc) · 71.2 KB
/
bmc_article.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
%% BioMed_Central_Tex_Template_v1.06
%% %
% bmc_article.tex ver: 1.06 %
% %
%%IMPORTANT: do not delete the first line of this template
%%It must be present to enable the BMC Submission system to
%%recognise this template!!
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% LaTeX template for BioMed Central %%
%% journal article submissions %%
%% %%
%% <8 June 2012> %%
%% %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% For instructions on how to fill out this Tex template %%
%% document please refer to Readme.html and the instructions for %%
%% authors page on the biomed central website %%
%% http://www.biomedcentral.com/info/authors/ %%
%% %%
%% Please do not use \input{...} to include other tex files. %%
%% Submit your LaTeX manuscript as one .tex document. %%
%% %%
%% All additional figures and files should be attached %%
%% separately and not embedded in the \TeX\ document itself. %%
%% %%
%% BioMed Central currently use the MikTex distribution of %%
%% TeX for Windows) of TeX and LaTeX. This is available from %%
%% http://www.miktex.org %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% additional documentclass options:
% [doublespacing]
% [linenumbers] - put the line numbers on margins
%%% loading packages, author definitions
%\documentclass[twocolumn]{bmcart}% uncomment this for twocolumn layout and comment line below
\documentclass{bmcart}
%%% Load packages
%\usepackage{amsthm,amsmath}
%\RequirePackage{natbib}
%\RequirePackage[authoryear]{natbib}% uncomment this for author-year bibliography
%\RequirePackage{hyperref}
\usepackage[utf8]{inputenc} %unicode support
%\usepackage[applemac]{inputenc} %applemac support if unicode package fails
%\usepackage[latin1]{inputenc} %UNIX support if unicode package fails
\usepackage{graphicx}
\usepackage{xspace}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% If you wish to display your graphics for %%
%% your own use using includegraphic or %%
%% includegraphics, then comment out the %%
%% following two lines of code. %%
%% NB: These line *must* be included when %%
%% submitting to BMC. %%
%% All figure files must be submitted as %%
%% separate graphics through the BMC %%
%% submission process, not included in the %%
%% submitted article. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\def\includegraphic{}
%\def\includegraphics{}
%%% Put your definitions there:
\startlocaldefs
\def\OpBook {Open-Book view\xspace}
\def\ExpView {Exploded view\xspace}
\def\MatView {Matrix view\xspace}
\def\CoZoListView {Contact-Zone list-view\xspace}
\def\CoZoList{Contact-Zone list\xspace}
\def\CoZoLists{Contact-Zone lists\xspace}
\endlocaldefs
%%% Begin ...
\begin{document}
%%% Start of article front matter
\begin{frontmatter}
\begin{fmbox}
\dochead{Research}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the title of your article here %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{COZOID: COntact ZOne IDentifier for Visual Analysis of Protein-Protein Interactions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the authors here %%
%% %%
%% Specify information, if available, %%
%% in the form: %%
%% <key>={<id1>,<id2>} %%
%% <key>= %%
%% Comment or delete the keys which are %%
%% not used. Repeat \author command as much %%
%% as required. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\author[
addressref={aff1},
email={[email protected]}
]{\inits{KF}\fnm{Katar\'{i}na} \snm{Furmanov\'{a}}}
\author[
addressref={aff2}, % id's of addresses, e.g. {aff1,aff2}
email={[email protected]} % email address
]{\inits{JB}\fnm{Jan} \snm{By\v{s}ka}}
\author[
addressref={aff3}, % id's of addresses, e.g. {aff1,aff2}
email={[email protected]} % email address
]{\inits{EMG}\fnm{Eduard M} \snm{Gr\"{o}ller}}
\author[
addressref={aff3}, % id's of addresses, e.g. {aff1,aff2}
email={[email protected]} % email address
]{\inits{IV}\fnm{Ivan} \snm{Viola}}
\author[
addressref={aff4, aff5}, % id's of addresses, e.g. {aff1,aff2}
email={[email protected]} % email address
]{\inits{JJP}\fnm{Jan J} \snm{Pale\v{c}ek}}
\author[
addressref={aff1}, % id's of addresses, e.g. {aff1,aff2}
corref={aff1},
email={[email protected]} % email address
]{\inits{BK}\fnm{Barbora} \snm{Kozl\'{i}kov\'{a}}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the authors' addresses here %%
%% %%
%% Repeat \address commands as much as %%
%% required. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\address[id=aff1]{% % unique id
\orgname{[email protected], [email protected], Faculty of Informatics, Masaryk University}, % university, etc
% \street{\v{Z}erot\'{i}novo n\'{a}m\v{e}st\'{i}}, %
%\postcode{} % post or zip code
\city{Brno}, % city
\cny{Czech Republic} % country
}
\address[id=aff2]{%
\orgname{[email protected], Department of Informatics, University of Bergen},
%\street{D\"{u}sternbrooker Weg 20},
%\postcode{24105}
\city{Bergen},
\cny{Norway}
}
\address[id=aff3]{%
\orgname{[email protected], [email protected], Institute of Visual Computing \& Human-Centered Technology, TU Wien},
%\street{D\"{u}sternbrooker Weg 20},
%\postcode{24105}
\city{Wien},
\cny{Austria}
}
\address[id=aff4]{% % unique id
\orgname{[email protected], National Centre for Biomolecular Research, Masaryk University}, % university, etc
% \street{\v{Z}erot\'{i}novo n\'{a}m\v{e}st\'{i}}, %
%\postcode{} % post or zip code
\city{Brno}, % city
\cny{Czech Republic} % country
}
\address[id=aff5]{% % unique id
\orgname{[email protected], Central European Institute of Technology, Masaryk University}, % university, etc
% \street{\v{Z}erot\'{i}novo n\'{a}m\v{e}st\'{i}}, %
%\postcode{} % post or zip code
\city{Brno}, % city
\cny{Czech Republic} % country
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter short notes here %%
%% %%
%% Short notes will be after addresses %%
%% on first page. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{artnotes}
%\note{Sample of title note} % note to the article
%\note[id=n1]{Equal contributor} % note, connected to author
%\end{artnotes}
\end{fmbox}% comment this for two column layout
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% The Abstract begins here %%
%% %%
%% Please refer to the Instructions for %%
%% authors on http://www.biomedcentral.com %%
%% and include the section headings %%
%% accordingly for your article type. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstractbox}
\begin{abstract} % abstract
\parttitle{Background} %if any
Studying the patterns of protein-protein interactions (PPIs) is fundamental for understanding the structure and function of protein complexes.
The exploration of the vast space of possible mutual configurations of interacting proteins and their contact zones is very time consuming and requires proteomic expert knowledge.
\parttitle{Results}
In this paper, we propose a novel tool containing a set of visual abstraction techniques for the guided exploration of PPI configuration space.
It helps proteomic experts to select the most relevant configurations and explore their contact zones at different levels of detail.
The system integrates a set of methods that follow and support the workflow of proteomics experts.
The first visual abstraction method, the Matrix view, is based on customized interactive heat maps and provides the users with an overview of all possible residue-residue contacts in all PPI configurations and their interactive filtering.
In this step, the user can traverse all input PPI configurations and obtain an overview of their interacting amino acids.
Then, the models containing a particular pair of interacting amino acids can be selectively picked and traversed.
Detailed information on the individual amino acids in the contact zones and their properties is presented in the Contact-Zone list-view.
The list-view provides a comparative tool to rank the best models based on the similarity of their contacts to the template-structure contacts.
All these techniques are interactively linked with other proposed methods, the Exploded view and the Open-Book view, which represent individual configurations in three-dimensional space.
These representations solve the high overlap problem associated with many configurations.
Using these views, the structural alignment of the best models can also be visually confirmed.
\parttitle{Conclusions}
We developed a system for the exploration of large sets of protein-protein complexes in a fast and intuitive way.
The usefulness of our system has been tested and verified on several docking structures covering the three major types of PPIs, including coiled-coil, pocket-string, and surface-surface interactions.
Our case studies prove that our tool helps to analyse and filter protein-protein complexes in a fraction of the time compared to using previously available techniques.
\end{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% The keywords begin here %%
%% %%
%% Put each keyword in separate \kwd{}. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{keyword}
\kwd{protein-protein interaction}
\kwd{contact zone}
\kwd{visualization}
\end{keyword}
% MSC classifications codes, if any
%\begin{keyword}[class=AMS]
%\kwd[Primary ]{}
%\kwd{}
%\kwd[; secondary ]{}
%\end{keyword}
\end{abstractbox}
%
%\end{fmbox}% uncomment this for twocolumn layout
\end{frontmatter}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% The Main Body begins here %%
%% %%
%% Please refer to the instructions for %%
%% authors on: %%
%% http://www.biomedcentral.com/info/authors%%
%% and include the section headings %%
%% accordingly for your article type. %%
%% %%
%% See the Results and Discussion section %%
%% for details on how to create sub-sections%%
%% %%
%% use \cite{...} to cite references %%
%% \cite{koon} and %%
%% \cite{oreg,khar,zvai,xjon,schn,pond} %%
%% \nocite{smith,marg,hunn,advi,koha,mouse}%%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%% start of article main body
% <put your article body there>
%%%%%%%%%%%%%%%%
%% Background %%
%%
\section*{Background}
Understanding the constitution and biological function of proteins is essential in many research disciplines, such as medicine and pharmaceutics.
Most of the proteins critical for cellular life act in a cooperative manner, forming multiprotein complexes.
It is estimated that approximately 800 complexes exist in just one yeast cell~\cite{Gavin}.
All complexes are composed of subunits, which constitute the complex via mutual protein-protein interactions (PPIs).
The main goal of studying these PPIs, known as protein-protein docking, is to identify the appropriate spatial configuration of the interacting proteins.
This configuration is represented by the mutual spatial orientation of the interacting proteins.
Each configuration contains a contact zone, consisting of the set of amino acids from both interacting proteins that are within interaction distance, usually spanning from 3 to 5 \AA ngstr\"{o}ms.
The structure determination of PPIs in laboratories is very challenging, as well as expensive and time-consuming.
This is due to many problems related to the dynamic nature of proteins, difficulties in their purification and sample preparation.
Therefore, computational docking is often used to study the feasibility of proposed configurations.
Many algorithms and tools for examining these configurations have appeared in recent years.
A categorization of the existing algorithms, along with a description of their basic principles, was published recently by Huang~\cite{Huang2014}.
However, these algorithms produce a large number of possible configurations, which need to be explored to identify the proteomically most relevant ones.
Even though the computational tools usually provide the users with some score to rank the configurations, the resulting ordering does not necessarily correspond to their proteomic relevance.
Therefore, the configurations have to be processed and examined manually, which requires a proper visual support to enhance the exploration process.
Even for the comparison of two configurations, a traditional overlay representation suffers from many occlusion problems and it is hard to perceive the differences between individual solutions.
When comparing more configurations, even without a detailed visualization of the hot spot amino acids, the problem becomes even more apparent (Figure~\ref{fig:problem}).
\subsection*{Related Work}
As the selection of the most proteomically relevant PPI configurations is a very challenging task, several algorithms have already been published for re-ranking the configurations according to different criteria.
They suggest a subset of configurations that should be explored in detail.
As a representative of these attempts, Malhotra et al.~\cite{Malhotra2015} presented DockScore, a web server for ranking the individual configurations produced by docking tools.
Their idea is based on building a scoring scheme that considers several interface parameters, such as the surface area, hydrophobicity, spatial clustering, etc.
This helps the user to reduce the number of configurations to a smaller set, which still has to be explored manually.
For this exploration, a visual support is essential, as it enables the user to see the spatial orientation of the contact zones and to compare different configurations.
However, DockScore provides only a rudimentary visual representation of the top five configurations, which is insufficient for the proper exploration of the configuration space.
Finding a proper visual representation of PPIs can be approached from different perspectives.
One category consists of spatial techniques visualizing the contact zones and their interacting amino acids.
The spatial techniques have to address the problem of occlusion and visual clutter caused by the fact that the most interesting parts of interacting proteins, the contact zones, are facing each other inside the configuration.
Without transformations or visual enhancements (e.g., through transparency), it is impossible to visually explore the contact zones.
Jin et al.~\cite{Jin2014} presented an open-book view where the interacting proteins are rotated to orient the contact zones towards the camera.
The problem with the presented solution lies mainly in the missing information about the interacting amino acids and the unified coloring of the contact zones.
An alternative approach presented by Lee and Varshney~\cite{Varshney2003} computes and visualizes the intermolecular negative volume and the area of the docking site.
This way the users can observe the volume between the interacting proteins without the need to display the contact zones themselves.
This can serve proteomic experts as an interactive tool for studying possible docking configurations, but it does not support their comparison.
Similar approaches suggest the construction of an interface surface between the interacting proteins~\cite{480793, Ban2006}.
The surface is visualized as a 3D mesh, encoding the information about the core and peripheral regions from the interface.
However, this method also does not support the comparison of multiple configurations.
Two-dimensional abstract representations are also commonly used for the visualization of contact zones, such as the schematic representation used by the PDBsum database~\cite{pdbsum} (Figure~\ref{fig:pdbsum}).
In the overview visualization, each of the interacting proteins is represented by a circle equipped with information about the number of amino acids forming the contact zones and the number of different types of interactions in-between (e.g., salt bridges, disulphide bonds, hydrogen bonds, or non-bonded contacts).
The detailed visualization in PDBsum lists all the contact zone amino acids.
The interactions are visualized by lines of different color and thickness, which represent the type and strength of the interactions, respectively.
This approach gives a comprehensible overview of one configuration, but comparing it with another configuration is not possible.
Lex et al.~\cite{Lex2012} proposed a visual analysis tool for the exploration of large scale heterogeneous genomics data for the characterization of cancer subtypes.
They use multiple views of the complex data, and one of them is a method for the comparison of different datasets.
The abstract representation shows the similarities in the datasets by connecting corresponding blocks of data.
The thickness of a connection denotes the degree of similarity.
This representation serves well for comparison, but it lacks detailed information about the individual items.
In this paper, we present a systemic tool, COZOID, comprised of a set of methods for the visualization, comparison, and selection of numerous docking configurations.
The combination of our proposed methods eliminates the problems associated with the existing solutions and provides proteomic experts with an intuitive and user-friendly tool for the interactive exploration of PPIs.
Our tool is integrated into the CAVER Analyst software~\cite{kozlikova2014caver}, which allows for the analysis and visualization of biomolecules, and therefore, contains many relevant features, such as different molecular visualization modes, measurement tools, etc.
The input PPI configurations are provided by the existing computational tools and our solution is designed for dealing specifically with a large number of configurations.
\section*{Methods}
\subsection*{COZOID Overview}
Our newly proposed system enables the efficient visual exploration of a large number of PPI complexes.
For a better understanding, we introduced the following notation.
A protein $P$ consists of a set of amino acids forming a polypeptidic chain.
A complex $C$ is represented by a set of mutually interacting proteins.
In our case, we focus primarily on the interactions between two protein structures $P_1$ and $P_2$, which form a complex $C(P_1,P_2)$.
The mutual spatial orientation of the interacting proteins in the complex forms a configuration.
The $i$-th configuration of complex $C(P_1,P_2)$, denoted as $CONF_i(C(P_1,P_2))$, represents one of the possible mutual orientations of this complex.
Generally, there can be $n$ ($1 \leq i \leq n$) possible configurations for a given complex, and the task is to select the configuration that is the most relevant one from a proteomics point of view.
The decision is based on various pieces of knowledge about the geometric arrangement of the configuration as well as other aspects, such as knowledge of the contacts between the amino acids present in the contact zone of the given configuration.
Therefore, the selection of the most relevant configurations cannot be completed automatically and requires insights from the proteomic expert.
This represents a typical domain-related problem, which has to be supported by specifically designed visualizations.
The visualization methods proposed in this paper allow the user to visually explore a set of possible configurations detected by one of the existing computational tools and to select the most proteomically relevant ones.
The users have to iteratively filter out those configurations that do not fulfill the given specific criteria.
The proteomic expert workflow, along with our proposed visual support of its individual stages, is depicted in Figure~\ref{fig:workflow}.
The input datasets, consisting of dozens of configurations between two interacting proteins, were computed using the HADDOCK~\cite{haddock} and PyDock~\cite{pydock} tools.
However, any of the existing tools for protein-protein docking can serve as a source of input data for our system.
The proposed visualizations are based on the precondition that the users already have initial knowledge about the interacting proteins.
Thus, the experts are able to define a pair of amino acids that are expected to interact.
This is not restrictive, as computational tools also require this information to produce a meaningful set of configurations.
In other words, we are using similar input information as the computational tools.
The second possibility is that the users do not have this information but are aware of an already explored protein complex with a similar structure that can serve as a reference (primary) complex for further comparison and exploration.
In this case, the computational tools usually produce even more configurations, but most of them are irrelevant and have to be filtered out.
Our tool can utilize the information about the interactions in the primary complex and enhance the filtering process.
Our methods have been designed specifically to help proteomic experts answer the following questions:
\begin{itemize}
\item Q1: Which configurations contain a selected interacting pair of amino acids (and what is the frequency of the occurrence of this pair in all configurations)?
\item Q2: Which pairs of amino acids are present in a given configuration?
\item Q3: How close are the amino acids in the contact zone and which are the closest ones?
\item Q4: How similar and different are the contact zones in the configurations?
\item Q5: What are the physico-chemical properties of the amino acids in the contact zone?
\item Q6: What are the differences between the sets of amino acids in the contact zones of different configurations?
\end{itemize}
Answering these questions helps the proteomic experts to better understand the interactions in the protein-protein complexes and to evaluate the correctness of the given configurations.
The proposed visualizations enable one to find the answers by interactively exploring the configurations, which is also demonstrated in the supplementary video (see Additional file 1).
In the following sections, we introduce our proposed views in detail.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection*{Matrix View}
\label{sec:matview}
When using a computational tool to generate possible configurations, the resulting set $S = \{CONF_i(C(P_1,P_2)); 1 \leq i \leq n\}$ can be very large, with $n$ ranging from dozens to hundreds.
This amount is impossible to explore manually; thus, some preliminary filtering is crucial.
The filtering stage is designed to answer question Q1.
We propose a matrix-based visualization inspired by commonly used heat maps (Figure~\ref{fig:matrixlens}a).
The rows and columns in the \MatView correspond to the interacting proteins $P_1$ and $P_2$, respectively.
Each row or column represents one amino acid present in a contact zone in some of the configurations $CONF_i(C(P_1,P_2))$.
The rows and columns are formed only by those amino acids from the interacting proteins that are in contact in at least one configuration.
The contact between the amino acids is based on their Euclidean distance.
Two amino acids are considered to be in contact if their distance is between 3 and 5 \AA.
This range can be interactively changed by the user.
The color of each cell in the matrix corresponds to the number of occurrences of the corresponding interacting amino acids in the set $S$ of all configurations.
The colored lists of amino acids can be interpreted as histograms, encoding the number of their occurrences.
The intense red color represents the pairs of amino acids that are interacting in most of the configurations.
The \MatView serves directly for filtering out improbable solutions using the interactive user-driven selection of cells.
The selection is performed by clicking on individual cells.
Moreover, the matrix allows the expert to select a combination of several pairs of amino acids.
This is useful if the user wants to further explore only those configurations that contain specific interactions, such as between the amino acid pair $A$, $B$ and simultaneously the pair $C$, $D$.
The big advantage of the \MatView is its independence from the size of the input set of possible configurations.
The number of rows and columns is limited by the size of the interacting proteins, meaning that in the worst case, it corresponds to the total number of amino acids in these proteins.
However, in most cases, the number of amino acids in the contact zones is much smaller than the total number of amino acids.
Each configuration of the input dataset then increases the counters in the respective matrix cells.
In the case of many interacting amino acids, the cells in the matrix can become too small.
In these situations, the users can employ the table lens technique introduced by Rao and Card~\cite{Rao1994}, which can be applied to both rows and columns in the matrix (Figure~\ref{fig:matrixlens}a).
To provide the users with more detailed information about individual configurations, the \MatView contains an additional side view, which is positioned directly next to the matrix (Figure~\ref{fig:matrixlens}b).
The user can select a primary configuration to which all the remaining configurations are compared.
An example of a primary configuration can be a crystal structure downloaded from the PDB database.
We propose the following ranking score, which indicates the similarity between the contact zone of a given configuration and the primary configuration.
One of the interacting proteins, e.g., $P_1$, is selected as a reference protein, while the second protein, e.g., $P_2$, is marked as the paired protein.
The score is computed in the following way.
\begin{itemize}
\item For each match of an amino acid in the contact zones from the reference proteins of the compared and the primary configuration, the similarity score is increased by one.
\item For each matching interaction pair in the contact zones from the compared and the primary configuration, the similarity score is increased by four.
\item For each missing interaction pair in the contact zones from the compared and the primary configuration, the similarity score is decreased by one.
\end{itemize}
This score was determined experimentally while designing and testing the view (see the Results section).
The central part of the side view consists of a scrollable list of individual configurations from a subset of $S$ that was filtered with the \MatView.
The configurations are ordered according to their similarity scores, from the most similar to the least similar ones.
The primary configuration is always displayed as the first one on the top of the list.
The side view helps to answer questions Q2 and Q3, as it enables an iterative search through the list of configurations and the exploration of all pairs of interacting amino acids for each configuration.
The user can select a configuration to focus on by clicking on it.
By default, each configuration in focus contains one polyline connecting two amino acids from the contact zone that are the closest among all the possible pairs (Figure~\ref{fig:matrixlens}b).
The user can hover the mouse over the lists of amino acids on the left and right side and inspect the corresponding connection lines for a given amino acid.
By clicking on the rectangle representing a given amino acid, the connection lines remain in the view.
The pairs of amino acids that form the configuration in focus can be highlighted in the matrix (with green border rectangles in Figure~\ref{fig:matrixlens}a).
From the color of the matrix cells, the user can immediately estimate the number of configurations in which these pairs are present.
Vice versa, by interacting with the matrix and selecting the given rectangles, the side view is automatically filtered to show only those configurations that satisfy the filtering condition.
The \MatView serves as the first filtration tool for selecting only those configurations that contain a desired combination of interacting amino acids.
This filtering cannot be automated because the frequency of a given pair in configurations does not correlate with the importance of these configurations.
The most frequent pair of interacting amino acids can be of the same interest as a pair interacting only in one configuration.
Therefore, insights from the proteomic expert in combination with the interaction possibilities from the \MatView have proven to be a very efficient and powerful solution.
Selected configurations can be further processed by the following visualization methods.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection*{Exploded View}
The proteomics experts are already familiar with the manipulation of molecules in a three-dimensional (3D) environment; thus, a 3D representation has to be an integral part of the workflow.
Moreover, the 3D space helps to find answers for questions Q3--Q5, which are related to the appearance of the contact zones of selected configurations and the properties of interacting amino acids (expressed by different coloring schemes).
Exploring and comparing many structures in 3D at once suffers from problems such as high overlap, occlusion, and visual clutter (Figure~\ref{fig:case12}b).
Traditionally used spatial representations are not sufficient.
To overcome these limitations, we adapted an exploded-view technique to enlarge the distance between the interacting proteins.
Figure~\ref{fig:case12}c shows the comparison of three configurations using our proposed \ExpView.
The main principle of the \ExpView is the following.
First, all the reference proteins taken from the configurations selected in the \MatView are aligned using the Combinatorial Extension (CE) structural-alignment algorithm~\cite{Shindyalov1998} so that their 3D spatial representations overlap (Figure~\ref{fig:case12}).
Here, it is important to understand that the reference protein shown in Figure~\ref{fig:case12}b (the brown one) actually represents three overlapping aligned reference proteins, each coming from one configuration.
The set of paired proteins interacting with the reference proteins is positioned around the aligned reference proteins with an enlarged distance.
To ensure that the paired proteins in the Exploded view will not collide with each other, we arrange the paired proteins into a parabolic regular grid.
For each reference protein and its paired protein, the \ExpView retains the information about their interaction.
If several configurations are exploded at once, the \ExpView contains many paired proteins arranged around the aligned reference proteins.
As the change in the position of the exploded proteins can cause disorientation in the scene, the pairing information between the corresponding reference proteins (aligned) and paired proteins (``exploded'') is initially indicated as a partially transparent tube that connects the centers of their contact zones.
The radius of the tube is modulated (it is smaller in the middle of the tube to reduce the visual clutter).
Once the user understands the overview of the protein spatial arrangement, the tube can be switched off.
The pairing information is also encoded by color (a different color is used for each configuration).
If the contact zones contain colliding amino acids (i.e., their mutual distance is less than 3~\AA), the residues are indicated by a red color.
Figure~\ref{fig:case12} depicts a set of three configurations before (a, b) and after (c) applying the \ExpView.
The Exploded view removes the problem of overlapping paired proteins.
It also helps to see the shape and position of the contact zones.
However, this solution does not solve the problem where the contact zones face each other, meaning that the user has to adjust the camera to observe the contact zones of the reference and paired proteins from a perpendicular viewing direction.
This manipulation does not enable the user to see both contact zones simultaneously.
This problem is solved by the proposed \OpBook, which is presented in the following section.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection*{Open-Book View}
The \ExpView does not allow one to observe both parts of a given contact zone simultaneously.
The proposed \OpBook is designed to specifically answer questions similar to Q5, which addresses a detailed exploration of one selected contact zone in the complex $C(P_1,P_2)$.
This involves the presentation of the information about different properties of individual amino acids forming the contact zone and their pairing.
The \OpBook is activated if the user selects one of the configurations from the \ExpView.
The selection is performed by clicking on the connection tube from the desired configuration $CONF_i(C(P_1,P_2))$ in the \ExpView.
The other configurations are automatically hidden, the selected configuration returns to its initial position (before applying the \ExpView), and an animated transition for the opening of $CONF_i(C(P_1,P_2))$ is launched.
When animating the opening, the reference and paired proteins are rotated and translated so that they are positioned next to each other and the contact zones are facing towards the observer (see Figure~\ref{fig:book}).
The algorithm performing the opening computes the vectors defining the orientation of the contact zones (their normal vectors).
From the normal vectors and the camera position, we compute the rotation angle, which is then applied to the reference and paired protein.
To maintain the information about the amino acid pairings, the user can also visualize individual connections between these pairs through simple lines.
The contact zones represented by their surfaces can be color-coded according to multiple criteria.
The color can encode the distance between the amino acids or represent different physico-chemical properties of the amino acids or their atoms, such as hydrophobicity or partial charges.
The coloring scheme used in the \MatView represents the so-called conservation of the amino acids in all configurations.
It can also be used to color the contact zone.
The surfaces can be augmented with labels to inform the users about the type and identifier of individual amino acids.
In both the \ExpView and the \OpBook, a protein can also be represented by other traditionally used visualization styles, such as cartoon, spheres, balls\&sticks, sticks, etc.
Moreover, these methods can be combined.
For example, the proteins can be represented by the cartoon style and the amino acids in the contact zones can be visualized using the sticks representation to see their spatial orientation.
If the task is to compare individual configurations with respect to the pairs of interacting amino acids, a further drill-down is necessary.
Therefore, in the next section, we propose another abstract view supporting mainly the comparison of paired amino acids in individual contact zones from selected configurations.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection*{Contact-Zone List-View}
The \CoZoListView helps to answer questions related to the comparison of the contact zones at the level of the individual amino acids, such as in Q6.
The list for one configuration consists of two sets of amino acids in the contact zones, each set coming from one interacting protein (see Figure~\ref{fig:list}).
The left part of the view contains all amino acids coming by default from the reference protein, while the right part is formed by their interaction counterparts in the paired protein.
However, the order of proteins in the list-view can be changed.
The order depends on the current task, i.e., if we want to compare the constitution of contact zones from the reference or the paired protein in the given configurations.
The view contains all possible connections (with respect to the distance) between the amino acids from both contact zones.
To avoid the intersection of lines representing the connections, some amino acids on the right side are repeated -- one instance for each reference protein amino acid within a user-defined distance.
This solution was adopted because without these repetitions, there would be many line intersections, which substantially decreases the readability of the representation (see Figure~\ref{fig:pdbsum}b).
For each configuration, one list-view is created and all the list-views are juxtaposed so that the user can see and visually compare the constitution of the contact zones from all selected configurations.
The user can modify this representation by changing the color, which can encode different properties for the amino acids mapped onto their corresponding rectangles.
The properties are the same as those mapped onto the surface of the contact zone in the Exploded and Open-Book views.
The left part of the list can then be sorted according to these properties (see Figure~\ref{fig:sorting}).
Moreover, by clicking on individual rectangles representing the amino acids, the corresponding amino acids are selected in the 3D view as well.
The principal steps for building the \CoZoListView are the following.
For all configurations, which should be visualized in the \CoZoListView, we find the interacting pairs of amino acids in their contact zones.
Then, the list of amino acids present in all reference proteins from the selected configurations is created.
Now, for each configuration, we take the interacting amino acids from the paired proteins, sort them according to a selected criterion (e.g., hydrophobicity), and add them to the \CoZoListView.
The amino acids in the left part of the \CoZoListView are always sorted in the same way for all depicted configurations.
Similar to the \MatView, the user can select a primary configuration to which all the remaining configurations are compared (see Figure~\ref{fig:list}b) using the proposed ranking score algorithm, which is described in the Matrix View section.
The \CoZoList plots the configurations ordered from left to right by the similarity score from the most similar to the least similar.
The \CoZoListView of the primary configuration is always displayed as the first one from the left side of the view.
The user can select between two visualization modes -- the \textit{compare} and the \textit{compact} list-view.
In \textit{compare} mode, the amino acids in the contact zone in the primary configuration that are not present in the contact zone from any other configuration are depicted as white rectangles with labels giving the names of the missing amino acids (see Figure~\ref{fig:list}b).
The \textit{compact} mode omits these missing amino acids to save space.
In both modes, the matches between the amino acids in the primary configuration and those in the other configurations are highlighted with red bordered rectangles and connecting lines.
This way, the user can immediately see which amino acids are present in both the primary configuration and the other configurations and which amino acids are missing.
To guide the visual comparison, we also introduced interactive highlighting and, if necessary, zooming to corresponding amino acids in different configurations.
\section*{Results and Discussion}
To demonstrate the usability of our proposed techniques, we selected three representative basic types of PPI patterns present in SMC complexes~\cite{Palecek2015}.
SMC (Structural Maintenance of Chromosomes) complexes are the key players in chromatin organization where they ensure the stability and dynamics of chromosomes. The way the subunits of these complexes interact with each other is key for their functions~\cite{Gligoris}.
A visual representation of such information is highly beneficial as it helps to reveal the spatial relationships between the subunits in an intuitive way.
The three basic PPI types are coiled-coil, pocket-string, and surface-surface interactions~\cite{alberts02molecular}.
In the following subsections, we demonstrate the usefulness of our proposed visualizations on these three types of interactions.
\subsection*{Surface-Surface Interaction}
The most frequent surface-surface interaction type was tested on the NSE1 and NSE3 proteins in the SMC5/6 complex.
This interaction has been analysed as it represents a dimer of kite proteins, which are critical for the function of eukaryotic SMC5/6 and bacterial SMC complexes~\cite{Zabrady2016,Palecek2015,Doyle2010}.
The crystal structure of the human NSE1-NSE3 dimer was already examined in detail and the resulting configuration is already published in the PDBsum database under the PDB identifier 3NW0.
Therefore, it can serve as a primary testing complex both for the computational tools and for our proposed visualizations.
To restrict the set of possible docking configurations, we selected the web version of the HADDOCK tool and a pair of interacting amino acids, i.e., methionine with ID 23 from the reference protein and leucine with ID 97 from the paired protein (Figure~\ref{fig:pdbsum}b).
This selection was based on experimental data from previous works~\cite{Doyle2010,Hudson2011,Kozakova,Crabben}.
The HADDOCK analysis resulted in 40 possible configurations.
HADDOCK groups the configurations into clusters, according to their similarity, which is defined internally by the HADDOCK score.
In our case, it led to 10 clusters each containing 4 configurations.
The computed configurations were loaded into our COZOID visualization system, which interactively links all the proposed visualizations.
From these configurations, the \MatView was computed first, which contains the frequencies of all the pairs of amino acids within the interaction distance within these 40 configurations.
The matrix identified configurations containing pairs of interacting amino acids with interaction distances smaller than 4 \AA.
In our particular case, the leucine 97 and methionine 23 amino acids were within this interaction distance in only three configurations out of the initial 40 (Figure~\ref{fig:matrixlens}).
The Matrix view helped to filter these immediately through a simple interaction with the view.
The remaining 37 configurations were automatically hidden in the remaining views.
In the next step, we switched to the \CoZoListView and compared the list of amino acids from the 3NW0 crystal structure with the lists of all three selected configurations.
Figure~\ref{fig:case3} shows the comparison between the 3NW0 structure and the three selected HADDOCK configurations.
From the given portion of the \CoZoListView, the similarities and differences between the 3NW0 crystal (in the leftmost list) and the three selected HADDOCK configurations at the level of the individual amino acids are clearly visible.
Additionally, the pairs of the interacting amino acids identical to the 3NW0 crystal structure are highlighted (red lines in Figure~\ref{fig:case3}).
The left-to-right order of the modelled configurations in Figure~\ref{fig:case3} reflects their similarity to the primary crystal structure, based on the number of identical pairs of amino acids (the best model is next to the crystal).
Finally, the 3NW0 crystal and three selected configurations were explored using the 3D representations with the aim of exploring the constitution, mutual distances, and properties of the contact zones in detail.
In 3NW0, the first NSE1 interacting protein was selected as the reference protein and all three configurations were aligned with respect to the paired proteins.
The paired proteins were positioned around the reference one.
Figure~\ref{fig:case12}a shows the situation where the three selected configurations are visualized using a commonly available method.
The configurations are represented as surfaces and the contact zones are highlighted using different colors.
However, the most interesting parts, i.e., the contact zones, are hidden (Figure~\ref{fig:case12}b).
Our \ExpView overcomes this limitation so the individual contact zones from all the paired proteins are clearly visible (Figure~\ref{fig:case12}c).
Moreover, if we point the camera towards the aligned reference proteins, the differences between the positions in the contact zones in the reference proteins can be observed as well.
The \ExpView representation gave us the information about the mutual positioning of the individual configurations with respect to the positions of the contact zones.
Using our tool, the investigation can go even deeper to the level where individual contact zones can be explored in detail using the \OpBook.
By animating the opening of the protein complex, we were able to look inside the contact zone.
The \OpBook enhancements, i.e., labelling the surface of the contact zones with the names of the corresponding amino acids and coloring them according to different properties, were highly beneficial for exploring the physico-chemical and geometric properties of the individual amino acids.
\subsection*{Coiled-Coil Interaction}
For the second type of interaction, we picked the SMC3 coiled-coil arm from the SMC complex~\cite{Gligoris}.
The interaction site is formed by two helical fragments from the SMC3 protein.
The primary structure is published under the PDB identifier 4UX3~\cite{pmid25414305}.
Using this structure, the results of both the HADDOCK and the PyDock tools were tested.
The HADDOCK results contained 40 output configurations.
Using the \MatView, we set the interaction distance threshold between 3 and 5~\AA~and selected methionine 186 and isoleucine 1030 as the initial pair of interacting amino acids (Figure~\ref{fig:coiled_haddock_mat}).
These amino acids were used as the input restraints for the HADDOCK computation as well.
These restraints were applied to select the correct configurations in the \MatView (Figure~\ref{fig:coiled_haddock_mat}).
Next, the selected configurations were structurally aligned to the primary 4UX3 structure in 3D space.
Afterwards, we selected the first amino acid (A172) within the respective helices and visually compared their positions in the 3D view.
In this case, it was not even necessary to use other views to see that the preselected HADDOCK configurations exhibited a wrong orientation of the aligned helices.
In all the output models, the A172 amino acids were located on the opposite side in comparison with the primary 4UX3 crystal (see Figure~\ref{fig:coiled_haddock}).
The 3D view from COZOID helped to reveal this misorientation intuitively and quickly, without a detailed exploration of the HADDOCK configurations one-by-one.
As for the PyDock results, 28 out of 100 output PyDock models were selected using the \MatView; the M186 and I1030 interaction pair was used to filter the results.
The visual selection (based on A172 position judgement) provided us with 14 models in the correct orientation (see Figure~\ref{fig:selection2SMC3PyDock}).
In the final step, we compared the \CoZoLists of the selected models with the original crystal structure (4UX3).
Figure~\ref{fig:coiled2} shows the similarities (highlighted in red) of one of the selected models to the crystal. It is the best model, and fits the crystal structure very well. The \ExpView comparison of the contact zone from the crystal structure and the selected model can be observed in Figure~\ref{fig:selection_4_final_SMC3_PyDock}.
\subsection*{Pocket-String Interaction}
For the pocket-string interaction type, we selected an interaction present in the crystal structure from the MukE-MukF complex (PDB identifier 3EUH)~\cite{Woo}.
The pocket is formed by the winged helix domain of the MukE protein, while one of the MukF helical fragments is sitting inside the MukE pocket (Figure~\ref{fig:MukEF_crystal_3EUH_selected}a).
This time, we selected valine 200 and arginine 300 as the pair of amino acids for the docking restraints.
These were the closest contact amino acids in the structure, as can be observed from the \CoZoList ordered by the distance of the interacting amino acids (see Figure~\ref{fig:list_pocket_string}), as well as from the \OpBook of the crystal structure (Figure~\ref{fig:MukEF_crystal_3EUH_selected}b).
The docking models were again generated with both HADDOCK and PyDock docking tools.
The HADDOCK run resulted in 32 output configurations, which were first scrutinized using the \MatView, using the initial V200-R300 amino acid pair.
This first selection step filtered away only 8 models, leaving 24 models for further analysis.
Then, we repeated the \MatView filtering using the second tightest amino acid contact in the crystal (tyrosine 110 and arginine 302) (Figure~\ref{fig:MukEF_crystal_3EUH_selected}b).
This filtration resulted in 6 docking models.
The \CoZoLists of these models were compared with the original crystal structure (3EUH), resulting in an ordered list of the best models (Figure~\ref{fig:list_pocket_string}).
The visual exploration confirmed that the first model from the \CoZoList fits best to the original structure (Figure~\ref{fig:MukEF_selection_3_best_pair}).
PyDock docking provided 100 models, which were analysed similarly to the HADDOCK models.
The selection steps with the \MatView, including the first filtration step with the initial amino acid pair and the filtering with the second amino acid pair, resulted in 32 and 19 models, respectively.
The \CoZoLists of these models were then compared with the original crystal structure.
The models most closely matching the original crystal structure, which were identified using the \CoZoList, were then visually explored in 3D using the \ExpView and the \OpBook.
This step revealed that the best five models from the list are very close to the original crystal, though none of them precisely fits the crystal structure.
Here, we took advantage of our testing setup (using the tightest contacts between the interacting amino acids) and altered the interaction distance parameter in the \MatView for the selection procedure.
All PyDock models were re-evaluated with the distance parameter set to 4 \AA~(compared to the previous 5 \AA~default parameter settings).
As expected, fewer configurations containing the V200-R300 and Y110-R302 amino acid pairs were found within the 4 \AA~distance (the \MatView selection steps resulted in 21 and 13 models, respectively).
However, the altered distance parameter also resulted in a different ranking of the configurations in the \CoZoLists.
Figure~\ref{fig:list_3euh} shows the comparison of the \CoZoLists for the 3EUH crystal structure computed with 5 \AA~and 4 \AA~distance parameter settings.
It can be seen that the decreased distance parameter eliminated several amino acid pairs with distance greater than 4~\AA~from the crystal structure \CoZoList.
The eliminated pairs were not considered in the new \CoZoList ranking, where five models, the most similar to the crystal, were once again selected (Figure~\ref{fig:pydock_pocket_string}a).
Four of these five models overlapped with the five best models detected with the previous system set-up; however, a new model with a closer match was also identified (Figure~\ref{fig:pydock_pocket_string}b).
This test indicates the robustness of our tool with different parameter settings and its potential for experimental use in proteomics.
Our tool can also be used to select an alternative input pair of interacting amino acids, which then serves as the input for the computational tools.
These amino acids might be selected based on the COZOID analysis of the 3NW0 crystal (using the \MatView or \ExpView) when searching for the most central and closest amino acids.
Altogether, COZOID helped us to quickly select the best docking configuration using several visualization approaches.
First, the \MatView allowed us to pick models containing a particular pair of interacting amino acids.
Next, with the \CoZoList, we sorted these models based on the similarity of their contact zones with the original crystal structure.
Using the 3D \ExpView, the best model was determined and confirmed.
While the Exploded view is already available in some of the current 3D visualization tools, the power of its combination with our other proposed approaches lies in the speed, user-friendly design, and highly interactive selection mechanism.
Additionally, a similar workflow can be applied for the selection of docking models from homologous proteins, which are not available in the PDB database, yet are often used when different model organisms are employed in proteomic studies.
For example, our \CoZoList can be used in the experimental design of mutants by replacing key contact residues.
This tool can be used by proteomic experts to select amino acids in the contact zones that could be mutated, i.e., replaced by other amino acids.
The ultimate goal of these mutations could be to strengthen the interactions in the contact zone or to completely destroy the interaction between the involved proteins.
\section*{Conclusions}
In this paper, we have presented COZOID, a new tool for the visual exploration of configurations of two interacting proteins.
It introduces a set of visualization methods for the exploration and evaluation of proteomic relevance of large sets of configurations detected with existing computational tools.
Our proposed methods were designed to follow and support the workflow followed by proteomic experts.
We described the design rationale and the principles of these methods, as well as their linking and interaction possibilities.
We tested these methods on real datasets of the SMC complex subunits and demonstrated their usability in three studies covering the most common interaction types.
Our aim was to overcome the drawbacks of the existing methods for visual analysis and comparison of configurations, which provide users with a traditional 3D view and the exploration of individual configurations one-by-one.
Additionally, specialized techniques enabling the exploration of the content of the contact zone are completely missing.
Therefore, our proposed solution provides proteomic experts with information that is very hard or even impossible to obtain using these previously available methods.
The system enables iterative filtering of the configurations that do not satisfy given criteria in the individual stages of the workflow.
The executable binary, the exemplary dataset, and the user guide are available in the supplementary material of the manuscript (Additional file 2, Additional file 3, and Additional file 4).
In the future, we plan to focus on the extension of our proposed techniques in cases where the user has no a priori knowledge about the protein complex, but can still feed in experimental data from mutagenesis or crosslink analysis.
%% if specified like this the section will be committed in review mode
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Backmatter begins here %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{List of Abbreviations}
\begin{backmatter}
PPI -- Protein-Protein Interaction \\
COZOID -- Contact ZOne IDentifier \\
SMC -- Structural Maintenance of Chromosomes \\
PDB -- Protein Data Bank \\
\end{backmatter}
\section*{Declarations}
\begin{backmatter}
\section*{Ethics approval and consent to participate}
Not applicable.
\section*{Consent for publication}
Not applicable.
\section*{Availability of data and materials}
Data and materials are available here: http://decibel.fi.muni.cz/cozoid/
\section*{Competing interests}
The authors declare that they have no competing interests.
\section*{Funding}
This work was supported through grants from the Vienna Science and Technology Fund (WWTF) through project VRG11-010, the PhysioIllustration research project 218023 funded by the Norwegian Research Council, the Ministry of Education, Youth and Sports of the Czech Republic project CEITEC 2020 (LQ1601), and an Internal Masaryk University grant (MU/0822/2015). This funding enabled the authors to design, implement, and evaluate the proposed system and to write the manuscript.
\section*{Authors' contributions}
KF participated in the design of the tool, its implementation, and paper editing. JB contributed to the implementation. IV and EMG contributed to the design of visualizations and interactions of the tool. JJP contributed to the design and was responsible for testing and evaluation of the tool and paper editing. BK contributed to the design and paper writing, and coordinated the team and its activities. All authors read and approved the final manuscript.
\section*{Acknowledgements}
We acknowledge the members of the Palecek lab for their participation in the COZOID testing.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% The Bibliography %%
%% %%
%% Bmc_mathpys.bst will be used to %%
%% create a .BBL file for submission. %%
%% After submission of the .TEX file, %%
%% you will be prompted to submit your .BBL file. %%
%% %%
%% %%
%% Note that the displayed Bibliography will not %%
%% necessarily be rendered by Latex exactly as specified %%
%% in the online Instructions for Authors. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% if your bibliography is in bibtex format, use those commands:
\bibliographystyle{bmc-mathphys} % Style BST file (bmc-mathphys, vancouver, spbasic).
\bibliography{bmc_article} % Bibliography file (usually '*.bib' )
% for author-year bibliography (bmc-mathphys or spbasic)
% a) write to bib file (bmc-mathphys only)
% @settings{label, options="nameyear"}
% b) uncomment next line
%\nocite{label}
% or include bibliography directly:
% \begin{thebibliography}
% \bibitem{b1}
% \end{thebibliography}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Figures %%
%% %%
%% NB: this is for captions and %%
%% Titles. All graphics must be %%
%% submitted separately and NOT %%
%% included in the Tex document %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%% Do not use \listoffigures as most will be included as separate files
\section*{Additional files}
\begin{itemize}
\item Additional file 1: Supplementary video -- video showcasing the software tool in action.
\item Additional file 2: Software build -- executable binary file of the software tool.
\item Additional file 3: Example data -- testing dataset used in the manuscript.
\item Additional file 4: User guide -- user guide for the software tool.
\end{itemize}
\section*{Figures}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure1.png}
\caption{\csentence{Traditionally used 3D visual representation of configurations.} Typical visual representation of configurations used by the proteomic experts that suffers from substantial visual clutter. It superposes several possible configurations between two proteins and visualizes them using the cartoon model. The set of green protein instances corresponds to one of the interacting proteins, the colored components represent the second protein in different spatial configurations.}
\label{fig:problem}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure2.pdf}
\caption{\csentence{NSE1-NSE3 complex representation in PDBsum.} Two abstracted visualizations of the NSE1-NSE3 complex with PDB ID 3NW0 available in the PDBsum database. (a) Overview representation showing the number of amino acids in the contact zones and the types of interactions. (b) Part of the list of interacting amino acids along with individual interactions and their strength. Images taken from the PDBsum database~\cite{pdbsum}.}
\label{fig:pdbsum}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure3.pdf}
\caption{\csentence{Workflow overview.} The exploration process followed by the domain experts and our proposed supporting visualizations. (a)~The \MatView represents an overview of all input configurations, obtained by one of the existing computational tools. (b)~The \ExpView enables the user to explore the contact zones and their differences for a set of selected configurations. (c)~The \OpBook animates the opening of a selected configuration. (d)~The \CoZoListView supports the detailed comparison of the constitution of the contact zones of selected configurations.}
\label{fig:workflow}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure4.pdf}
\caption{\csentence{Matrix view for the exploration and filtering of the input configurations.} (a)~\MatView showing the aggregated information about the presence of mutually interacting amino acids in all configurations. Horizontal and vertical axes contain the lists of amino acids in the contact zones of the interacting proteins $P_1$ and $P_2$. (b)~The side view shows individual configurations sorted according to their similarity to the primary configuration. The interaction with the side view enables the user to gain more detailed information about the configurations and their interacting amino acids. The central part of the side view consists of a scrollable list of individual configurations. The vertical list of amino acids (the rightmost column) is the same list as the one on the horizontal axis. The configuration in focus contains one polyline connecting those two amino acids from the contact zone which are the closest ones (red lines). The remaining interactions between amino acids are marked with black polylines. The green borders of some matrix cells represent the pairs which are present in the configuration selected in the side view. The selected cells are marked with a cross. It is possible to enlarge a selected row and column using an interactive lens.}
\label{fig:matrixlens}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure5.pdf}
\caption{\csentence{Exploded view.} (a)~Three configurations represented by surfaces with highlighted contact zones. (b)~Aligned configurations. Their contact zones are almost completely occluded. (c)~\ExpView of these configurations. A different color is used for each contact zone.}
\label{fig:case12}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure6.pdf}
\caption{\csentence{\OpBook.} \OpBook enables the user to explore the contact zones between the interacting proteins simultaneously. On the left there is the reference protein and on the right there is the corresponding paired protein. The surface of the contact zones can be color-coded according to different criteria. Here the color represents the distance between the pairs of amino acids (red represents the closest ones, green the most distant ones).}
\label{fig:book}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.7\columnwidth]{figure7.pdf}
\caption{\csentence{\CoZoListView.} This view shows the comparison of one configuration, the primary one (a), with another selected configuration (b). For better comparison of configurations, the corresponding amino acids are interactively highlighted by zooming in. The view is sorted (and colored) according to hydrophobicity of the amino acids in the $P_1$ protein. Red color indicates the matches between the contact zone amino acids of the primary and the compared configuration. White rectangles indicate amino acids that are present in the primary configuration but are missing in the compared one.}
\label{fig:list}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure8.pdf}
\caption{\csentence{The \CoZoListView and different properties.} Sorting of the \CoZoListView according to different properties of amino acids -- (a)~hydrophobicity, (b)~mutual distance, (c)~frequency of occurrence of the pairs in all configurations.}
\label{fig:sorting}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure9.pdf}
\caption{\csentence{Surface-Surface Interaction -- best HADDOCK configurations.} Example of four configurations represented by the juxtaposed \CoZoListView. (a)~Primary 3NW0 crystal structure, (b), (c), (d)~three selected best-fit HADDOCK models. The lists are colored and sorted according to the hydrophobicity of the amino acids in the reference protein in each selected configuration.}
\label{fig:case3}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure10.pdf}
\caption{\csentence{Coiled-Coil Interaction -- the \MatView of interacting amino acids in all HADDOCK models.} The \MatView indicates that the selected pair of M186 and I1030 amino acids is present in 10 out of 40 loaded models.}
\label{fig:coiled_haddock_mat}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure11.png}
\caption{\csentence{Coiled-Coil Interaction -- 4UX3 crystal (blue) and 10 selected HADDOCK configurations (green).} The first A172 amino acid (red) is highlighted in all loaded structures. The opposite orientation of 4UX3 and HADDOCK models is clearly visible.}
\label{fig:coiled_haddock}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure12.png}
\caption{\csentence{Coiled-Coil Interaction -- 4UX3 crystal (blue) and 14 selected PyDock configurations (green).} In these PyDock configurations, all A172 amino acids (red balls and sticks) are positioned on the same side as in the crystal structure (blue).}
\label{fig:selection2SMC3PyDock}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure13.pdf}
\caption{\csentence{Coiled-Coil Interaction -- best fitting PyDock configuration.} \CoZoList comparing the 4UX3 crystal with one best-fit PyDock model with respect to the distances of amino acids.}
\label{fig:coiled2}
\end{figure}
\begin{figure}[!h]
\centering
% \includegraphics[width=0.9\columnwidth]{figure14.png}
\caption{\csentence{Coiled-Coil Interaction -- best fitting PyDock configuration.} The \ExpView showing the contact zone of the best fitting PyDock model (orange) and the 4UX3 crystal (blue). On the top, the overlapping contact zones on the reference protein are shown. The bottom part of the image depicts the paired proteins.}
\label{fig:selection_4_final_SMC3_PyDock}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure15.pdf}
\caption{\csentence{Pocket-String Interaction -- 3EUH crystal structure.} (a)~3EUH crystal structure consisting of the domain containing the pocket (grey) and the helical fragment of the second domain (blue), shown using the cartoon representation. (b)~The same structure shown with the \OpBook. The contact zones are colored according to the distance between the interacting amino acids and the labels of the two closest pairs are shown.}
\label{fig:MukEF_crystal_3EUH_selected}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure16.pdf}
\caption{\csentence{Pocket-String Interaction -- best HADDOCK configurations.} \CoZoLists of the selected HADDOCK configurations sorted according to the distance of the amino acids. (a)~The primary 3EUH crystal structure, (b), (c), (d)~three selected HADDOCK models. The sorting shows that the V200-R300 pair is one of the closest ones in the crystal as well as in all selected models.}
\label{fig:list_pocket_string}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure17.png}
\caption{\csentence{Pocket-String Interaction -- the best fitting HADDOCK configuration.} The best-fit HADDOCK configuration (orange) aligned with the 3EUH crystal structure (blue).}
\label{fig:MukEF_selection_3_best_pair}
\end{figure}
\begin{figure}[h!]
\centering
%\includegraphics[width=0.9\columnwidth]{figure18.pdf}
\caption{\csentence{\CoZoLists of 3EUH crystal structure computed with different distance parameter settings.} (a)~Contacts computed with the distance parameter 5~\AA. (b)~Contacts computed with the distance parameter 4~\AA. The \CoZoLists are sorted according to the mutual distance of the amino acids.}
\label{fig:list_3euh}
\end{figure}
\begin{figure}[h!]
\centering
% \includegraphics[width=0.9\columnwidth]{figure19.pdf}
\caption{\csentence{Pocket-String Interaction -- the best fitting PyDock configurations.} (a)~The best 5 PyDock configurations with the distance parameter 4~\AA, aligned with the 3EUH crystal structure (blue). The configuration that exhibited the most similar contacts with the crystal is orange and the remaining configurations are green. (b)~The \ExpView showing the comparison of the contact zones of the best PyDock configuration (orange) with the 3EUH crystal structure (blue).}
\label{fig:pydock_pocket_string}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Tables %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Use of \listoftables is discouraged.
%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Additional Files %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\section*{Additional Files}
% \subsection*{Additional file 1 --- Sample additional file title}
% Additional file descriptions text (including details of how to
% view the file, if it is in a non-standard format or the file extension). This might refer to a multi-page table or a figure.
% \subsection*{Additional file 2 --- Sample additional file title}
% Additional file descriptions text.
\end{backmatter}
\end{document}