Publications

  • T. Nepusz, R. Sasidharan, and A. Paccanaro
    BMC Bioinformatics, vol. 11, iss. 1, p. 120, 2010.
    @article{citeulike:6780423,
      author = {Nepusz, Tam{\'a}s and Sasidharan, Rajkumar and Paccanaro, Alberto},
      title = {{SCPS}: a fast implementation of a spectral method for detecting protein families on a genome-wide scale},
      journal = {BMC Bioinformatics},
      volume = {11},
      number = {1},
      pages = {120},
      year = {2010},
      month = mar,
      day = {9},
      doi = {10.1186/1471-2105-11-120},
      url = {http://dx.doi.org/10.1186/1471-2105-11-120},
      issn = {1471-2105},
      pmid = {20214776},
      keywords = {clustering, connected\_component\_analysis, hierarchical\_clustering, protein\_families, protein\_sequences, proteins, scop\_database, spectral\_clustering},
      abstract = {{BACKGROUND}: An important problem in genomics is the automatic inference of groups of homologous proteins from pairwise sequence similarities. Several approaches have been proposed for this task which are "local" in the sense that they assign a protein to a cluster based only on the distances between that protein and the other proteins in the set. It was shown recently that global methods such as spectral clustering have better performance on a wide variety of datasets. However, currently available implementations of spectral clustering methods mostly consist of a few loosely coupled Matlab scripts that assume a fair amount of familiarity with Matlab programming and hence they are inaccessible for large parts of the research community. {RESULTS}: {SCPS} (Spectral Clustering of Protein Sequences) is an efficient and user-friendly implementation of a spectral method for inferring protein families. The method uses only pairwise sequence similarities, and is therefore practical when only sequence information is available. {SCPS} was tested on difficult sets of proteins whose relationships were extracted from the {SCOP} database, and its results were extensively compared with those obtained using other popular protein clustering algorithms such as {TribeMCL}, hierarchical clustering and connected component analysis. We show that {SCPS} is able to identify many of the family/superfamily relationships correctly and that the quality of the obtained clusters as indicated by their F-scores is consistently better than all the other methods we compared it with. We also demonstrate the scalability of {SCPS} by clustering the entire {SCOP} database (14,183 sequences) and the complete genome of the yeast Saccharomyces cerevisiae (6,690 sequences). {CONCLUSIONS}: Besides the spectral method, {SCPS} also implements connected component analysis and hierarchical clustering, it integrates {TribeMCL}, it provides different cluster quality tools, it can extract human-readable protein descriptions using {GI} numbers from {NCBI}, it interfaces with external tools such as {BLAST} and Cytoscape, and it can produce publication-quality graphical representations of the clusters obtained, thus constituting a comprehensive and effective tool for practical research in computational biology. Source code and precompiled executables for Windows, Linux and Mac {OS} X are freely available at http://www.paccanarolab.org/software/scps.},
      citeulike-article-id = {6780423},
      citeulike-linkout-0 = {http://dx.doi.org/10.1186/1471-2105-11-120},
      citeulike-linkout-1 = {http://view.ncbi.nlm.nih.gov/pubmed/20214776},
      citeulike-linkout-2 = {http://www.hubmed.org/display.cgi?uids=20214776}
    }
  • T. A. Gianoulis, J. Raes, P. V. Patel, R. Bjornson, J. O. Korbel, I. Letunic, T. Yamada, A. Paccanaro, L. J. Jensen, M. Snyder, P. Bork, and M. B. Gerstein
    Proceedings of the National Academy of Sciences, vol. 106, iss. 5, pp. 1374-1379, 2009.
    @article{citeulike:3969446,
      author = {Gianoulis, Tara A. and Raes, Jeroen and Patel, Prianka V. and Bjornson, Robert and Korbel, Jan O. and Letunic, Ivica and Yamada, Takuji and Paccanaro, Alberto and Jensen, Lars J. and Snyder, Michael and Bork, Peer and Gerstein, Mark B.},
      title = {Quantifying environmental adaptation of metabolic pathways in metagenomics},
      journal = {Proceedings of the National Academy of Sciences},
      volume = {106},
      number = {5},
      pages = {1374--1379},
      year = {2009},
      month = feb,
      day = {3},
      doi = {10.1073/pnas.0808022106},
      url = {http://dx.doi.org/10.1073/pnas.0808022106},
      issn = {1091-6490},
      pmid = {19164758},
      citeulike-article-id = {3969446},
      citeulike-linkout-0 = {http://dx.doi.org/10.1073/pnas.0808022106},
      citeulike-linkout-1 = {http://www.pnas.org/content/106/5/1374.abstract},
      citeulike-linkout-2 = {http://www.pnas.org/content/106/5/1374.full.pdf},
      citeulike-linkout-3 = {http://www.pnas.org/cgi/content/abstract/106/5/1374},
      citeulike-linkout-4 = {http://view.ncbi.nlm.nih.gov/pubmed/19164758},
      citeulike-linkout-5 = {http://www.hubmed.org/display.cgi?uids=19164758}
    }
  • P. Hu, S. C. Janga, M. Babu, J. J. Díaz-Mejía, G. Butland, W. Yang, O. Pogoutse, X. Guo, S. Phanse, P. Wong, S. Chandran, C. Christopoulos, A. Nazarians-Armavil, N. K. Nasseri, G. Musso, M. Ali, N. Nazemof, V. Eroukova, A. Golshani, A. Paccanaro, J. F. Greenblatt, G. Moreno-Hagelsieb, and A. Emili
    PLoS Biol, vol. 7, iss. 4, p. 1000096, 2009.
    @article{citeulike:4462926,
      author = {Hu, Pingzhao and Janga, Sarath C. and Babu, Mohan and D{\'\i}az-Mej{\'\i}a, Javier J. and Butland, Gareth and Yang, Wenhong and Pogoutse, Oxana and Guo, Xinghua and Phanse, Sadhna and Wong, Peter and Chandran, Shamanta and Christopoulos, Constantine and Nazarians-Armavil, Anaies and Nasseri, Negin K. and Musso, Gabriel and Ali, Mehrab and Nazemof, Nazila and Eroukova, Veronika and Golshani, Ashkan and Paccanaro, Alberto and Greenblatt, Jack F. and Moreno-Hagelsieb, Gabriel and Emili, Andrew},
      title = {Global Functional Atlas of {Escherichia} coli Encompassing Previously Uncharacterized Proteins},
      journal = {PLoS Biol},
      volume = {7},
      number = {4},
      pages = {e1000096},
      year = {2009},
      month = apr,
      day = {28},
      publisher = {Public Library of Science},
      doi = {10.1371/journal.pbio.1000096},
      url = {http://dx.doi.org/10.1371/journal.pbio.1000096},
      issn = {1545-7885},
      pmid = {19402753},
      citeulike-article-id = {4462926},
      citeulike-linkout-0 = {http://dx.doi.org/10.1371/journal.pbio.1000096},
      citeulike-linkout-1 = {http://view.ncbi.nlm.nih.gov/pubmed/19402753},
      citeulike-linkout-2 = {http://www.hubmed.org/display.cgi?uids=19402753}
    }
  • A. Devoto and A. Paccanaro
    2008, pp. 331-350.
    @incollection{citeulike:5847015,
      author = {Devoto, Alessandra and Paccanaro, Alberto},
      title = {Signal Transduction Networks During Stress Responses in {Arabidopsis}: {High-Throughput} Analysis and Modelling},
      booktitle = {Plant Growth Signaling},
      pages = {331--350},
      year = {2008},
      doi = {10.1007/7089\_2007\_152},
      url = {http://dx.doi.org/10.1007/7089\_2007\_152},
      abstract = {[...] {TOR} signalling in plants; Doerner P, Signals and mechanisms in the control of plant growth; Durgardeyn J and Van Der Straeten D, Ethylene: inhibitor and stimulator of plant growth) The authors will also discuss recent approaches that have been developed for inferring and modelling protein networks in other model organisms, with the aim of highlighting how integrating this information with gene expression data may lead to better inference of signalling networks. It is reasonable to think that, once large scale protein interaction data will be available, similar methods will be applicable to infer and model signalling networks in Arabidopsis.},
      citeulike-article-id = {5847015},
      citeulike-linkout-0 = {http://dx.doi.org/10.1007/7089\_2007\_152},
      citeulike-linkout-1 = {http://www.springerlink.com/content/j441164615213g34}
    }
  • H. Yang, P. Bhat, H. Shanahan, and A. Paccanaro
    A maximal eigenvalue method for detecting process representative genes by integrating data from multiple sources
    in NIPS Workshop on Learning from Multiple Sources, 2008.
    @inproceedings{citeulike:5847191,
      author = {Yang, Haixuan and Bhat, Prajwal and Shanahan, Hugh and Paccanaro, Alberto},
      title = {A maximal eigenvalue method for detecting process representative genes by integrating data from multiple sources},
      booktitle = {NIPS Workshop on Learning from Multiple Sources},
      year = {2008},
      citeulike-article-id = {5847191}
    }
  • Z. D. Zhang, A. Paccanaro, Y. Fu, S. Weissman, Z. Weng, J. Chang, M. Snyder, and M. B. Gerstein
    Genome Research, vol. 17, iss. 6, pp. 787-797, 2007.
    @article{citeulike:1390187,
      author = {Zhang, Zhengdong D. and Paccanaro, Alberto and Fu, Yutao and Weissman, Sherman and Weng, Zhiping and Chang, Joseph and Snyder, Michael and Gerstein, Mark B.},
      title = {Statistical analysis of the genomic distribution and correlation of regulatory elements in the {ENCODE} regions},
      journal = {Genome Research},
      volume = {17},
      number = {6},
      pages = {787--797},
      year = {2007},
      month = jun,
      day = {1},
      doi = {10.1101/gr.5573107},
      url = {http://dx.doi.org/10.1101/gr.5573107},
      issn = {1088-9051},
      pmid = {17567997},
      abstract = {The comprehensive inventory of functional elements in 44 human genomic regions carried out by the {ENCODE} Project Consortium enables for the first time a global analysis of the genomic distribution of transcriptional regulatory elements. In this study we developed an intuitive and yet powerful approach to analyze the distribution of regulatory elements found in many different {ChIP}--chip experiments on a 10$\sim$100-kb scale. First, we focus on the overall chromosomal distribution of regulatory elements in the {ENCODE} regions and show that it is highly nonuniform. We demonstrate, in fact, that regulatory elements are associated with the location of known genes. Further examination on a local, single-gene scale shows an enrichment of regulatory elements near both transcription start and end sites. Our results indicate that overall these elements are clustered into regulatory rich ``islands'' and poor ``deserts.'' Next, we examine how consistent the nonuniform distribution is between different transcription factors. We perform on all the factors a multivariate analysis in the framework of a biplot, which enhances biological signals in the experiments. This groups transcription factors into sequence-specific and sequence-nonspecific clusters. Moreover, with experimental variation carefully controlled, detailed correlations show that the distribution of sites was generally reproducible for a specific factor between different laboratories and microarray platforms. Data sets associated with histone modifications have particularly strong correlations. Finally, we show how the correlations between factors change when only regulatory elements far from the transcription start sites are considered.},
      citeulike-article-id = {1390187},
      citeulike-linkout-0 = {http://dx.doi.org/10.1101/gr.5573107},
      citeulike-linkout-1 = {http://genome.cshlp.org/content/17/6/787.abstract},
      citeulike-linkout-2 = {http://genome.cshlp.org/content/17/6/787.full.pdf},
      citeulike-linkout-3 = {http://www.genome.org/cgi/content/abstract/17/6/787},
      citeulike-linkout-4 = {http://view.ncbi.nlm.nih.gov/pubmed/17567997},
      citeulike-linkout-5 = {http://www.hubmed.org/display.cgi?uids=17567997}
    }
  • N. J. Krogan, G. Cagney, H. Yu, G. Zhong, X. Guo, A. Ignatchenko, J. Li, S. Pu, N. Datta, A. P. Tikuisis, T. Punna, J. M. Peregrín-Alvarez, M. Shales, X. Zhang, M. Davey, M. D. Robinson, A. Paccanaro, J. E. Bray, A. Sheung, B. Beattie, D. P. Richards, V. Canadien, A. Lalev, F. Mena, P. Wong, A. Starostine, M. M. Canete, J. Vlasblom, S. Wu, C. Orsi, S. R. Collins, S. Chandran, R. Haw, J. J. Rilstone, K. Gandi, N. J. Thompson, G. Musso, P. St Onge, S. Ghanny, M. H. Y. Lam, G. Butland, A. M. Altaf-Ul, S. Kanaya, A. Shilatifard, E. O’Shea, J. S. Weissman, J. C. Ingles, T. R. Hughes, J. Parkinson, M. Gerstein, S. J. Wodak, A. Emili, and J. F. Greenblatt
    Nature, vol. 440, iss. 7084, pp. 637-643, 2006.
    @article{citeulike:560813,
      author = {Krogan, Nevan J. and Cagney, Gerard and Yu, Haiyuan and Zhong, Gouqing and Guo, Xinghua and Ignatchenko, Alexandr and Li, Joyce and Pu, Shuye and Datta, Nira and Tikuisis, Aaron P. and Punna, Thanuja and Peregr{\'\i}n-Alvarez, Jos{\'e} M. and Shales, Michael and Zhang, Xin and Davey, Michael and Robinson, Mark D. and Paccanaro, Alberto and Bray, James E. and Sheung, Anthony and Beattie, Bryan and Richards, Dawn P. and Canadien, Veronica and Lalev, Atanas and Mena, Frank and Wong, Peter and Starostine, Andrei and Canete, Myra M. and Vlasblom, James and Wu, Samuel and Orsi, Chris and Collins, Sean R. and Chandran, Shamanta and Haw, Robin and Rilstone, Jennifer J. and Gandi, Kiran and Thompson, Natalie J. and Musso, Gabe and St Onge, Peter and Ghanny, Shaun and Lam, Mandy H. Y. and Butland, Gareth and Altaf-Ul, Amin M. and Kanaya, Shigehiko and Shilatifard, Ali and O'Shea, Erin and Weissman, Jonathan S. and Ingles, C. James and Hughes, Timothy R. and Parkinson, John and Gerstein, Mark and Wodak, Shoshana J. and Emili, Andrew and Greenblatt, Jack F.},
      title = {Global landscape of protein complexes in the yeast {Saccharomyces} cerevisiae},
      journal = {Nature},
      volume = {440},
      number = {7084},
      pages = {637--643},
      year = {2006},
      month = mar,
      day = {22},
      publisher = {Nature Publishing Group},
      doi = {10.1038/nature04670},
      url = {http://dx.doi.org/10.1038/nature04670},
      issn = {0028-0836},
      pmid = {16554755},
      abstract = {Identification of protein--protein interactions often provides insight into protein function, and many cellular processes are performed by stable protein complexes. We used tandem affinity purification to process 4,562 different tagged proteins of the yeast Saccharomyces cerevisiae. Each preparation was analysed by both matrix-assisted laser desorption/ionization--time of flight mass spectrometry and liquid chromatography tandem mass spectrometry to increase coverage and accuracy. Machine learning was used to integrate the mass spectrometry scores and assign probabilities to the protein--protein interactions. Among 4,087 different proteins identified with high confidence by mass spectrometry from 2,357 successful purifications, our core data set (median precision of 0.69) comprises 7,123 protein--protein interactions involving 2,708 proteins. A Markov clustering algorithm organized these interactions into 547 protein complexes averaging 4.9 subunits per complex, about half of them absent from the {MIPS} database, as well as 429 additional interactions between pairs of complexes. The data (all of which are available online) will help future studies on individual proteins as well as functional genomics and systems biology.},
      citeulike-article-id = {560813},
      citeulike-linkout-0 = {http://dx.doi.org/10.1038/nature04670},
      citeulike-linkout-1 = {http://dx.doi.org/10.1038/nature04670},
      citeulike-linkout-2 = {http://view.ncbi.nlm.nih.gov/pubmed/16554755},
      citeulike-linkout-3 = {http://www.hubmed.org/display.cgi?uids=16554755}
    }
  • R. Sasidharan, M. Gerstein, and A. Paccanaro
    Spectral clustering of protein sequences using sequence-profile scores
    in Proceedings of ICNPSC 2006 – 3rd International Conference on Neural Parallel and Scientific Computations, 2006.
    @inproceedings{citeulike:5847201,
      author = {Sasidharan, Rajkumar and Gerstein, Mark and Paccanaro, Alberto},
      title = {Spectral clustering of protein sequences using sequence-profile scores},
      booktitle = {Proceedings of ICNPSC 2006 - 3rd International Conference on Neural Parallel and Scientific Computations},
      year = {2006},
      citeulike-article-id = {5847201}
    }
  • M. Seringhaus, A. Paccanaro, A. Borneman, M. Snyder, and M. Gerstein
    Genome Research, vol. 16, iss. 9, pp. 1126-1135, 2006.
    @article{citeulike:1286363,
      author = {Seringhaus, Michael and Paccanaro, Alberto and Borneman, Anthony and Snyder, Michael and Gerstein, Mark},
      title = {Predicting essential genes in fungal genomes},
      journal = {Genome Research},
      volume = {16},
      number = {9},
      pages = {1126--1135},
      year = {2006},
      month = sep,
      day = {1},
      doi = {10.1101/gr.5144106},
      url = {http://dx.doi.org/10.1101/gr.5144106},
      issn = {1088-9051},
      pmid = {16899653},
      abstract = {Essential genes are required for an organism's viability, and the ability to identify these genes in pathogens is crucial to directed drug development. Predicting essential genes through computational methods is appealing because it circumvents expensive and difficult experimental screens. Most such prediction is based on homology mapping to experimentally verified essential genes in model organisms. We present here a different approach, one that relies exclusively on sequence features of a gene to estimate essentiality and offers a promising way to identify essential genes in unstudied or uncultured organisms. We identified 14 characteristic sequence features potentially associated with essentiality, such as localization signals, codon adaptation, {GC} content, and overall hydrophobicity. Using the well-characterized baker's yeast Saccharomyces cerevisiae, we employed a simple Bayesian framework to measure the correlation of each of these features with essentiality. We then employed the 14 features to learn the parameters of a machine learning classifier capable of predicting essential genes. We trained our classifier on known essential genes in S. cerevisiae and applied it to the closely related and relatively unstudied yeast Saccharomyces mikatae. We assessed predictive success in two ways: First, we compared all of our predictions with those generated by homology mapping between these two species. Second, we verified a subset of our predictions with eight in vivo knockouts in S. mikatae, and we present here the first experimentally confirmed essential genes in this species.},
      citeulike-article-id = {1286363},
      citeulike-linkout-0 = {http://dx.doi.org/10.1101/gr.5144106},
      citeulike-linkout-1 = {http://www.genome.org/cgi/content/abstract/16/9/1126},
      citeulike-linkout-2 = {http://view.ncbi.nlm.nih.gov/pubmed/16899653},
      citeulike-linkout-3 = {http://www.hubmed.org/display.cgi?uids=16899653}
    }
  • A. Paccanaro, J. A. Casbon, and M. A. Saqi
    Nucleic Acids Research, vol. 34, iss. 5, pp. 1571-1580, 2006.
    @article{citeulike:575333,
      author = {Paccanaro, Alberto and Casbon, James A. and Saqi, Mansoor A.},
      title = {Spectral clustering of protein sequences},
      journal = {Nucleic Acids Research},
      volume = {34},
      number = {5},
      pages = {1571--1580},
      year = {2006},
      publisher = {Oxford University Press},
      doi = {10.1093/nar/gkj515},
      url = {http://dx.doi.org/10.1093/nar/gkj515},
      issn = {1362-4962},
      pmcid = {PMC1409676},
      pmid = {16547200},
      abstract = {An important problem in genomics is automatically clustering homologous proteins when only sequence information is available. Most methods for clustering proteins are local, and are based on simply thresholding a measure related to sequence distance. We first show how locality limits the performance of such methods by analysing the distribution of distances between protein sequences. We then present a global method based on spectral clustering and provide theoretical justification of why it will have a remarkable improvement over local methods. We extensively tested our method and compared its performance with other local methods on several subsets of the {SCOP} (Structural Classification of Proteins) database, a gold standard for protein structure classification. We consistently observed that, the number of clusters that we obtain for a given set of proteins is close to the number of superfamilies in that set; there are fewer singletons; and the method correctly groups most remote homologs. In our experiments, the quality of the clusters as quantified by a measure that combines sensitivity and specificity was consistently better [on average, improvements were 84\% over hierarchical clustering, 34\% over Connected Component Analysis ({CCA}) (similar to {GeneRAGE}) and 72\% over another global method, {TribeMCL}].},
      citeulike-article-id = {575333},
      citeulike-linkout-0 = {http://dx.doi.org/10.1093/nar/gkj515},
      citeulike-linkout-1 = {http://nar.oxfordjournals.org/content/34/5/1571.abstract},
      citeulike-linkout-2 = {http://nar.oxfordjournals.org/content/34/5/1571.full.pdf},
      citeulike-linkout-3 = {http://nar.oxfordjournals.org/cgi/content/abstract/34/5/1571},
      citeulike-linkout-4 = {http://www.ingentaconnect.com/content/oup/nar/2006/00000034/00000005/art01571},
      citeulike-linkout-5 = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC1409676/},
      citeulike-linkout-6 = {http://view.ncbi.nlm.nih.gov/pubmed/16547200},
      citeulike-linkout-7 = {http://www.hubmed.org/display.cgi?uids=16547200}
    }
  • R. D. Waite, A. Paccanaro, A. Papakonstantinopoulou, J. M. Hurst, M. Saqi, E. Littler, and M. A. Curtis
    BMC Genomics, vol. 7, iss. 1, p. 162, 2006.
    @article{citeulike:711774,
      author = {Waite, Richard D. and Paccanaro, Alberto and Papakonstantinopoulou, Anastasia and Hurst, Jacob M. and Saqi, Mansoor and Littler, Eddie and Curtis, Michael A.},
      title = {Clustering of {Pseudomonas} aeruginosa transcriptomes from planktonic cultures, developing and mature biofilms reveals distinct expression profiles},
      journal = {BMC Genomics},
      volume = {7},
      number = {1},
      pages = {162},
      year = {2006},
      month = jun,
      day = {26},
      doi = {10.1186/1471-2164-7-162},
      url = {http://dx.doi.org/10.1186/1471-2164-7-162},
      issn = {1471-2164},
      pmid = {16800888},
      abstract = {{BACKGROUND}: Pseudomonas aeruginosa is a genetically complex bacterium which can adopt and switch between a free-living or biofilm lifestyle, a versatility that enables it to thrive in many different environments and contributes to its success as a human pathogen. {RESULTS}: Transcriptomes derived from growth states relevant to the lifestyle of P. aeruginosa were clustered using three different methods (K-means, K-means spectral and hierarchical clustering). The culture conditions used for this study were; biofilms incubated for 8, 14, 24 and 48 hrs, and planktonic culture (logarithmic and stationary phase). This cluster analysis revealed the existence and provided a clear illustration of distinct expression profiles present in the dataset. Moreover, it gave an insight into which genes are up-regulated in planktonic, developing biofilm and confluent biofilm states. In addition, this analysis confirmed the contribution of quorum sensing ({QS}) and {RpoS} regulated genes to the biofilm mode of growth, and enabled the identification of a 60.69 Kbp region of the genome associated with stationary phase growth (stationary phase planktonic culture and confluent biofilms). {CONCLUSION}: This is the first study to use clustering to separate a large P. aeruginosa microarray dataset consisting of transcriptomes obtained from diverse conditions relevant to its growth, into different expression profiles. These distinct expression profiles not only reveal novel aspects of P. aeruginosa gene expression but also provide a growth specific transcriptomic reference dataset for the research community.},
      citeulike-article-id = {711774},
      citeulike-linkout-0 = {http://dx.doi.org/10.1186/1471-2164-7-162},
      citeulike-linkout-1 = {http://view.ncbi.nlm.nih.gov/pubmed/16800888},
      citeulike-linkout-2 = {http://www.hubmed.org/display.cgi?uids=16800888}
    }
  • C. S. Goh, T. A. Gianoulis, Y. Liu, J. Li, A. Paccanaro, Y. A. Lussier, and M. Gerstein
    BMC Genomics, vol. 7, p. 257, 2006.
    @article{citeulike:894945,
      author = {Goh, Chern-Sing S. and Gianoulis, Tara A. and Liu, Yang and Li, Jianrong and Paccanaro, Alberto and Lussier, Yves A. and Gerstein, Mark},
      title = {Integration of curated databases to identify genotype-phenotype associations},
      journal = {BMC Genomics},
      volume = {7},
      pages = {257},
      year = {2006},
      month = oct,
      day = {12},
      doi = {10.1186/1471-2164-7-257},
      url = {http://dx.doi.org/10.1186/1471-2164-7-257},
      issn = {1471-2164},
      pmid = {17038185},
      abstract = {{BACKGROUND}: The ability to rapidly characterize an unknown microorganism is critical in both responding to infectious disease and biodefense. To do this, we need some way of anticipating an organism's phenotype based on the molecules encoded by its genome. However, the link between molecular composition (i.e. genotype) and phenotype for microbes is not obvious. While there have been several studies that address this challenge, none have yet proposed a large-scale method integrating curated biological information. Here we utilize a systematic approach to discover genotype-phenotype associations that combines phenotypic information from a biomedical informatics database, {GIDEON}, with the molecular information contained in National Center for Biotechnology Information's Clusters of Orthologous Groups database ({NCBI} {COGs}). {RESULTS}: Integrating the information in the two databases, we are able to correlate the presence or absence of a given protein in a microbe with its phenotype as measured by certain morphological characteristics or survival in a particular growth media. With a 0.8 correlation score threshold, 66\% of the associations found were confirmed by the literature and at a 0.9 correlation threshold, 86\% were positively verified. {CONCLUSION}: Our results suggest possible phenotypic manifestations for proteins biochemically associated with sugar metabolism and electron transport. Moreover, we believe our approach can be extended to linking pathogenic phenotypes with functionally related proteins.},
      citeulike-article-id = {894945},
      citeulike-linkout-0 = {http://dx.doi.org/10.1186/1471-2164-7-257},
      citeulike-linkout-1 = {http://view.ncbi.nlm.nih.gov/pubmed/17038185},
      citeulike-linkout-2 = {http://www.hubmed.org/display.cgi?uids=17038185}
    }
  • H. Yu, A. Paccanaro, V. Trifonov, and M. Gerstein
    Bioinformatics, vol. 22, iss. 7, pp. 823-829, 2006.
    @article{citeulike:561194,
      author = {Yu, Haiyuan and Paccanaro, Alberto and Trifonov, Valery and Gerstein, Mark},
      title = {Predicting interactions in protein networks by completing defective cliques},
      journal = {Bioinformatics},
      volume = {22},
      number = {7},
      pages = {823--829},
      year = {2006},
      month = apr,
      day = {1},
      publisher = {Oxford University Press},
      doi = {10.1093/bioinformatics/btl014},
      url = {http://dx.doi.org/10.1093/bioinformatics/btl014},
      issn = {1367-4803},
      pmid = {16455753},
      abstract = {Datasets obtained by large-scale, high-throughput methods for detecting protein--protein interactions typically suffer from a relatively high level of noise. We describe a novel method for improving the quality of these datasets by predicting missed protein--protein interactions, using only the topology of the protein interaction network observed by the large-scale experiment. The central idea of the method is to search the protein interaction network for defective cliques (nearly complete complexes of pairwise interacting proteins), and predict the interactions that complete them. We formulate an algorithm for applying this method to large-scale networks, and show that in practice it is efficient and has good predictive performance. More information can be found on our website http://topnet.gersteinlab.org/clique/ Contact: Mark.Gerstein@yale.edu Supplementary information: Supplementary Materials are available at Bioinformatics online.},
      citeulike-article-id = {561194},
      citeulike-linkout-0 = {http://dx.doi.org/10.1093/bioinformatics/btl014},
      citeulike-linkout-1 = {http://bioinformatics.oxfordjournals.org/cgi/content/abstract/22/7/823},
      citeulike-linkout-2 = {http://www.ingentaconnect.com/content/oup/cabios/2006/00000022/00000007/art00823},
      citeulike-linkout-3 = {http://view.ncbi.nlm.nih.gov/pubmed/16455753},
      citeulike-linkout-4 = {http://www.hubmed.org/display.cgi?uids=16455753}
    }
  • L. J. Lu, Y. Xia, A. Paccanaro, H. Yu, and M. Gerstein
    Genome Research, vol. 15, iss. 7, pp. 945-953, 2005.
    @article{citeulike:334796,
      author = {Lu, Long J. and Xia, Yu and Paccanaro, Alberto and Yu, Haiyuan and Gerstein, Mark},
      title = {Assessing the limits of genomic data integration for predicting protein networks},
      journal = {Genome Research},
      volume = {15},
      number = {7},
      pages = {945--953},
      year = {2005},
      month = jul,
      day = {1},
      doi = {10.1101/gr.3610305},
      url = {http://dx.doi.org/10.1101/gr.3610305},
      issn = {1088-9051},
      pmcid = {PMC1172038},
      pmid = {15998909},
      affiliation = {Department of Molecular Biophysics and Biochemistry, Yale University, New Haven, Connecticut 06520, USA.},
      citeulike-article-id = {334796},
      citeulike-linkout-0 = {http://dx.doi.org/10.1101/gr.3610305},
      citeulike-linkout-1 = {http://genome.cshlp.org/content/15/7/945.long.abstract},
      citeulike-linkout-2 = {http://genome.cshlp.org/content/15/7/945.long.full.pdf},
      citeulike-linkout-3 = {http://www.genome.org/cgi/content/abstract/15/7/945},
      citeulike-linkout-4 = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC1172038/},
      citeulike-linkout-5 = {http://view.ncbi.nlm.nih.gov/pubmed/15998909},
      citeulike-linkout-6 = {http://www.hubmed.org/display.cgi?uids=15998909}
    }
  • A. Paccanaro, V. Trifonov, H. Yu, and M. Gerstein
    in Proceedings of the International Joint Conference on Neural Networks, 2005, pp. 161-166.
    @inproceedings{citeulike:5847188,
      author = {Paccanaro, A. and Trifonov, V. and Yu, H. and Gerstein, M.},
      title = {Inferring protein-protein interactions using interaction network topologies},
      booktitle = {Proceedings of the International Joint Conference on Neural Networks},
      volume = {2},
      pages = {161--166},
      year = {2005},
      month = dec,
      day = {27},
      publisher = {IEEE},
      location = {Montreal, Quebec, Canada},
      doi = {10.1109/IJCNN.2005.1555823},
      url = {http://dx.doi.org/10.1109/IJCNN.2005.1555823},
      isbn = {0-7803-9048-2},
      citeulike-article-id = {5847188},
      citeulike-linkout-0 = {http://dx.doi.org/10.1109/IJCNN.2005.1555823},
      citeulike-linkout-1 = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1555823}
    }
  • C. Chennubhotla and A. Paccanaro
    Markov analysis of protein sequence similarities
    Springer, 2003, vol. 2859, pp. 278-286.
    @inbook{citeulike:5847004,
      author = {Chennubhotla, Chakra and Paccanaro, Alberto},
      title = {Markov analysis of protein sequence similarities},
      series = {Lecture Notes in Computer Science},
      volume = {2859},
      pages = {278--286},
      publisher = {Springer},
      year = {2003},
      isbn = {978-3-540-20227-1},
      issn = {0302-9743},
      abstract = {In our previous work, we explored the use of graph-theoretic spectral methods for clustering protein sequences [7]. The nodes of the graph represent a set of proteins to be clustered into families and/or super-families. Edges between nodes are undirected and weighted by the similarities between proteins. We constructed a novel similarity function based on {BLAST} scores. The similarity values are in turn used to construct a Markov matrix representing transition probabilities between every pair of connected proteins. By analyzing the perturbations to the stationary distribution of the Markov matrix (as in [6,4]), we partition the graph into clusters. In this paper, we compare our method with {TribeMCL}, which modifies random walks, by reinforcing strong edges and pruning weak ones, such that clusters emerge naturally from the graph [3]. We compare these two methods with respect to their ease of use and the quality of the resulting clusters.},
      citeulike-article-id = {5847004}
    }
  • A. Paccanaro, C. Chennubhotla, J. A. Casbon, and M. A. S. Saqi
    in Proceedings of the International Joint Conference on Neural Networks, 2003, pp. 3083-3088.
    @inproceedings{citeulike:694514,
      author = {Paccanaro, A. and Chennubhotla, C. and Casbon, J. A. and Saqi, M. A. S.},
      title = {Spectral clustering of protein sequences},
      booktitle = {Proceedings of the International Joint Conference on Neural Networks},
      volume = {4},
      pages = {3083--3088},
      year = {2003},
      month = aug,
      day = {26},
      publisher = {IEEE},
      location = {Portland, Oregon, USA},
      doi = {10.1109/IJCNN.2003.1224064},
      url = {http://dx.doi.org/10.1109/IJCNN.2003.1224064},
      isbn = {0-7803-7898-9},
      citeulike-article-id = {694514},
      citeulike-linkout-0 = {http://dx.doi.org/10.1109/IJCNN.2003.1224064},
      citeulike-linkout-1 = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=1224064}
    }