diff --git a/.github/workflows/jekyll.yml b/.github/workflows/jekyll.yml new file mode 100644 index 00000000000..140ce83d937 --- /dev/null +++ b/.github/workflows/jekyll.yml @@ -0,0 +1,63 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +# Sample workflow for building and deploying a Jekyll site to GitHub Pages +name: Deploy Jekyll site to Pages + +on: + # Runs on pushes targeting the default branch + push: + branches: ["master"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow one concurrent deployment +concurrency: + group: "pages" + cancel-in-progress: true + +jobs: + # Build job + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Ruby + uses: ruby/setup-ruby@ee2113536afb7f793eed4ce60e8d3b26db912da4 # v1.127.0 + with: + ruby-version: '3.0' # Not needed with a .ruby-version file + bundler-cache: true # runs 'bundle install' and caches installed gems automatically + cache-version: 0 # Increment this number if you need to re-download cached gems + - name: Setup Pages + id: pages + uses: actions/configure-pages@v3 + - name: Build with Jekyll + # Outputs to the './_site' directory by default + run: bundle exec jekyll build --baseurl "${{ steps.pages.outputs.base_path }}" + env: + JEKYLL_ENV: production + - name: Upload artifact + # Automatically uploads an artifact from the './_site' directory by default + uses: actions/upload-pages-artifact@v1 + + # Deployment job + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v3 diff --git a/AboutMe.html b/AboutMe.html new file mode 100644 index 00000000000..ab3c15fcccb --- /dev/null +++ b/AboutMe.html @@ -0,0 +1,126 @@ +--- +layout: page +--- + + + + + + +
+

About Me

+
+ + +
+ +
+ +
+ +
+

+ I am Yongzhe 永哲 :) and a biostatistician from the City of Hope Comprehensive Cancer Center + with a passion for clinical and biomedical studies. + I am very passionate about data visualization and believe that it is a powerful tool for communicating complex information in a way + that is accessible to a wide range of audience. +

+
+ +
+ +
+

Where I learned my skills...

+

+ Before I joined in the City of Hope, I earned my Master of Science in biostatistics from the University of Washington (2022), + where I had the privilege of being advised by Jim Hughes and + Helen Y. Chu. + During my time there, I actively participated in numerous research focused on combating the COVID-19 pandemic, spanning from + epidemiological studies to molecular investigations. + Prior to my graduate studies, I completed my Bachelor of Science in statistics at the University of California, Los Angeles (2020). At UCLA, + I developed a keen interest in applying advanced machine learning models to gain insights into the behavior of concrete and cement, aiming to mitigate carbon emissions. + I was fortunate to receive guidance from Mathieu Bauchy and + Hongquan Xu, who supported me in exploring this fascinating area of research. +

+
+ +
+

What I am doing now...

+

+ I worked as a biostatistician at the City of Hope, focusing primarily on research related to women's cancers such as gynecologic cancer, + breast cancer, and colorectal cancer. My goal was to investigate the disparities in cancer behavior based on various social determinants, + such as race/ethnicity and insurance, as well as biological and molecular information, including genomics, proteomics, and cytokines. + Additionally, I evaluated programs/trials aimed at translating scientific knowledge into community practice to reduce and eliminate inequalities + in cancer outcomes. In my daily work, I frequently addressed questions like: + +

+

+

+ Similar cancer-related problems often arose in my work. Regardless of the specific question, statistical analysis played a crucial role in + providing evidence to answer scientific inquiries. I employed various statistical methods, such as survival and longitudinal models, + machine learning, experimental designs, and many other statistical models, integrating them to formulate effective solution pathways for each scientific question. + In addition to my cancer-related research, I also conduct statistical analysis in computational neuroscience to explore the relationship + between neuro signals and animal behaviors. +

+
+ +
+

What I am interested…

+

+ My research journey began with the application of variable importance, such as permutation importance and Shapley values, to investigate + the impact of chemical compositions on the strength of concrete/cement. This experience sparked my interest in exploring variable importance for + both semi-parametric and non-parametric models. As I delved into high-dimensional data from respiratory infectious diseases and cancer studies, + which included RNA-seq, proteomics, and cytokine data, I became fascinated with understanding how variable importance can be incorporated into statistical + models for such data types. Given the longitudinal nature and survival-related aspects of clinical and biomedical studies, I developed a specific + interest in utilizing longitudinal and survival models to effectively handle high-dimensional data in practical scenarios. As I immersed myself in + public health studies, I regularly encountered data analysis related to population science and epidemiology. I was intrigued by the prospect of + adapting statistical models to accommodate data from diverse study designs and answer scientific inquiries. Regardless of the aforementioned research + questions, I strongly believe that data visualization plays an integral role in explanatory data analysis. As a biostatistician, I am consistently passionate + about enhancing data visualization techniques to suit various types of data. +

+ +

+ Additionally, I have been practicing Chinese calligraphy for 19 years and continue to cherish this art form : ) +

+
+ + + + + + + + + + + + + + diff --git a/ME2.png b/ME2.png new file mode 100644 index 00000000000..d9480804af3 Binary files /dev/null and b/ME2.png differ diff --git a/ME3.JPG b/ME3.JPG new file mode 100644 index 00000000000..502b769dce9 Binary files /dev/null and b/ME3.JPG differ diff --git a/PubList.md b/PubList.md new file mode 100644 index 00000000000..a4d14632d3b --- /dev/null +++ b/PubList.md @@ -0,0 +1,138 @@ +--- +layout: page +title: "Publication and Presentation" +--- + +## Manuscript +Health Insurance Enrollment and Disparities in Cervical Cancer Screening Across 47 Counties in Kenya: A Systematic Analysis of National Demographic and Health Data \ +Ana I. Tergas, __Yongzhe Wang__, Myurajan Rubaharan, Gabriela M. Escalante, Brighton Odundo, Jerome Katumba, Javier Gordon Ogembo, Narissa J. Nonzee, Lawrence P. O. Were \ +___The International Journal of Gynecological Cancer___. (Under Review) [PDF](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=5365896) + +Divergent prefrontal cortex circuits regulate cued food seeking under distinct metabolic or emotional states \ +Xu O. Zhang, Guillermo Aquino-Miranda, Claire E. Cho, __Yongzhe Wang__, Duy Hoang Ha, Nikita Elinson-Watson, Allen Dong, Caleb Kemere & Fabricio H. Do-Monte \ +___Nature Communications___. [PDF](https://www.nature.com/articles/s41467-025-65347-1) + +Racial differences in breast cancer-specific mortality and CVD-specific mortality after breast cancer in post-menopausal women \ +Kerryn W. Reding, Alexi L. Vasbinder, Richard K. Cheng, Ana Barac, __Yongzhe Wang__, Warren J. Szewczyk, Reina Haque, Tarah J. Ballinger, Khadijah Breathett, Aladdin H. Shadyab, Regina Shih, Tomas Nuno, Robert A Wild, Xiaochen Zhang, Rami Nassir, Charles Mouton, Dorothy S. Lane, Lisa Warsinger Martin, JoAnn E. Manson, Marcia L. Stefanick, Michael S Simon, Veronica Jones \ +___Cardio-Oncology___. [PDF](https://link.springer.com/article/10.1186/s40959-025-00403-9#citeas) + +Gene expression associated with endocrine therapy resistance in estrogen receptor-positive breast cancer \ +Veronica Jones, Hongwei Holly Yin, Yate-Ching Yuan, __Yongzhe Wang__, Sierra Min Li, Dana Aljaber, Angelica Sanchez, Christine Quinones, Dan Schmolze, Yuan Yuan, Joanne Mortimer, Lisa Yee, Laura Kruper, Tijana Jovanovic-Talisman, Jerneja Tomsic, Nancy Sanchez, Tanya Chavez, Ruth M. O’Regan, Qamar J. Khan, Melissa Davis, Kevin Kalinsky, Jane Meisel, Rick Kittles, Lorna Rodriguez-Rodriguez, Victoria Seewaldt \ +___Scientific Reports___. [PDF](https://www.nature.com/articles/s41598-025-89274-9) + +Impact of Travel Burden on Timeliness of Care and Overall Survival for Breast Cancer: A National Cancer Database Analysis \ +__Yongzhe Wang__, Christine M. Quinones, Elizabeth Gonzalez, Preeti Farmah, Hans F. Schoellhammer, Lorena Gonzalez, Nikita Shah, Katharine Schulz-Costello, Jennifer Tseng, Veronica C. Jones \ +___Cancer Medicine___. [PDF](https://onlinelibrary.wiley.com/doi/10.1002/cam4.71354) + +Interpretation of Coefficients in Segmented Regression for Interrupted Time Series Analyses \ +__Yongzhe Wang__, Narissa J. Nonzze, Haonan Zhang, Kimlin T. Ashing, Gaole Song, Catherine M. Crespi \ +___BMC Medical Research Methodology___. [PDF](https://link.springer.com/article/10.1186/s12874-025-02556-8) + +“I Probably Shouldn’t Go in Today”: Inequitable Access to Paid Sick Leave and its Impacts on Health Behaviors During the Emergence of COVID-19 in the Seattle Area \ +Chidozie Declan Iwu, Sarah Cox, Sarah Sohlberg, Ashley Kim, Jennifer Logue, Peter Han, Thomas Sibley, Misja Ilcisin, Kairsten Fay, Jover Lee, Denise McCulloch, __Yongzhe Wang__, Michael Boeckh, Janet Englund, Anjum Hajat, Lea Starita, Helen Chu \ +___PLoS one___. (2024). [PDF](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0307734) + +Health Behavioral Trends and Absenteeism Associated Before and During the SARS-CoV-2 Pandemic in a Community Household Study, King County, Washington \ +Erin Chung, __Yongzhe Wang__, Eric J Chow, Roy Burstein, Elisabeth Brandstetter, Peter D Han, Kairsten Fay, Brian Pfau, Amanda Adler, Kirsten Lacombe, Christina M Lockwood, Timothy M Uyeki, Jay Shendure, Jeffrey S Duchin, Mark J Rieder, Deborah A Nickerson, Michael Boeckh, Michael Famulare, James P Hughes, Lea M Starita, Trevor Bedford, Janet A Englund, Helen Y Chu \ +___AJPM Focus___. (2024). [PDF](https://www.ajpmfocus.org/article/S2773-0654(24)00066-X/fulltext) + +Survival Rates in Hispanic/Latinx Subpopulations with Cervical Cancer Associated with Disparities in Guideline-concordant Care \ +Andreea I. Dinicu, Shayan Dioun, __Yongzhe Wang__, Yongmei Huang, Jason D. Wright, Ana I. Tergas \ +___Gynecologic Oncology___. (2024). [_PDF_](https://www.sciencedirect.com/science/article/pii/S0090825824000696) + +Decoding the Genome of Cement by Machine Learning \ +Yu Song, __Yongzhe Wang__, Kaixin Wang, Samy Allal, Gaurav Sant, Mathieu Bauchy \ +___Available at SSRN 4657713___. (2023) [PDF](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4657713) + +Region of Origin and Cervical Cancer Stage in Multiethnic Hispanic/Latinx Patients Living in the United States \ +Andreea I. Dinicu, Shayan Dioun, Mandy Goldberg, Danielle M. Crookes, __Yongzhe Wang__, Alexander Melamed, Jason D. Wright, Ana I. Tergas \ +___Cancer Medicine___. (2023). [_PDF_](https://onlinelibrary.wiley.com/doi/10.1002/cam4.6697) + +Risk of Subsequent Respiratory Virus Detection After Primary Detection in a Community Household Study—Seattle, Washington \ +Jessica Heimonen, Eric J. Chow, __Yongzhe Wang__, James P. Hughes, Peter D. Han, Anne Emmanuels, Jessica O’Hanlon, Denise McCulloch, Veronica Bowen, Constance E. Ogokeh, Melissa A. Rolfes, Timothy M. Uyeki, MPP, Michael Boeckh, Trevor Bedford, Lea Starita, Janet A. Englund, Helen Y. Chu \ +___Journal of Infectious Diseases___. (2023). [_PDF_](https://doi.org/10.1093/infdis/jiad305) + +Respiratory Syncytial Virus and Other Respiratory Virus Infections in Residents of Homeless Shelters – King County, Washington, 2019–2021 \ +Denise J. McCulloch, Julia H. Rogers, __Yongzhe Wang__, Eric J. Chow, Amy C. Link, Caitlin R. Wolf, Timothy M. Uyeki, Melissa A. Rolfes, Emily Mosites, Jaydee Sereewit, Jeffrey S. Duchin, Nancy K. Sugg, Michael J. Boeckh, Janet A. Englund, Trevor Bedford, Jay Shendure, James P. Hughes, Lea M. Starita, Pavitra Roychoudhury, Helen Y. Chu \ +___Influenza and other respiratory viruses___. (2023). [_PDF_](https://onlinelibrary.wiley.com/doi/10.1111/irv.13166) + +Variable Importance for Fixed Effects in Linear Mixed Model \ +__Yongzhe Wang__, Lingbo Ye, Zifan Yu \ +___Western North American Region of The International Biometric Society Annual Meeting___. (2022). (Conference paper). + +Using Machine Learning to Predict Concrete’s Strength: Learning from Small Datasets \ +Ouyang, Boya, Yu Song, Yuhai Li, Feishu Wu, Huizi Yu, __Yongzhe Wang__, Zhanyuan Yin, Xiaoshu Luo, Gaurav Sant, and Mathieu Bauchy \ +___Engineering Research Express___. (2021). [_PDF_](https://iopscience.iop.org/article/10.1088/2631-8695/abe344/meta) + +Decoding the Genome of Cement by Gaussian Process Regression \ +Yu Song, __Yongzhe Wang__, Kaixin Wang, Mathieu Bauchy \ +___NeurIPS Workshop: Machine Learning for Engineering Modeling, Simulation, and Design___. (2020). (Conference paper) [_PDF_](https://ml4eng.github.io/camera_readys/38.pdf) + +Predicting Concrete’s Strength by Machine Learning: Balance between Accuracy and Complexity of Algorithms \ +Ouyang, Boya, Yu Song, Yuhai Li, Feishu Wu, Huizi Yu, __Yongzhe Wang__, Gaurav Sant, Mathieu Bauchy \ +___ACI Materials Journal___. (2020). [_PDF_](https://par.nsf.gov/biblio/10296333) + +## Abstract +Impact of universal germline testing on pathogenic/likely pathogenic variant frequency among breast cancer patients in Southern California \ +Jessica Dzubnar, __Yongzhe Wang__, Dana Aljaber, Christine Quinones, Jennifer Tseng, Ilana Solomon, Lorena Gonzalez, Stacy Gray, Veronica Jones \ +___17th AACR Conference on The Science of Cancer Health Disparities in Racial/Ethnic Minorities and the Medically Underserved___. (2024). (Poster presentation) [_PDF_](https://aacrjournals.org/cebp/article/33/9_Supplement/A020/748299/Abstract-A020-Impact-of-universal-germline-testing?searchresult=1) + +Health Insurance and Cervical Cancer Screening Uptake in Kenya: a Propensity Score Matched Analysis of the National Demographic and Health Survey 2022 \ +Ana I. Tergas, Narissa Nonzee, __Yongzhe Wang__, Myurajan Rubaharan, Brighton Odundo, John Malago, James M’Boya, Gabriela Escalante, Lauren Cai, Javier Gordon Ogembo, Lawrence Were \ +___36th International Papillomavirus Conference___. (2024). (Poster presentation) + +Changes in No-Show Rates after Implementation of an Oncology Navigation Enhancement Program Targeting Vulnerable Populations \ +Narissa J. Nonzee, __Yongzhe Wang__, Lauren Cai, Annette Mercurio, Lorena Gaytan, Marianne Razavi2, Deborah Lefkowitz, Terry Hernandez, Chrissy Kim, Steven Morales, Danilo Duque, Zeke Luna, William Dale and Ana I. Tergas \ +___AcademyHealth Annual Research Meeting___. (2024). (Poster presentation) [_PDF_](https://vmx.m-anage.com/us/2024arm/en-US/presentation/653912) + +Determinants of Time to Cancer Treatment Initiation at a Comprehensive Cancer Center \ +Narissa J. Nonzee, Marianne Razavi, William Dale, Lauren Cai, Annette Mercurio, __Yongzhe Wang__, Deborah Lefkowitz, Chrissy Kim, Terry Hernandez, Steven Morales, Danilo Duque, Ana I. Tergas \ +___American Association for Cancer Research 2024 Annual Meeting___. (2024). (Poster presentation) + +Evaluating Racial Disparities in Competing Cardiovascular and Cancer Outcomes after Breast Cancer \ +Alexi Vasbinder, Veronica Jones, Richard Kar-Hang Cheng, Ana Barac, __Yongzhe Wang__, Marcia Stefanick, Noah Simon, Aladdin Shadyab, Robert A. Wild, Reina Haque, Dorothy Lane, Xiaochen Zhang, Rami Nassir, Lisa Warsinger Martin, Michael Simon, and Kerryn Reding \ +___American College of Cardiology's 73th Annual Scientific Session & Expo___. (2024). (Poster presentation) [_PDF_](https://www.jacc.org/doi/epdf/10.1016/S0735-1097%2824%2904584-4) + +The Effect of Medicaid Expansion on the Receipt of Guideline-Concordant Care Among Patients with Cervical Cancer in the United States: a Difference-in-Difference Analysis of the NCDB \ +Andreea Dinicu, Narissa J. Nonzee, __Yongzhe Wang__, Yongmei Huang, Jason D. Wright, Ana I. Tergas \ +___Society of Gynecologic Oncology 2024 Annual Meeting on Women’s Cancer___. (2024). (Poster presentation) + +Disparities in cervical cancer survival among Hispanic subpopulations living in the United States \ +Andreea Dinicu, Shayan Dioun, __Yongzhe Wang__, Yongmei Huang, Jason Wright, Ana Tergas \ +___16th AACR Conference on The Science of Cancer Health Disparities in Racial/Ethnic Minorities and the Medically Underserved___. (2023). (Poster presentation) [_PDF_](https://doi.org/10.1158/1538-7755.DISP23-B112) + +Income Level with Death from Cardiovascular Disease after Breast Cancer in Women’s Health Initiative \ +Veronica Jones, __Yongzhe Wang__, Alexi Vasbinder, Noah Simon, Kerry Reding \ +___16th AACR Conference on The Science of Cancer Health Disparities in Racial/Ethnic Minorities and the Medically Underserved___. (2023). (Poster presentation) [_PDF_](https://doi.org/10.1158/1538-7755.DISP23-B014) + +Interrupted Time Series Design with Longitudinal Models: Evaluating a Multi-Site Intervention to Improve Colorectal Cancer Screening in Federally Qualified Health Centers in Los Angeles County \ +__Yongzhe Wang__, Kimlin Ashing, Haonan Zhang, Narissa Nonzee \ +___AcademyHealth Annual Research Meeting___. (2023). (Oral presentation) [_PDF_](https://academyhealth.confex.com/academyhealth/2023arm/meetingapp.cgi/Paper/59138) + +Prelimbic Neurons Differently Encode Changes in Metabolic or Threat States During Cued Food Seeking \ +Xu Zhang, Claire Cho, Guillermo Aquino-Miranda, __Yongzhe Wang__, Nikita Elinson-Watson, Fabricio Do Monte \ +___Modulation of Neural Circuits and Behavior Gordon Research Seminar___. (2023). (Poster presentation) + +Machine Learning to Identify Risk Factors for Cardiovascular Mortality after Breast Cancer \ +Alexi Vasbinder, Veronica Jones, __Yongzhe Wang__, Noah Simon, Richard K. Cheng, Ana Barac, Michael Simon, Kerryn Reding \ +___WHI Investigator Meeting___. (2023). (Oral presentation) + +Mobilizing Cancer Center-Community Partnerships to Improve Colorectal Cancer Screening among Underserved Populations during COVID-19 \ +Narissa J. Nonzee, __Yongzhe Wang__, Marisela Garcia, Sophia Yeung, Trilokesh D. Kidambi, Gregory Idos, Margaret Martinez, Charleen Mikail, Kim Tran, Paul Gregerson, Paul Round, Cherry Lee, Pei-Chi Wu, Nancy Chen, Stephen Denq, Kimlin T. Ashing \ +___American Society of Preventive Oncology 2023 Annual Meeting___. (2023). (Oral presentation) + +Community Outreach and Engagement to Increase Colorectal Cancer Screening in Ethnic Minority Communities \ +Kimlin T. Ashing, __Yongzhe Wang__, Marisela Garcia, Trilokesh Kidambi, Gregory Idos, Sophia Yeung, Charleen Mikail, Margaret Martinez, Scott Kim, Kim Tran, Paul Gregerson, Paul Round, Cherry Lee, Pei-Chi Wu, Nancy Chen, Stephen Denq, Narissa Nonzee \ +___American Association for Cancer Research 2023 Annual Meeting___. (2023). (Poster presentation) + +Health Behavioral Trends and Absenteeism Associated Before and During the SARS-CoV-2 Pandemic in a Community Household Study, King County, Washington \ +Erin Chung, Jessica Heimonen, Jessica A O’Hanlon, __Yongzhe Wang__, Eric J Chow, Constance E Ogokeh, Melissa A Rolfes, James Hughes, Timothy M Uyeki, Lea Starita, Janet A Englund, Helen Y Chu, Samara Hoag \ +___Open Forum Infectious Diseases___. (2022). (Poster presentation) [_PDF_](https://academic.oup.com/ofid/article/9/Supplement_2/ofac492.1531/6903759) + +Influenza Surveillance of Families in an Observational Household Study 2019-2021 \ +Tara M Babu, Amanda M Casto, Jessica Heimonen, __Yongzhe Wang__, Annie Emanuels, Eric J Chow, Samara Hoag, James Hughes, Constance E Ogokeh, Melissa A Rolfes, Timothy M Uyeki, Lea Starita, Janet A Englund, Helen Y Chu \ +___Open Forum Infectious Diseases___. (2022). (Poster presentation) [_PDF_](https://academic.oup.com/ofid/article/9/Supplement_2/ofac492.1821/6903881) + +Streptococcus Pneumoniae Nasal Carriage Patterns with and without Common Respiratory Viruses in Seattle, WA, USA \ +Julia C. Bennett, Anne Emanuels, Jessica Heimonen, Jessica O’Hanlon, James P. Hughes, Peter D. Han, __Yongzhe Wang__, Denise J. McCulloch, Eric J. Chow, Constance E. Ogokeh, Melissa A. Rolfes, Timothy M. Uyeki, Jay Shendure, Lea M. Starita, Janet A. Englund, Helen Y. Chu \ +___12th International RSV Symposium___. (2022). (Poster presentation) [_PDF_](https://isirv.org/site/images/conferences/RSV/RSV2022/RSV_2022_Abstracts_POSTERS%20Rev%20Dec22.pdf) diff --git a/README.md b/README.md index 5e640626279..a845d9f2fb5 100644 --- a/README.md +++ b/README.md @@ -1,99 +1,5 @@ --- -title: "About" -permalink: "/about/" layout: page +title: "About Me" --- -## Installation - -Just fork this [repository](https://github.com/niklasbuschmann/contrast) and adjust the `_config.yml` to use with [Github Pages](https://pages.github.com/) and your page is done. - -## Features - - - supports dark mode on macOS Mojave - - optional sidebar - - MathJax support - - no external ressources - - included archive page - - supports pagination - - feed generation - - responsive - - syntax highlighting - - supports comments via [disqus](https://disqus.com/) or [isso](http://posativ.org/isso/) - -## Based on - -- [Hyde](https://github.com/poole/hyde) -- [Minima](https://github.com/jekyll/minima) -- [Lagrange](https://github.com/LeNPaul/Lagrange) -- [Font Awesome](http://fontawesome.io/) -- [KaTeX](https://katex.org/) -- [Pygments](https://github.com/richleland/pygments-css) - -## Installation (jekyll-remote-theme method) - -You can use this theme with the `jekyll-remote-theme` plugin. Just create an empty repo, copy over the `index.html` file and add this to your `_config.yml`: - -```yaml -remote_theme: niklasbuschmann/contrast@v2.11 - -plugins: - - jekyll-remote-theme -``` - -Note: to enable icons you also need to copy over the `_data` folder. - -## Config - -Your `_config.yml` could for example look like this: - -```yaml -title: "Blog Title" -author: "Blog Author" -description: "My personal blog about ... something" -permalink: /:title/ -lang: "en" -excerpt_separator: "\n\n\n" -date_format: "%B %d, %Y" - -# Layout - -show_excerpts: true # show article excerpts on the home page -show_frame: true # adds a gray frame to the site -show_sidebar: false # show a sidebar instead of the usual header - -# Menu - -navigation: # accepts {file, title, url, icon, sidebaricon} - - {file: "index.html"} - - {file: "README.md"} - -external: # shows a footer with social links - for available icons see fontawesome.com/icons - - {title: Mail, icon: envelope, url: "mailto:niklasbuschmann@users.noreply.github.com"} - - {title: Github, icon: github, url: "https://github.com/niklasbuschmann/contrast"} - - {title: Subscribe, icon: rss, url: "/feed.xml"} - -comments: -# disqus_shortname: "" # see https://disqus.com/ -# isso_domain: "" # see https://posativ.org/isso/ - -plugins: - - jekyll-feed - -``` - -## MathJax - -Contrast comes preinstalled with a leightweight alternative to MathJax called [KaTeX](https://katex.org/). To display equations in a post simply set `mathjax: true` in the article's front matter. - -## License - -[public domain](http://unlicense.org/) - -## Screenshots - -![screenshot](https://user-images.githubusercontent.com/4943215/109431850-cd711780-7a08-11eb-8601-2763f2ee6bb4.png) - -![screenshot](https://user-images.githubusercontent.com/4943215/109431832-b6cac080-7a08-11eb-9c5e-a058680c23a1.png) - -![screenshot](https://user-images.githubusercontent.com/4943215/73125194-5f0b8b80-3fa4-11ea-805c-8387187503ad.png) diff --git a/_config.yml b/_config.yml index d13c687b5b6..6ce4c8624d5 100644 --- a/_config.yml +++ b/_config.yml @@ -1,6 +1,6 @@ -title: "Blog Title" -author: "Blog Author" -description: "Made with <3" +title: "Visual" +author: "Visual" +description: "Visual" permalink: /:title/ lang: "en" excerpt_separator: "\n\n\n" @@ -10,19 +10,17 @@ date_format: "%B %d, %Y" show_excerpts: true # show article excerpts on the home page show_frame: true # adds a gray frame to the site -show_sidebar: false # show a sidebar instead of the usual header -minimal: false # use a dark header +show_sidebar: true # show a sidebar instead of the usual header # Menu navigation: # accepts {file, title, url, icon, sidebaricon} - - {file: "index.html"} - - {file: "README.md"} + - {file: "index.html", title: "Data Visualization", icon: arrow-right} + + external: # shows a footer with social links - for available icons see fontawesome.com/icons - - {title: Mail, icon: envelope, url: "mailto:niklasbuschmann@users.noreply.github.com"} - - {title: Github, icon: github, url: "https://github.com/niklasbuschmann/contrast"} - - {title: Subscribe, icon: rss, url: "/feed.xml"} + - {title: Mail, icon: envelope, url: "mailto:yonwang@coh.org"} comments: # disqus_shortname: "" # see https://disqus.com/ diff --git a/_posts/2017-01-01-advanced-examples.md b/_posts/2017-01-01-advanced-examples.md deleted file mode 100644 index 785d05464b8..00000000000 --- a/_posts/2017-01-01-advanced-examples.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -title: "Advanced examples" -mathjax: true -layout: post -categories: media ---- - -![Swiss Alps](https://user-images.githubusercontent.com/4943215/55412536-edbba180-5567-11e9-9c70-6d33bca3f8ed.jpg) - - -## MathJax - -You can enable MathJax by setting `mathjax: true` on a page or globally in the `_config.yml`. Some examples: - -[Euler's formula](https://en.wikipedia.org/wiki/Euler%27s_formula) relates the complex exponential function to the trigonometric functions. - -$$ e^{i\theta}=\cos(\theta)+i\sin(\theta) $$ - -The [Euler-Lagrange](https://en.wikipedia.org/wiki/Lagrangian_mechanics) differential equation is the fundamental equation of calculus of variations. - -$$ \frac{\mathrm{d}}{\mathrm{d}t} \left ( \frac{\partial L}{\partial \dot{q}} \right ) = \frac{\partial L}{\partial q} $$ - -The [Schrödinger equation](https://en.wikipedia.org/wiki/Schr%C3%B6dinger_equation) describes how the quantum state of a quantum system changes with time. - -$$ i\hbar\frac{\partial}{\partial t} \Psi(\mathbf{r},t) = \left [ \frac{-\hbar^2}{2\mu}\nabla^2 + V(\mathbf{r},t)\right ] \Psi(\mathbf{r},t) $$ - -## Code - -Embed code by putting `{{ "{% highlight language " }}%}` `{{ "{% endhighlight " }}%}` blocks around it. Adding the parameter `linenos` will show source lines besides the code. - -{% highlight c %} - -static void asyncEnabled(Dict* args, void* vAdmin, String* txid, struct Allocator* requestAlloc) -{ - struct Admin* admin = Identity_check((struct Admin*) vAdmin); - int64_t enabled = admin->asyncEnabled; - Dict d = Dict_CONST(String_CONST("asyncEnabled"), Int_OBJ(enabled), NULL); - Admin_sendMessage(&d, txid, admin); -} - -{% endhighlight %} - -## Gists - -With the `jekyll-gist` plugin, which is preinstalled on Github Pages, you can embed gists simply by using the `gist` command: - - - -## Images - -Upload an image to the *assets* folder and embed it with `![title](/assets/name.jpg))`. Keep in mind that the path needs to be adjusted if Jekyll is run inside a subfolder. - -A wrapper `div` with the class `large` can be used to increase the width of an image or iframe. - -![Flower](https://user-images.githubusercontent.com/4943215/55412447-bcdb6c80-5567-11e9-8d12-b1e35fd5e50c.jpg) - -[Flower](https://unsplash.com/photos/iGrsa9rL11o) by Tj Holowaychuk - -## Embedded content - -You can also embed a lot of stuff, for example from YouTube, using the `embed.html` include. - -{% include embed.html url="https://www.youtube.com/embed/_C0A5zX-iqM" %} diff --git a/_posts/2017-02-01-markdown-examples.md b/_posts/2017-02-01-markdown-examples.md deleted file mode 100644 index 072a700e0ea..00000000000 --- a/_posts/2017-02-01-markdown-examples.md +++ /dev/null @@ -1,85 +0,0 @@ ---- -title: "Markdown examples" -layout: post ---- - -Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. - -Curabitur pretium tincidunt lacus. Nulla gravida orci a odio. Nullam varius, turpis et commodo pharetra, est eros bibendum elit, nec luctus magna felis sollicitudin mauris. Integer in mauris eu nibh euismod gravida. Duis ac tellus et risus vulputate vehicula. Donec lobortis risus a elit. - - -## Heading Two (h2) - -### Heading Three (h3) - -#### Heading Four (h4) - -##### Heading Five (h5) - -###### Heading Six (h6) - - -## Blockquotes - -### Single line - -> My mom always said life was like a box of chocolates. You never know what you're gonna get. - -### Multiline - -> What do you get when you cross an insomniac, an unwilling agnostic and a dyslexic? -> -> You get someone who stays up all night torturing himself mentally over the question of whether or not there's a dog. -> -> – _Hal Incandenza_ - -## Horizontal Rule - ---- - -## Table - -| Title 1 | Title 2 | Title 3 | Title 4 | -|------------------|------------------|-----------------|-----------------| -| First entry | Second entry | Third entry | Fourth entry | -| Fifth entry | Sixth entry | Seventh entry | Eight entry | -| Ninth entry | Tenth entry | Eleventh entry | Twelfth entry | -| Thirteenth entry | Fourteenth entry | Fifteenth entry | Sixteenth entry | - -## Code - -Source code can be included by fencing the code with three backticks. Syntax highlighting works automatically when specifying the language after the backticks. - -```` -```javascript -function foo () { - return "bar"; -} -``` -```` - -This would be rendered as: - -```javascript -function foo () { - return "bar"; -} -``` - -## Lists - -### Unordered - -* First item -* Second item -* Third item - * First nested item - * Second nested item - -### Ordered - -1. First item -2. Second item -3. Third item - 1. First nested item - 2. Second nested item diff --git a/_posts/2017-03-01-welcome-to-jekyll.md b/_posts/2017-03-01-welcome-to-jekyll.md deleted file mode 100644 index 0b02eb48b78..00000000000 --- a/_posts/2017-03-01-welcome-to-jekyll.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: "Welcome to Jekyll" -layout: post ---- - -You’ll find this post in your `_posts` directory. Go ahead and edit it and re-build the site to see your changes. You can rebuild the site in many different ways, but the most common way is to run `jekyll serve`, which launches a web server and auto-regenerates your site when a file is updated. - - -To add new posts, simply add a file in the `_posts` directory that follows the convention `YYYY-MM-DD-name-of-post.ext` and includes the necessary front matter. Take a look at the source for this post to get an idea about how it works. - -Jekyll also offers powerful support for code snippets: - -{% highlight ruby %} -def print_hi(name) - puts "Hi, #{name}" -end -print_hi('Tom') -#=> prints 'Hi, Tom' to STDOUT. -{% endhighlight %} - -Check out the [Jekyll docs][jekyll-docs] for more info on how to get the most out of Jekyll. File all bugs/feature requests at [Jekyll’s GitHub repo][jekyll-gh]. If you have questions, you can ask them on [Jekyll Talk][jekyll-talk]. - -[jekyll-docs]: http://jekyllrb.com/docs/home -[jekyll-gh]: https://github.com/jekyll/jekyll -[jekyll-talk]: https://talk.jekyllrb.com/ diff --git a/_posts/2021-10-30-Forest-Plot.md b/_posts/2021-10-30-Forest-Plot.md new file mode 100644 index 00000000000..221ac15571e --- /dev/null +++ b/_posts/2021-10-30-Forest-Plot.md @@ -0,0 +1,255 @@ +--- +title: "Forest Plots in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + +![Forest Plot](https://raw.githubusercontent.com/YzwIsALaity/Forest-Plot-Tutorial-in-R/cc64ccb0e0922c9553c9e2aeb64cf090644e2d1b/Version%203.0.jpeg) + + +# 1. Format of dataset for forest plot +We will use __ggplot2__ to make a __forest plot__ for estimated odds ratios from logistic regression models and the ggplot2 requires a specific format of the dataset. __We require three packages `ggplot2`, `gridExtra`, and `scales`.__ In this dataset, we have 6 columns + +- `Variable`: __covariates__ in logistic regression models (numerical); + +- `Time`: time points on month for different outcomes (numerical); + +- `OR`: estimated __odds ratios__ for `Variable` (numerical); + +- `Lower` and `Upper`: lower and upper bounds in __95% confidence intervals__ for estimated odds ratios (numerical); + +- `Summary`: __sentences__ combined OR and 95% CI for odds ratios (string); + +- `Index`: it is a sequence of ordered number used for the plot and its length is equal to the number of `Variable`. + +![](https://raw.githubusercontent.com/YzwIsALaity/Forest-Plot-Tutorial-in-R/eb8887ed67d5da87e9bf5fc69895c7459390ec5d/Dataset%20Shape.png) + +In the ggplot, no matter what kind of plots we make, they are basically composed of two parts: X-axis and Y-axis. __Since we want to create a forest plot for odds ratio, we set X-axis as__ `Variable` __and Y-axis as__ `OR`. + +# 2. Version 0.0 +When we determined which variables should be placed in X-axis or Y-axis, we then need to choose what "values" should be displayed. Typically, these "values" are in numerical format and we also need to choose what types of plots we want to show (e.g. point, line, bar, tile, box with whisker, etc.). A forest plot consists of a center (`OR`) and two whiskers (`Lower` and `Upper`). + +```{r} +# Version 0.0 +ggplot(Plot.OR.Mat.6, aes(x = OR, y = Variable)) + # x is for X-axis | y is for Y-axis + geom_point() + # a function for plotting points + geom_errorbarh(aes(xmin = Lower, xmax = Upper)) # a function for plotting two whiskers +``` + + +![](https://raw.githubusercontent.com/YzwIsALaity/Forest-Plot-Tutorial-in-R/33ff981aaeb260a5f32916e05fa769b35aadf931/Version%200.0.jpeg) + +So it basically looks like the above one. But we can polish it and eventually attach `Summary` with corresponding `Variable`. + +# 3. Version 1.0 +The main elements in this plot are from two functions: + +- `geom_point()`: this function is for points (OR in our example) and it can use different commands to modify the point (e.g. __shape__ [`shape = 18`], __size__ [`size = 3`], __color__ [`col = 'black'`], etc.); + +- `geom_errorbarh()`: this function for horizontal error bars which are two whiskers in a 95% confidence interval and it requires providing __minimum and maximum for the "value"__ that we want to pass into (e.g. `geom_errorbarh(aes(xmin = Lower, xmax = Upper))`). We also can change the length of the __vertical tick__ on two sides (e.g. `height = 0.25`). + +Meanwhile, plots for odds ratio are often displayed under log scale and we can also easily change the scale system in ggplot with the command `scale_x_continuous()` (numerical value are in X-axis) or `scale_y_continuous()` (numerical value are in y-axis). In our example, the numerical values are in X-axis so we use + +- `scale_x_continuous()`: we use the command `trans = 'log'` (for __log transformation__) inside the function and set up the __scale range of X-axis__ with the command `limits = c(0.005, 13)`(lower, upper). Since the log transformation will bring up lots of digits for numerical values, we can pass the `label_number()` to round digits into __2 decimal places of precision__. + +We can combine them with the __X-axis label__ [`xlab()`] and the __main title__ [`ggtitle()`] to see what it looks like. + +```{r} +# Version 1.0 +ggplot(Plot.OR.Mat.6, aes(x = OR, y = Variable)) + # x is for X-axis | y is for Y-axis + geom_point(shape = 18, size = 3) + # a function for plotting points + geom_errorbarh(aes(xmin = Lower, xmax = Upper), # a function for plotting two whiskers + height = 0.25) + # the length of vertical ticks + scale_x_continuous(trans = 'log', # log transformation for values in X-axis + limits = c(0.005, 13), # range of X-axis in original scale + labels = label_number()) + # round digits + xlab("Odds Ratio (95% CI, log scale)") + # X-axis label + ggtitle('6-month Timepoint') # title of plots +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Forest-Plot-Tutorial-in-R/477bde6d42ba41b9c3c71f0dcb2095ee29538463/Version%201.0.jpeg) + +In the Version 1.0, we can find that the gray background and grid may not be good for presenting the plot and texts in two axes are small. + +# 3. Version 2.0 +So in the next step we are going to improve the background and texts under the Version 1.0 and we are going to introduce some functions and corresponding commands. + +- `theme_bw()`: this is a dark-on-light theme and better for presentations. + +- `theme()`: this is a general function for all non-data components in a plot so we can use it to modify any texts, labels, backgrounds, etc. Within this function, we are going to introduce some commands. + + + `panel.border`, `panel.background`, `panel.grid.major`, and `panel.grid.minor`: these four are used to __control the background and gird of a plot__. since we want to give it a blank background and grid, we will set + + `panel.border = element_blank(),`\ + `panel.background = element_blank(),`\ + `panel.grid.major = element_blank(),`\ + `panel.grid.minor = element_blank()`. + + In here, the function `element_blank()` is to __display nothing for non-data elements__. + + + `axis.line.x` and `axis.line.y`: they are used to __control the axis line__ in X-axis and Y-axis. In the forest plot, the axis line for Y-axis is blank so we will set + + `axis.line.y = element_blank()`. + + + `axis.ticks.x` and `axis.ticks.y`: they are used to __control the small tick__ in X-axis and Y-axis. In the forest plot, the ticks for X-axis and Y-axis are blank so we will set + + `axis.line.y = element_blank(),`\ + `axis.line.y = element_blank()`. + + + `axis.text.x` and `axis.text.y`: they are used to __modify texts in two axes__. Since we want to enlarge the font of texts in Y-axis so we will set + + `axis.text.y = element_text(colour = "black", size = 11)`. + + The function `element_text()` is to __modify texts__ in a plot and it provides a brunch of commands for font size, color, position, face, etc. + + + `plot.title`: this is to __modify the title__. We want to align the title on the left side and make the font to be bold, so we will set + + `plot.title = element_text(hjust = -0.86, face="bold")`. + +We also want to move the title for Y-axis and add a vertical line for OR = 1 (`geom_vline(xintercept = 1, color = "red", linetype = "dashed", cex = 0.5, alpha = 0.5)`). Now we can combine them all together! + +```{r} +# Version 2.0 +p1 <- +ggplot(Plot.OR.Mat.6, aes(x = OR, y = Variable)) + # x is for X-axis | y is for Y-axis + geom_point(shape = 18, size = 3) + # a function for plotting points + geom_errorbarh(aes(xmin = Lower, xmax = Upper), # a function for plotting two whiskers + height = 0.25) + # the length of vertical ticks + scale_x_continuous(trans = 'log', # log transformation for values in X-axis + limits = c(0.005, 13), # range of X-axis in original scale + labels = label_number()) + # round digits + xlab("Odds Ratio (95% CI, log scale)") + # X-axis label + ggtitle('6-month Timepoint') + # title of plots + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), # these four are for the background and grid + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_blank(), + axis.text.x = element_text(colour = "black", size = 11), # there two are for texts in axes + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_blank(), + axis.title.x = element_text(), # these two are for titles in axes + axis.title.y = element_blank(), + plot.title = element_text(hjust = -0.86, face = "bold")) + # the main title + # hjust is for title position + geom_vline(xintercept = 1, # the position of vertical line + color = "red", # the color of line + linetype = "dashed", # the type of line + # we used dashed line in here + alpha = 0.5) # the transparency level of the line +p1 +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Forest-Plot-Tutorial-in-R/66d0209a407a273f78a5d92f86d99a9d670e2bf5/Version%202.0.jpeg) + +This version looks more clear and tidy! We can try our last step, that is to combine the `Summary` for each variable. + +# 4. Version 3.0 +In the last version, we are going to combine the `Summary` and the main plot together. The basic logic of the combination is to put two plots together--the main plot is the Version 2.0 and another empty plot only contains `Summary`. First, we are going to create an empty plot which __attaches__ `Summary` __on its right hand side__. We first create an empty plot. + +```{r} +table_base <- + ggplot(Plot.OR.Mat.6, aes(y = Variable)) + # everything in this plot is empty + ylab(NULL) + xlab('') + ggtitle('') + + xlim(0, 13) + # make sure this is the same as p1 + theme(panel.border = element_blank(), + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_blank(), + axis.line.y = element_blank(), + axis.text.x = element_text(color = "white", hjust = -3, size = 11), + axis.text.y = element_blank(), + axis.ticks.x = element_blank(), + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_blank(), + plot.title = element_text(hjust = -0.86, face = "bold")) # make sure hjust is the same as p1 +## OR point estimate table +tab1 <- table_base + geom_text(aes(y = rev(Index), x = 1, label = Summary), size = 4, hjust = 0, vjust = -0.5) +``` + +The `tab1` is an empty plot only including `Summary` and we need to arrange its position and combine it with the main plot `p1`. Hence, we need to use the function `grid.arrange()`. + +- `grid.arrange()`: we will put the main plot `p1` first and then `tab1`. The commands, `nrow` and `ncol`, are used to arrange positions of two plots. We will set + `nrow = 1` and `ncol = 2` which means we want to horizontally put them together where `p1` is in the left hand side and `tab1` is in the right hand side. + +Eventually, the final plot will be better than previous versions. + +```{r} +# eventually we attach the summary to the plot +grid.arrange(p1, tab1, + nrow = 1, ncol = 2) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Forest-Plot-Tutorial-in-R/cc64ccb0e0922c9553c9e2aeb64cf090644e2d1b/Version%203.0.jpeg) + +This is the final version! If we want to print it out from R, __we can always change its size when we save the figure to make sure title is center or aligned left__. + +# 5. Version 4.0 +This is a stacked version of forest plots. __However, this time we need to remove everything in the X-axis for the plot at the top of a stack__. + +```{r} +# Version 4.0 +p2 <- +ggplot(Plot.OR.Mat.6, aes(x = OR, y = Variable)) + # x is for X-axis | y is for Y-axis + geom_point(shape = 18, size = 3) + # a function for plotting points + geom_errorbarh(aes(xmin = Lower, xmax = Upper), # a function for plotting two whiskers + height = 0.25) + # the length of vertical ticks + scale_x_continuous(trans = 'log', # log transformation for values in X-axis + limits = c(0.005, 13), # range of X-axis in original scale + labels = label_number()) + # round digits + ggtitle('12-month Timepoint') + # title of plots + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), # these four are for the background and grid + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_blank(), # these two are for the axis line + axis.line.y = element_blank(), + axis.text.x = element_blank(), + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_blank(), # these two are for ticks in axes + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), # these two are for titles in axes + axis.title.y = element_blank(), + plot.title = element_text(hjust = -0.065, face = "bold")) + # the main title + # hjust is for title position + geom_vline(xintercept = 1, # the position of vertical line + color = "red", # the color of line + linetype = "dashed", # the type of line + # we used dashed line in here + alpha = 0.5) # the transparency level of the line +# This is the empty +table_base2 <- + ggplot(Plot.OR.Mat.6, aes(y = Variable)) + # everything in this plot is empty + ylab(NULL) + xlab('') + ggtitle('') + + xlim(0, 13) + # make sure this is the same as p1 + theme(panel.border = element_blank(), + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_blank(), + axis.line.y = element_blank(), + axis.text.x = element_text(color = "white", hjust = -3, size = 11), + axis.text.y = element_blank(), + axis.ticks.x = element_blank(), + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_blank(), + plot.title = element_text(hjust = -0.065, face = "bold")) # make sure hjust is the same as p1 + +## OR point estimate table +tab2 <- table_base2 + geom_text(aes(y = Index, x = 1, label = Summary), size = 4, hjust = 0, vjust = 1.35) +# eventually we attach all together! +grid.arrange(p2, tab2, + p1, tab1, + nrow = 2, ncol = 2) +``` +![](https://raw.githubusercontent.com/YzwIsALaity/Forest-Plot-Tutorial-in-R/ddd1c3a8b85da7319432c60195ec29a32138fc48/Version%204.0.jpeg) + +__After we remove the X-axis for the top plot__, we need to __align the__ `Summary` __with rows in the plot__ and this can be modified by `geom_text(aes(y = Index, x = 1, label = Summary), size = 4, hjust = 0, vjust = 1.35)` (need to modify `vjust = `: modify vertical distance). diff --git a/_posts/2021-12-16-Radar-Chart.md b/_posts/2021-12-16-Radar-Chart.md new file mode 100644 index 00000000000..b78834fe4ed --- /dev/null +++ b/_posts/2021-12-16-Radar-Chart.md @@ -0,0 +1,189 @@ +--- +title: "Radar Chart in R with fmsb and scales" +mathjax: true +layout: post +categories: media +--- + +![Radar Chart](https://raw.githubusercontent.com/YzwIsALaity/Radar-Chart-Tutorial-in-R/1e5dc30a259b09e851af3e60d2231e52b84a03d5/Radar%20Plot%20(Multiple).jpeg) + + + +This is a short tutorial for creating a __radar chart/spider chart/star chart__ and it requires `fmsb` and `scales` packages. A radar chart is a good tool for visualizing multivariate data that is shared among similar groups/participants so it is good to visualize life performance metrics (e.g. EQ-5D-5L, EQ-5D-3L, etc.). We will take a dataset with __EQ-5D-5L__ as an example in this tutorial. + +The EQ-5D-5L is a health-related quality of life measurement and it includes __five dimensions: mobility (MO), self-care (SC), usual activities (UA), pain/discomfort (PD), and anxiety/depression (AD)__. For each dimension, it has five ordinal levels: + +- 1: no problem; + +- 2: slight; + +- 3: moderate; + +- 4: severe; + +- 5: extreme. + +Each participant in the dataset will provide scores for five dimensions. + +## 1. Format of dataset +The original dataset has seven columns: + +- `PTID`: an __unique participant's identification__ (string); + +- `MO`, `SC`, `UA`, `PD`, and `AD`: __scores__ for five dimensions (numerical); + +- `Group`: groups that participants are belonged to [four groups: __Healthy Control, Infected Patients, Long-term Symptoms, Short-term Symptoms__]. + +![](https://raw.githubusercontent.com/YzwIsALaity/Radar-Chart-Tutorial-in-R/1e5dc30a259b09e851af3e60d2231e52b84a03d5/Dataset%20Shape.jpeg) + +In the radar chart, we want to __show average levels of five dimensions for four groups__ so we need to do data preprocessing first. We will `dplyr` and `tidyr` packages for this process. + +```{r} +# Create a table with averages for each measurement +Dt.Summary <- + Dt %>% + group_by(Group) %>% + summarise(UA.Avg = mean(UA), + SC.Avg = mean(SC), + AD.Avg = mean(AD), + PD.Avg = mean(PD), + MO.Avg = mean(MO)) +kable(Dt.Summary) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Radar-Chart-Tutorial-in-R/1e5dc30a259b09e851af3e60d2231e52b84a03d5/Final%20Dataset%20Shape%201.jpeg) + +After data preprocessing, the dataset used for creating a radar chart will only include mean scores of five dimensions for different groups. However, the required format of the dataset for radar chart, using the function in `fmsb` package, need to __attach two rows at the top of the above dataset--one is for minimum score and another is for maximum score__. + +![](https://raw.githubusercontent.com/YzwIsALaity/Radar-Chart-Tutorial-in-R/1e5dc30a259b09e851af3e60d2231e52b84a03d5/Final%20Dataset%20Shape%202.jpeg) + +The above one is the __final dataset__ that we are going to use for creating a radar chart. + +## 2. Single radar plot +We first create a single radar chart with the final dataset and are going to use the `radarchart()` function. So we will introduce some useful commands within the main function: + +- `axistype`: this is used to choose the __type of axes__ and its default is 0. + * `axistype = 1` = center axis label + * `axistype = 2` = around-the-chart label + * `axistype = 3` = both center and around-the-chart (peripheral) labels + * `axistype = 4` = center axis label with two-digit number + * `axistype = 5` = both center and around-the-chart (peripheral) labels with two-digit number + +- `pcol`: this is a __vector/number of color codes for plot data__. + +- `pfcol`: this is a __vector/number of color codes for filling polygons__. + +- `plwd`: this is a __vector/number of line widths for plot data__. + +- `plty`: this is a __vector/number of line types for plot data__. + +- `cglcol`: this is used to choose __line color__ for radar grids. + +- `cglty`: this is used to choose __line type__ for radar grids. + +- `cglwd`: this is used to choose __line width__ for radar grids. + +- `axislabcol`: this is used to choose __color of axis label and numbers__. + +- `vlabels`: this is a __character vector for the names for variables__. + +- `vlcex`: this is to choose __font size magnification for__ `vlabels`. + +```{r} +# the name of each group used in legend #################################################################### +Group <- c('Healthy Control', 'Infected Patients', 'Long-term Symptoms', 'Short-term Symptoms') + +# the choice of color for each group ####################################################################### +ColorOfGroup <- c("#00AFBB", "#E7B800", "#FC4E07", 'gray') + +# the name of each vertex ################################################################################## +EQ5D5L <- c('Usual\nActivities', 'Self-care', 'Anxiety/\nDepression', 'Pain/\nDiscomfort', 'Mobility') + +# Plot ###################################################################################################### +# op is used to control the margin of radar chart +op <- par(mar = c(1,2,2,2)) +# the main function for figure and choices of parameters +radarchart( + df = Dt.Summary.2[, 2:6], axistype = 1, + # Customize the polygon + pcol = ColorOfGroup, + pfcol = alpha(ColorOfGroup, 0.1), # alpha() is used to control the transparency of color + plwd = 2, plty = 1, + # Customize the grid + cglcol = "black", cglty = 1, cglwd = 0.8, + # Customize the axis + axislabcol = "black", caxislabels = 1:5, calcex = 0.7, + # Variable labels + vlcex = 1.25, vlabels = EQ5D5L + ) +# Add an horizontal legend +legend("right", # we put the legend on the right of the radar chart but one can also put + # it at the bottom of the figure with the command ["bottom", horiz = TRUE] + legend = Group, # set up the text for legend + col = ColorOfGroup, # corresponding colors for groups + text.col = "black", cex = 1.25, pt.cex = 1.25, bty = "n", pch = 20 + ) +par(op) # this command is used to perform the setup of margin for radar chart +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Radar-Chart-Tutorial-in-R/1e5dc30a259b09e851af3e60d2231e52b84a03d5/Radar%20Plot%20(Single).jpeg) + +Since the `radarchart()` is based on the basic `plot()` in the R, __all non-data components (e.g. point, line, text, color, etc.) follow the guidance of__ `plot()`. This means parameter choices for `pcol`, `pfcol`, `cglcol`, ..... follow the guidance of `plot()`. + +## 3. Multiple radar charts +In the next step, we are going to put multiple radar charts together if they all share the same legend. This is one is a little bit complex than the single one but it is still acceptable. __The basic logic for putting multiple radar charts in a row is first to create different radar charts separately and then attach an empty plot including the legend only to a row of radar plots.__ + +```{r} +# arrange positions for multiple radar charts and legends ################################################### +par(mfrow = c(1, 3)) # mfrow = c(1,3) means we want to put plots/legends in 1 row and each plot/legend + # occupies 1 column so totally 3 columns (2 radar charts + 1 legend) +# op is used to control the margin of radar chart ########################################################### +op <- par(mar = c(1,2,2,1)) # OK to omit this one + +# First radar chart ######################################################################################### +radarchart( + df = Dt.Summary.2[, 2:6], axistype = 1, + # Customize the polygon + pcol = color, pfcol = alpha(color, 0.1), plwd = 2, plty = 1, + # Customize the grid + cglcol = "black", cglty = 1, cglwd = 0.8, + # Customize the axis + axislabcol = "black", + # Variable labels + vlcex = 1.35, vlabels = EQ5D5L, + caxislabels = 1:5, calcex = 0.7 + ) + +# Second radar chart ######################################################################################### +radarchart( + df = Dt.Summary.2[, 2:6], axistype = 1, + # Customize the polygon + pcol = color, pfcol = alpha(color, 0.1), plwd = 2, plty = 1, + # Customize the grid + cglcol = "black", cglty = 1, cglwd = 0.8, + # Customize the axis + axislabcol = "black", + # Variable labels + vlcex = 1.35, vlabels = EQ5D5L, + caxislabels = 1:5, calcex = 0.7 + ) + +# Add an horizontal legend ################################################################################### +# control the margin of radar chart and OK to omit it +par(mar = c(0, 0, 0, 0)) +# Create empty plot +plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") +# Attach the legend to the empty plot +legend('left', # this 'left' means we want to put the legend on the left side of the empty plot + # and this is close to the 2 radar charts + legend = c('Healthy Control', 'Infected Patients', 'Long-term Symptoms', 'Short-term Symptoms'), + col = c("#00AFBB", "#E7B800", "#FC4E07", 'gray'), + pch = 20, cex = 1.35, pt.cex = 1.35, bty = "n") + +par(op) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Radar-Chart-Tutorial-in-R/1e5dc30a259b09e851af3e60d2231e52b84a03d5/Radar%20Plot%20(Multiple).jpeg) + +The above is the final version of radar charts and similar to other plots in R, the final size and resolution of the figure can be adjusted when you are going to output it from R. Since the `radarchart()` in `fmsb` package is based on the basic `plot()` function in R, it provides users more flexible way to make choices for non-data components (e.g. legends, texts, colors, etc.). + diff --git a/_posts/2022-01-28-Alluvium-Plot.md b/_posts/2022-01-28-Alluvium-Plot.md new file mode 100644 index 00000000000..4412900542b --- /dev/null +++ b/_posts/2022-01-28-Alluvium-Plot.md @@ -0,0 +1,140 @@ +--- +title: "Alluvium Plot in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + +![Alluvium Plot](https://raw.githubusercontent.com/YzwIsALaity/Alluvium-Plot-Tutorial-in-R/fe9b191c90b75e92682db4389f8b46ef17bf20ba/p3.jpeg) + + +This is a short tutorial for creating __alluvium plots in R__ with `ggplot2`. An alluvium plot is used to show the change/trend/development of subjects regarding single/multiple variables over time flow. Beside `ggplot2`, we require to use `ggalluvial` package in our plots and will provide two examples of alluvium plots. + +## 1. Example: Counts of specialists in different timepoints +The first dataset we used is a follow-up study with a __cohort of 200 patients__. They reported what types of specialists they sought for at __4 different timepoints of visits (repeated measure)__. Hence the dataset has three columns: + +- `PTID`: __unique identification__ for each patient (200 participants, string); + +- `Time`: this covariate provides different __timepoints of visits__ and has __four levels [Acute, 6-month, 12-month, 24-month]__ (string); + +- `Specialist`: this covariate provides different __types of specialists__ and has __five levels [None, Neurology, Rheumatology, Infectious Disease, Pulmonology]__ (numerical). + +![](https://raw.githubusercontent.com/YzwIsALaity/Alluvium-Plot-Tutorial-in-R/fe9b191c90b75e92682db4389f8b46ef17bf20ba/Dataset_1.jpeg) + +There are some new functions we are going to use for creating a alluvium plot beside the `ggplot()`. __In `ggplot()`, we first set up the X-axis as `x = Time` and fill out stratum with its color as `stratum = Specialist, fill = Specialist`. Since each participant's choice of specialist are changed and recorded over `Time`, the alluvium in the plot should each unique participant and we will set `alluvium = PTID`. Eventually, we set the legend label as `label = Specialist`.__ + +- `geom_flow()`: this function is used to create __lodes__ of an alluvial plot. + * In this example, __lodes__ are just flow curves in the plot. If we just single call this function, it looks like below + +```{r} +# Set time as a factor +Dt$Time <- factor(Dt$Time, levels = c('Acute', '6-month', '12-month', '24-month')) +# Plot +ggplot(Dt, aes(x = Time, stratum = Specialist, alluvium = PTID, fill = Specialist, label = Specialist)) + + geom_flow(stat = "alluvium") +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Alluvium-Plot-Tutorial-in-R/fe9b191c90b75e92682db4389f8b46ef17bf20ba/p1.jpeg) + +- `geom_stratum()`: this function is used to create __strata__ of an alluvial plot. + * In this example, each __stratum__ represents the amount of participants seeking for a type of specialists. + +```{r} +ggplot(Dt, aes(x = Time, stratum = Specialist, alluvium = PTID, fill = Specialist, label = Specialist)) + + geom_stratum(alpha = 0.7) # "alpha = 0.7" is used to control the transparency of color filled in strata +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Alluvium-Plot-Tutorial-in-R/fe9b191c90b75e92682db4389f8b46ef17bf20ba/p2.jpeg) + +- `scale_fill_brewer()`: this function is used to provide sequential ('seq'), diverging ('div') and qualitative ('qual') __color schemes__ in `ggplot2`. + * `type = 'div'` means we want diverging color scheme (colors are different from each other apparently) and `palette = "Set2"` means we want to use the "Set2" palette. + +We can combine together to see what we can get! This time we are going to remove the background and grid and __use "Times New Roman" font for every text/label__ in the plot (`element_text(family = 'Times')`). + +```{r} +# Version 1.0 +ggplot(Dt, aes(x = Time, + stratum = Specialist, + alluvium = PTID, + fill = Specialist, + label = Specialist)) + + geom_flow(stat = "alluvium") + + geom_stratum(alpha = 0.7) + + xlab('Visit Timepoint') + ylab('Count') + + scale_fill_brewer(type = "div", palette = "Set2") + + theme_bw() + + theme(panel.border = element_blank(), + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11, family = "Times"), + axis.text.y = element_text(colour = "black", size = 11, family = "Times"), + axis.ticks.x = element_line(), + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", face = "bold",size = 11, family = "Times"), + axis.title.y = element_text(colour = "black", face = "bold",size = 11, family = "Times"), + legend.text = element_text(size = 11, color = 'black',family = "Times"), + legend.title = element_text(size = 13, face = "bold", color = 'black', family = "Times"), + plot.title = element_text(face = "bold", family = "Times")) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Alluvium-Plot-Tutorial-in-R/fe9b191c90b75e92682db4389f8b46ef17bf20ba/p3.jpeg) + +The height of each stratum is the number of participants at a given timepoint of visit with a type of specialist and flow curves between two timepoints show the longitudinal changes in specialists for participants. The acute state is the baseline and we have 200 participants. In subsequent follow-ups, we lost some participants and this is reflected by the height of bars in following timepoints. + +## 2. Example 2: Counts of viruses in different timepoints +The second dataset is a 2-year record of counts for different respiratory viruses in a given region. Counts of different viruses are collected quarterly from 2019 to 2020. It has three columns: + +- `Virus`: types of respiratory viruses [5 viruses: Rhinovirus, Seasonal Coronavirus, Adenovirus, Human Metapneumovirus, Parainfluenza] (string); + +- `Time`: calendar times in the format Year-Month [6 timepoints: 2019-3, 2019-6, 2019-9, 2019-12, 2020-3, 2020-6, 2020-9, 2020-12] (string); + +- `Count`: the number of viruses (numerical). + +![](https://raw.githubusercontent.com/YzwIsALaity/Alluvium-Plot-Tutorial-in-R/fe9b191c90b75e92682db4389f8b46ef17bf20ba/Dataset_2.jpeg) + +Differed from the example one which has __stratum__ inside a bar for each timepoint, this example will only have alluvium and flow. Therefore, we will use new functions. + +- `geom_alluvium()`: this function is used to combine __lodes__ from `geom_flow()` and __strata__ from `geom_stratum()` and output them together as __ribbon curves__. + * Since we want to show longitudinal changes of viral counts, we need to pass `aes(fill = Virus, colour = Virus)` where `fill` and `colour` will be specific to each viruses. + * `alpha = 0.7` is used to control the transparency of colors and `decreasing = FALSE` means that __a virus with more counts will be always placed at the top position regarding a given timepoint__. + +- `scale_fill_brewer()`: this is the same as `scale_color_brewer()` + +Meanwhile, we will pass `aes(x = Time, y = Count, alluvium = Virus)` into `ggplot()` so we set __`Time` in X-axis and `Count` in Y-axis and the `alluvium` will be `Virus`__. Let us put all together to see what we can get! + +```{r} +# Set the Time as a factor +Dt2$Time <- factor(Dt2$Time, levels = c(paste0(2019, '-', c(3,6,9,12)), paste0(2020, '-', c(3,6,9,12)))) +# Version 2.0 +ggplot(data = Dt2, + aes(x = Time, + y = Count, + alluvium = Virus)) + + geom_alluvium(aes(fill = Virus, colour = Virus), + alpha = 0.7, decreasing = FALSE) + + scale_fill_brewer(type = "qual", palette = "Set3") + + scale_color_brewer(type = "qual", palette = "Set3") + + theme_bw() + + theme(panel.border = element_blank(), + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11, family = "Times", angle = -45), + axis.text.y = element_text(colour = "black", size = 11, family = "Times"), + axis.ticks.x = element_line(), + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", face = "bold", size = 11, family = "Times", vjust = -2), + axis.title.y = element_text(colour = "black", face = "bold",size = 11, family = "Times"), + legend.text = element_text(size = 11, color = 'black', family = "Times"), + legend.title = element_text(size = 13, face = "bold", color = 'black', family = "Times"), + plot.title = element_text(face = "bold", family = "Times")) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Alluvium-Plot-Tutorial-in-R/fe9b191c90b75e92682db4389f8b46ef17bf20ba/p4.jpeg) + +We eventually get a alluvium plot with colorful ribbons! diff --git a/_posts/2022-03-02-Heatmap.md b/_posts/2022-03-02-Heatmap.md new file mode 100644 index 00000000000..79bf8685848 --- /dev/null +++ b/_posts/2022-03-02-Heatmap.md @@ -0,0 +1,323 @@ +--- +title: "Heatmap in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + +![Heatmap](https://raw.githubusercontent.com/YzwIsALaity/Heatmap-Tutorial-in-R/2a378e9b043535a679c90e68db6f9b08f6b533a7/Multiple%20Heatmaps%20(Aggregated).jpeg) + + +This is a short tutorial for making __heatmaps__ in R with __ggplot2__. In general, a heatmap is intended to show a __(numerical) correlation between a pair of features/covariates/variables__ and mostly a __correlation matrix__ will be the input of a heatmap. However, it is possible that we just want to show the __longitudinal change/trend of subjects__ in a heatmap and under this case, the scale in a heatmap will only have few levels (e.g. __Yes/No, High/Median/Low, etc.__). + +# Part 1: Categorical/Nominal variable +## 1. Format of dataset for heatmap +For example, a cohort study recorded a list of symptoms that each participant experienced and reported in follow-up surveys. In here, each symptom will only contain two levels: "Yes" for experiencing the symptom at X-month follow-up and "No" for no experiencing the symptom at X-month follow-up. We used a fake dataset to show how we can visualize the longitudinal change/trend of participants' symptoms. The dataset has 5 columns: + +- `PTID`: an __unique identification__ for each participant (string); + +- `n.symptoms`: the __total number of symptoms__ that a participants reported over all follow-ups (numerical); + +- `Month`: the __timepoint for each follow-up__ and it has 3 levels [6-month, 12-month, 24-month] (string); + +- `Experienced.Symptom`: a __binary covariate [No, Yes]__ that indicates if a participant experienced a symptom in a given follow-up (string); + +- `Symptom`: a __list of symptoms in follow-up [10 symptoms: Fatigue, Headache, Muscle Aches, Breathing Difficulties, Loss Taste/Smell, Joint Pain, Vertigo, Lowering Vision, Brain Fog, Hair Loss]__ (string). + +![](https://raw.githubusercontent.com/YzwIsALaity/Heatmap-Tutorial-in-R/2a378e9b043535a679c90e68db6f9b08f6b533a7/Data%20Shape%20(Individual).jpeg) + + +In the ggplot, no matter what kind of plots we make, they are basically composed of two parts: X-axis and Y-axis. __Since we want to create a heatmap for the longitudinal reported symptoms, we set X-axis as__ `Month`__, Y-axis as__ `PTID`__, and filled values for tiles in the heatmap with__ `Experienced.Symptom`. Since we have 10 symptoms and each of them were recorded at 3 different timepoints, we need to work on each symptom first and then __arrange them horizontally__. + +## 2. Heatmap for one symptom +We take Fatigue as an example and we will extract rows for Fatigue. + +```{r} +# Extract rows for Fatigue ################################################################################# +Fatigue <- Dt[which(Dt$Symptom == 'Fatigue'), ] +# Heatmap ################################################################################################## +p_Fatigue <- + ggplot(Fatigue, + aes(x = Month, y = reorder(PTID, n.symptoms))) + + geom_tile(aes(fill = Experienced.Symptom)) + + scale_fill_manual(name = 'Experienced Symptom', + values = c('No' = 'aquamarine2', 'Yes' = 'firebrick2', 'NA' = 'gray45')) + + ggtitle('Fatigue') + + theme(panel.grid.major = element_blank(), # remove background and grid + panel.background = element_blank(), + axis.line = element_blank(), # remove lines in X- and Y-axes + axis.ticks.x = element_blank(), # remove ticks in X- and Y-axes + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), # remove titles in X- and Y-axes + axis.title.y = element_blank(), + axis.text.x = element_text(size = 15, color = 'black', face = "bold", angle = 45, hjust = 1), + axis.text.y = element_text(size = 15, color = 'black', face = "bold"), + legend.text = element_text(size = 15, color = 'black', face = "bold"), + legend.title = element_text(size = 15, face = "bold", color = 'black'), + plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = 'black')) +p_Fatigue +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Heatmap-Tutorial-in-R/2a378e9b043535a679c90e68db6f9b08f6b533a7/Heatmap%20(one%20symptom).jpeg) + + +The above is the heatmap for one symptom (fatigue) and we are going to layout details for the code below: + +- The main __function for heatmaps in ggplot2__ is `geom_tile()` and we often need to set up and pass the command `aes(fill = )` into the main function. Since we want to __fill tiles with the covariate__ `Experienced.Symptom`, we eventually result in `geom_tile(aes(fill = Experienced.Symptom))`. + +- `ggplot(Fatigue, aes(x = Month, y = reorder(PTID, n.symptoms)))`: this is the main input of ggplot2 and we want to __reorder the input dataset by the total number of symptoms that a participant experienced__. Namely, participants experiencing more symptoms over 3 timepoints will be placed at top. + +- `scale_fill_manual(name = 'Experienced Symptom', values = c('No' = 'aquamarine2', 'Yes' = 'firebrick2', 'NA' = 'gray45'))`: this command provides a chance to __manually modify the variation of color for the value filled in tiles__. Since we only have two level No/Yes, we set up different colors for them and add up potential color for missing values as well. + +- `theme()`: this is a function used to __modify non-data components__ in the the plot and we want to enlarge texts in axes, titles, and legends. + + * `axis.text.x`, `axis.text.y`, `legend.text`, and `legend.title`: we hope texts in __axes and legends in bold__ (`face = "bold"` in `element_text()`) and better for display, we __rotate labels in X-axis__ with 45 degrees (`angle = 45`) and move it down a little bit (`hjust = 1`); + + * `plot.title`: we hope our main title is a little bit greater (`size = 20`) and center in the figure (`hjust = 0.5`); + + * `color = 'black'`: we also want all texts in black so we pass this command into each `element_text()`; + +- `ggtitle()`: this is for the title of the figure. + +## 3. Heatmap for multiple symptoms +Now we are going to pack heatmaps for multiple symptoms together. __For the first heatmap (in here it is Fatigue), we need to remove the legend and for the last one, we add the legend to it but remove texts in Y-axis. For others heatmaps in the middle of a row, we will remove texts in Y-axis and legend for better display.__ Thus, we create the first and the last one separately and then create others in a loop. + +```{r} +# First: Fatigue ######################################################################################### +Fatigue <- Dt[which(Dt$Symptom == 'Fatigue'), ] +p_Fatigue <- + ggplot(Fatigue, + aes(x = Month, y = reorder(PTID, n.symptoms))) + + geom_tile(aes(fill = Experienced.Symptom)) + + scale_fill_manual(name = 'Experienced Symptom', + values = c('No' = 'aquamarine2', 'Yes' = 'firebrick2', 'NA' = 'gray45')) + + ggtitle('Fatigue') + + theme(panel.grid.major = element_blank(), # remove background and grid + panel.background = element_blank(), + axis.line = element_blank(), # remove lines in X- and Y-axes + axis.ticks.x = element_blank(), # remove ticks in X- and Y-axes + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), # remove titles in X- and Y-axes + axis.title.y = element_blank(), + axis.text.x = element_text(size = 15, color = 'black', face = "bold", angle = 45, hjust = 1), + axis.text.y = element_text(size = 15, color = 'black', face = "bold"), + legend.position = 'none', # remove legend + plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = 'black')) + +# Last: Hair Loss ######################################################################################### +Hair_Loss <- Dt[which(Dt$Symptom == 'Hair Loss'), ] +p_Hair_Loss <- + ggplot(Hair_Loss, + aes(x = Month, y = reorder(PTID, n.symptoms))) + + geom_tile(aes(fill = Experienced.Symptom)) + + scale_fill_manual(name = 'Experienced Symptom', + values = c('No' = 'aquamarine2', 'Yes' = 'firebrick2', 'NA' = 'gray45')) + + ggtitle('Hair Loss') + + theme(panel.grid.major = element_blank(), # remove background and grid + panel.background = element_blank(), + axis.line = element_blank(), # remove lines in X- and Y-axes + axis.ticks.x = element_blank(), # remove ticks in X- and Y-axes + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), # remove titles in X- and Y-axes + axis.title.y = element_blank(), + axis.text.x = element_text(size = 15, color = 'black', face = "bold", angle = 45, hjust = 1), + axis.text.y = element_blank(), + legend.text = element_text(size = 15, color = 'black', face = "bold"), + legend.title = element_text(size = 15, face = "bold", color = 'black'), + plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = 'black')) + +# Middle: other symptoms ################################################################################# +Var.Middle <- c('Headache', 'Muscle Aches', 'Breathing Difficulties', 'Loss Taste/Smell', + 'Joint Pain', 'Vertigo', 'Lowering Vision', 'Brain Fog') +Heatmap.Middle <- list() +for(i in 1:length(Var.Middle)){ + Data <- Dt[which(Dt$Symptom == Var.Middle[i]), ] + p <- + ggplot(Data, + aes(x = Month, y = reorder(PTID, n.symptoms))) + + geom_tile(aes(fill = Experienced.Symptom)) + + scale_fill_manual(name = 'Experienced Symptom', + values = c('No' = 'aquamarine2', 'Yes' = 'firebrick2', 'NA' = 'gray45')) + + ggtitle(Var.Middle[i]) + + theme(panel.grid.major = element_blank(), # remove background and grid + panel.background = element_blank(), + axis.line = element_blank(), # remove lines in X- and Y-axes + axis.ticks.x = element_blank(), # remove ticks in X- and Y-axes + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), # remove titles in X- and Y-axes + axis.title.y = element_blank(), + axis.text.x = element_text(size = 15, color = 'black', face = "bold", angle = 45, hjust = 1), + axis.text.y = element_blank(), + legend.position = 'none', # remove legend + plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = 'black')) + Heatmap.Middle[[i]] <- p +} +``` + +__To remove the the legend in plots__, we pass the command `legend.position = 'none'` into the function `theme()`. Since we have multiple plots now, we are going to arrange their positions and eventually pack them together! + +This time since the width of plots are different since the first and last one contains more non-data components than the heatmaps in the middle, we need to use a __layout matrix__ to arrange the position of heatmaps. The idea for the matrix is very simple and we just need to __repeat the rank number for each heatmap multiple times (the number of repeat is used to modify the width of each heatmap)__. In here, we set up __the width for the first nine heatmaps as 8 units__ and __the width for the last heatmap as 16 units (since it has a legend)__. Since all heatmaps are displayed in a row, the __layout matrix__ is just a row vector which contains different lengths of repeated rank numbers and it looks like + +```{r} +Layout.Mat <- matrix(c(rep(1:9, each = 8), rep(10, 16)), nrow = 1) +Layout.Mat +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Heatmap-Tutorial-in-R/2a378e9b043535a679c90e68db6f9b08f6b533a7/Layout%20matrix.jpeg) + +To arrange the position of heatmaps, we use the function `grid.arrange()` and pass the command `layout_matrix = Layout.Mat` into it. + +```{r} +grid.arrange(p_Fatigue, + Heatmap.Middle[[1]], + Heatmap.Middle[[2]], + Heatmap.Middle[[3]], + Heatmap.Middle[[4]], + Heatmap.Middle[[5]], + Heatmap.Middle[[6]], + Heatmap.Middle[[7]], + Heatmap.Middle[[8]], + p_Hair_Loss, + layout_matrix = Layout.Mat) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Heatmap-Tutorial-in-R/2a378e9b043535a679c90e68db6f9b08f6b533a7/Heatmap%20(multiple%20symptoms).jpeg) + +It eventually looks like the above one! __Its actual size (width * height) and resolution should be adjusted when you are going to output it from R.__ + +# Part 2: Numerical variable +## 1. Format of dataset for heatmap +Similar to the dataset on the first part, the only difference is that this time we are going to fill out __numerical values for tiles__ rather than categorical/nominal values. +![](https://raw.githubusercontent.com/YzwIsALaity/Heatmap-Tutorial-in-R/2a378e9b043535a679c90e68db6f9b08f6b533a7/Data%20Shape%20(Aggregated).jpeg) + + +This dataset includes 4 variables: + +- `Symptom.PriorTimepoint`: a list of __symptoms at prior timepoints__ (Acute, 6-month, 12-month, and 24-month) (string); + +- `Symptom.LaterTimepoint`: a list of __symptoms at subsequent timepoints__ (string); + + * Symptom: Fatigue, Headache, Body aches, Difficult breathing, Loss or alter taste/smell, Joint pain, Vertigo/dizzy, Lower vision, Brain fog, Hair loss, Gastrointestinal, and Difficult sleeping + +- `Corr`: the __correlation of a pair of symptoms__ in `Symptom.PriorTimepoint` and `Symptom.LaterTimepoint` (numerical); + +- `Comparison`: texts used to describe timepoints in `Symptom.PriorTimepoint` and `Symptom.LaterTimepoint` ("Acute vs 6-month", "6-month vs 12-month", and "12-month vs 24-month") (string). + +## 2. Heatmap for one type of `Comparison` +In here, we first create a heatmap for a pair of symptoms in acute and 6-month timepoint. + +```{r} +# Extract the specific type ################################################################################# +Dt1 <- Dt[which(Dt$Comparison == "Acute vs 6-month"), ] +# Plot ###################################################################################################### +p_acute_6m <- + ggplot(Dt1, aes(x = Symptom.PriorTimepoint, y = Symptom.LaterTimepoint)) + + geom_tile(aes(fill = Corr)) + + ggtitle('Acute vs 6-month') + + scale_fill_gradientn(breaks = seq(-1, 1, length.out = 6), + colors = viridis(6), n.breaks = 6, limits = c(-1, 1), na.value = 'gray27') + + theme(panel.grid.major = element_blank(), + panel.background = element_blank(), + axis.line = element_blank(), + axis.ticks.x = element_blank(), + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_blank(), + axis.text.x = element_text(size = 15, color = 'black', face = "bold", angle = 45, hjust = 1), + axis.text.y = element_text(size = 15, color = 'black', face = "bold"), + plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = 'black'), + legend.text = element_text(size = 15, hjust = 0.5, color = 'black'), + legend.title = element_text(size = 15, hjust = 0.5, face = "bold", color = 'black')) +p_acute_6m +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Heatmap-Tutorial-in-R/2a378e9b043535a679c90e68db6f9b08f6b533a7/Heatmap%20(Aggregated).jpeg) + +All __non-data component settings for heatmaps__ in part 2 are the same as the part 1 except the function `scale_fill_gradientn()`. + +- `scale_fill_gradientn()`: this function is used to create n-color gradient so it can set up what color you want to use and how many gradient you want. + + * `breaks = seq(-1, 1, length.out = 6)`: create a __consecutive sequence of break__ (e.g. `r seq(-1, 1, length.out = 6)`); + + * `colors = viridis(6)`: set up a vector of __choices of colors for gradient__ and we use `viridis()` from the `viridis` package; + + * `n.breaks = 6`: set up the __number of breaks__; + + * `limits = c(-1, 1)`: set up the __limits of numerical values for breaks (lower, upper)__; + + * `na.value = 'gray27'`: set up the __color for missing values__. + +## 3. Heatmap for multiple types of `Comparison` +__Similar to the part 1, we will remove the legend for the first heatmap, remove the texts in Y-axis and legend for the heatmap in the middle, and remove the texts in Y-axis for the last heatmap.__ + +```{r} +# First: Acute vs 6-month ############################################################################ +Dt1 <- Dt[which(Dt$Comparison == "Acute vs 6-month"), ] +p_acute_6m <- + ggplot(Dt1, aes(x = Symptom.PriorTimepoint, y = Symptom.LaterTimepoint)) + + geom_tile(aes(fill = Corr)) + + ggtitle('Acute vs 6-month') + + scale_fill_gradientn(breaks = seq(-1, 1, length.out = 6), + colors = viridis(6), n.breaks = 6, limits = c(-1, 1), na.value = 'gray27') + + theme(panel.grid.major = element_blank(), + panel.background = element_blank(), + axis.line = element_blank(), + axis.ticks.x = element_blank(), + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_blank(), + axis.text.x = element_text(size = 15, color = 'black', face = "bold", angle = 45, hjust = 1), + axis.text.y = element_text(size = 15, color = 'black', face = "bold"), + plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = 'black'), + legend.position = 'none') + +# Middle: 6-month vs 12-month ####################################################################### +Dt2 <- Dt[which(Dt$Comparison == "6-month vs 12-month"), ] +p_acute_12m <- ggplot(Dt2, aes(x = Symptom.PriorTimepoint, y = Symptom.LaterTimepoint)) + + geom_tile(aes(fill = Corr)) + + ggtitle('6-month vs 12-month') + + scale_fill_gradientn(breaks = seq(-1, 1, length.out = 6), + colors = viridis(6), n.breaks = 6, limits = c(-1, 1), na.value = 'gray27') + + theme(panel.grid.major = element_blank(), + panel.background = element_blank(), + axis.line = element_blank(), + axis.ticks.x = element_blank(), + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_blank(), + axis.text.x = element_text(size = 15, color = 'black', face = "bold", angle = 45, hjust = 1), + axis.text.y = element_blank(), + plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = 'black'), + legend.position = 'none') + +# Last: 12-month vs 24-month ########################################################################## +Dt3 <- Dt[which(Dt$Comparison == "12-month vs 24-month"), ] +p_acute_24m <- ggplot(Dt3, aes(x = Symptom.PriorTimepoint, y = Symptom.LaterTimepoint)) + + geom_tile(aes(fill = Corr)) + + ggtitle('12-month vs 24-month') + + scale_fill_gradientn(breaks = seq(-1, 1, length.out = 6), + colors = viridis(6), n.breaks = 6, limits = c(-1, 1), na.value = 'gray27') + + theme(panel.grid.major = element_blank(), + panel.background = element_blank(), + axis.line = element_blank(), + axis.ticks.x = element_blank(), + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_blank(), + axis.text.x = element_text(size = 15, color = 'black', face = "bold", angle = 45, hjust = 1), + axis.text.y = element_blank(), + plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = 'black'), + legend.text = element_text(size = 15, hjust = 0.5, color = 'black'), + legend.title = element_text(size = 15, hjust = 0.5, face = "bold", color = 'black')) +# Arrange heatmaps ###################################################################################### +Layout.Mat <- matrix(c(rep(1, 14), rep(2, 10), rep(3, 12)), nrow = 1) +grid.arrange(p_acute_6m, + p_acute_12m, + p_acute_24m, + layout_matrix = Layout.Mat) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Heatmap-Tutorial-in-R/2a378e9b043535a679c90e68db6f9b08f6b533a7/Multiple%20Heatmaps%20(Aggregated).jpeg) + +Eventually, we get the final one! + diff --git a/_posts/2022-05-02-Box-Violin.md b/_posts/2022-05-02-Box-Violin.md new file mode 100644 index 00000000000..50118b45815 --- /dev/null +++ b/_posts/2022-05-02-Box-Violin.md @@ -0,0 +1,213 @@ +--- +title: "Box and Violin Plot in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + +![Violin Plot](https://raw.githubusercontent.com/YzwIsALaity/Box-Violin-Plot-Tutorial-in-R/042bda2d9a9dee966d39cdfbc937b63c2de91855/p5.jpeg) + + +This is a short tutorial for how to change a __plain box plot__ to be more advanced versions (e.g. __box plot with p-value, violin plot__, etc.). +## 1. Format of dataset for boxplot +The dataset is a record of the level of biomarker for a disease and patients in the dataset are categorized into different groups based on their severity levels. Hence, this dataset has three columns: + +- `PTID`: __unique identifications for participants__ and there are 400 unique participants (string); + +- `Illness`: __severity levels of the disease [4 levels: Healthy Control, Moderate, Mild, Severe]__ (string); + +- `Biomarker`: a __numerical measurement of the biomarker__ (numerical); + +- `Sex`: a variable for sex of participants (string). + +![](https://raw.githubusercontent.com/YzwIsALaity/Box-Violin-Plot-Tutorial-in-R/042bda2d9a9dee966d39cdfbc937b63c2de91855/Dataset_Shape.jpeg) + +A __box plot__ is often used to __show the empirical distribution of a numerical covariate regarding different groups__. Usually, there are two main methods to visualize the empirical distribution of the data: + +- __Histogram/density__: it approximates the __"shape"__ of an empirical distribution; + +- __Box plot__: it provides more __descriptive statistics__ of an empirical distribution in a graphical way. Typically, it has __five elements: min, max, median, 1st & 3rd quartile__. + +Compared to a __histogram/density__, a __box plot__ is more often to use when people want to __compare the same numerical covariate through all groups__ and this process is typically involved in performing __hypothesis testing__. + +## 2. Version 0.0: basic +The first step in here we are going to draw a simple box plot to visualize empirical distribution of `Biomarker` through different groups in `Illness`. In this version, we will __remove the background and grid__ for the box plot. We make __titles in axes in bold type__. The main function for box plot is `geom_boxplot()` and we will set `Illness` in X-axis and `Biomarker` in Y-axis. + +```{r} +# Set Illness and Sex as factors +Dt$Illness <- factor(Dt$Illness, levels = c("Healthy Control", "Mild", "Moderate", "Severe")) +Dt$Sex <- factor(Dt$Sex, levels = c('Male', 'Female')) + +# Version 0.0 +ggplot(Dt, aes(x = Illness, y = Biomarker)) + + geom_boxplot(alpha = 0.5) + + xlab('Illness Severity') + ylab('Biomarker Level') + + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), # these four are for the background and grid + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11), # there two are for texts in axes + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", size = 11, face = 'bold'), + legend.title = element_text(colour = "black", size = 11, face = 'bold')) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Box-Violin-Plot-Tutorial-in-R/042bda2d9a9dee966d39cdfbc937b63c2de91855/p1.jpeg) + +In Version 0.0, we just show boxes for `Illness` groups and do not stratified them into `Sex`, which provides a general visualization of numerical biomarker levels regarding different `Illness` groups. + +## 3. Version 1.0: grouped +Based on the setting of Version 0.0, the boxplot can be stratified by `Sex` and this only required we pass `fill = Sex` into `aes()` for main ggplot function `ggplot()`. In this version, we will use preselected colors for `Sex`. + +```{r} +# we can preselect color for each box +Col <- c("#FF0000", "#80FF00") +# Version 1.0 +ggplot(Dt, aes(x = Illness, y = Biomarker, fill = Sex)) + + geom_boxplot(alpha = 0.5) + # 'alpha = 0.5' control the transparency of colors + scale_fill_manual(values = Col) + # use preselected color + xlab('Illness Severity') + ylab('Biomarker Level') + + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), # these four are for the background and grid + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11), # there two are for texts in axes + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", size = 11, face = 'bold'), + legend.title = element_text(colour = "black", size = 11, face = 'bold')) +``` + +![](https://github.com/YzwIsALaity/Box-Violin-Plot-Tutorial-in-R/blob/042bda2d9a9dee966d39cdfbc937b63c2de91855/p2.jpeg) + +This is the grouped version of a box plot based on Version 0.0. The above two versions are the most basic type of box plots and __if you want to show the box horizontally, you just need to shift X-axis and Y-axis__. In the next example, we will integrate a box plot with hypothesis testing together. + +## 4. Version 2.0: hypothesis testing +To integrate a box plot with hypothesis testing together, we need to use `ggpubr` package. We need to use a new function: + +- `stat_compare_means()`: it is used to conduct hypothesis testing among different groups regarding the same numerical covariate. We will introduce some common-used arguments of this function. + + * `method`: it is used to indicate what __types of testing methods__ should be performed (e.g. __`t.test` for normal data, `wilcox.test` for non-normal data, `kruskal.test`, and `anova`__). + + * `label`: it is used to specify label types [e.g. __"p.signif" (shows the significance levels) and "p.format" (shows the formatted p value)__]. + + * `vjust`: it is used to move the text up or down relative to the bracket. + + * `paired`: it is used to indicate whether you want a paired test. + +```{r} +# Version 2.0 +ggplot(Dt, aes(x = Illness, y = Biomarker, fill = Sex)) + + geom_boxplot(alpha = 0.5) + # 'alpha = 0.5' control the transparency of colors + stat_compare_means(aes(group = Sex), # 'group = Sex' choose groups for comparison + method = 'wilcox.test', # we use wilcox test for comparison + label = 'p.format', + vjust = -0.65) + + scale_fill_manual(values = Col) + # use preselected color + xlab('Illness Severity') + ylab('Biomarker Level') + + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), # these four are for the background and grid + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11), # there two are for texts in axes + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", size = 11, face = 'bold'), + legend.title = element_text(colour = "black", size = 11, face = 'bold')) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Box-Violin-Plot-Tutorial-in-R/042bda2d9a9dee966d39cdfbc937b63c2de91855/p3.jpeg) + +Therefore, it is very easy to integrate a box plot with hypothesis testing! Then, we are going to the final stage of box plot. + +## 5. Version 3.0: violin plot +__As we mentioned above, there are two main methods to visualize the empirical distribution of the data--histogram/density and box plot. If we blend them together, that is a violin plot.__ Actually, it is very easy to create a violin plot based on a box plot and the main function we are going to use is + +- `geom_violin()`: this is the main function for creating a violin plot. __If we use this function based on our Version 0.0/1.0/2.0, we just need to adjust the size of "violin" and "box" with the command `width = `.__ + +The first example of a violin plot is based on Version 0.0. + +- __The basic logic of creating a violin plot over existed box plot is to put `geom_violin()` at first and then call `geom_boxplot()`.__ + +- __Meanwhile, since the box plot is above the violin plot, we hope violins can cover boxes. It is necessary to adjust the size of violins and boxes to make sure each violin can cover each box entirely.__ + +```{r} +# Version 3.0 +ggplot(Dt, aes(x = Illness, y = Biomarker)) + + geom_violin(width = 1, + position = position_dodge(0.7)) + # each violin has larger size + geom_boxplot(alpha = 0.7, width = 0.3, + position = position_dodge(0.7)) + # this size of each box is smaller than a violin + xlab('Illness Severity') + ylab('Biomarker Level') + + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), # these four are for the background and grid + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11), # there two are for texts in axes + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", size = 11, face = 'bold'), + legend.title = element_text(colour = "black", size = 11, face = 'bold')) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Box-Violin-Plot-Tutorial-in-R/042bda2d9a9dee966d39cdfbc937b63c2de91855/p4.jpeg) + +The second example of a violin plot is based on Version 2.0. __Due to the overlapped of violins and boxes, we need to adjust their overlapped position and we will use one more arguments in both `geom_violin()` and `geom_boxplot()`--`position`. We will set `position = position_dodge()` and pass any some numerical values into `position_dodge()`. `position_dodge()` is used to control the horizontal distance between two groups. Finally, we need to make sure `geom_violin()` and `geom_boxplot()` share the same `position`.__ + +```{r} +# Version 4.0 +ggplot(Dt, aes(x = Illness, y = Biomarker, fill = Sex)) + + geom_violin(aes(fill = Sex), width = 1, alpha = 0.5, + position = position_dodge(1)) + + geom_boxplot(aes(fill = Sex), alpha = 0.5, width = 0.1, + position = position_dodge(1)) + + stat_compare_means(aes(group = Sex), # 'group = Sex' choose groups for comparison + method = 'wilcox.test', # we use wilcox test for comparison + label = 'p.format', + vjust = -0.65) + + scale_fill_manual(values = Col) + # use preselected color + xlab('Illness Severity') + ylab('Biomarker Level') + + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), # these four are for the background and grid + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11), # there two are for texts in axes + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", size = 11, face = 'bold'), + legend.title = element_text(colour = "black", size = 11, face = 'bold')) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Box-Violin-Plot-Tutorial-in-R/042bda2d9a9dee966d39cdfbc937b63c2de91855/p5.jpeg) + +Eventually, we can get a fancy violin plot with a plain box plot and hypothesis testing! We can also make them horizontally and this just need to exchange a variable in X-axis with a variable in Y-axis as well. + +In here, we just provide two types of combination of plots so it is very possible that people can combine box/violin plots with other plots (e.g. points, bars, etc.). + + diff --git a/_posts/2022-06-19-Line.md b/_posts/2022-06-19-Line.md new file mode 100644 index 00000000000..8c659d0e6d8 --- /dev/null +++ b/_posts/2022-06-19-Line.md @@ -0,0 +1,251 @@ +--- +title: "Line Plot in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + +![Line Plot](https://raw.githubusercontent.com/YzwIsALaity/Line-Plot-Tutorial-in-R/3cb83a04056a112854923509fb147d4fb1c11c25/Version%203.0.jpeg) + + +In this tutorial, we will go through __different types of line-base plots (e.g. straight line, smooth curve, density, etc.) in R with `ggplot2`__ and how we can __stratify/group line plots with different categories/levels__. + +## 1. Format of dataset +The dataset we used for the tutorial includes cycle threshold values and numerical measurements of a biomarker for different respiratory viruses collected in different clinics in participant's level. It has 6 coloumns: + +- `PTID`: this is an __unique identification__ for each participants __[1200 participants]__ (string); + +- `Location`: this is the __location of clinic__ where participants visited __[6 locations: 'Hospital A', 'Hospital B', 'Clinic A', 'Clinic C', 'Urgent Care A', 'Urgent Care B']__ (string); + +- `Severity`: this is the __severity level of diseases [2 levels: 'Mild', 'Severe']__ (string); + +- `Virus`: this indicates the __types of viral infections [2 types: 'Enterovirus', 'Rhinovirus']__ (string); + +- `Ct`: this is the __cycle threshold value__ for a type of viral infection (numerical); + +- `Biomarker`: this is a __numerical measurement of a biomarker__ (numerical). + +![](https://raw.githubusercontent.com/YzwIsALaity/Line-Plot-Tutorial-in-R/3cb83a04056a112854923509fb147d4fb1c11c25/Dataset_Shape.jpeg) + + +No matter what types of plots we want to create with `ggplots`, the package has a fundamental input function for the dataset before passing into different plots. That is `ggplot()` and this function requires to passing dataset and setting variables X-axis and/or Y-axis. It basically need to use the inner argument `aes()` and we often pass below arguments into `aes()`: + +- `x = `: a variable for X-axis; + +- `y = `: a variable for Y-axis; + +- `group = `: a variable for indicating different groups in a plot. __Typically, we will put a categorical variable for this command and we want to visualize figures for different categories regarding the same X and Y variables.__ + +- `col = `: a numerical value/vector of colors for points/lines from different `group`. __The color in here is just used to select the color for outer boxes/frames/etc. rather than color filled in boxes/frames/etc.__; + +- `fill = `: a numerical value/vector of colors for __filling out in boxes/frames/etc. from different `group`__. + +Once we use `group`, `col`, and `fill`, the `ggplot` will automatically provide legends for them. The effects from `col` and `fill` are similar to `group` so we can just call `col` or `fill` without calling the command `group`. + +## 2. Basic line plot (connect points with lines) +We are going to create a basic line plot, that is, only connecting `Ct`-`Biomarker` pairs of points with straight lines and stratified points by `Virus`. We will still remove the grid and background for the figure and set `Ct` as X-axis and `Biomarker` as Y-axis. __The primary function we used in here from `ggplot2` package is `geom_line()`__. + +```{r} +# Version 0.0 +ggplot(Dt, aes(x = Ct, y = Biomarker, col = Virus)) + + geom_line() + + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), # these four are for the background and grid + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11), # there two are for texts in axes + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", size = 11, face = 'bold'), + legend.title = element_text(colour = "black", size = 11, face = 'bold')) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Line-Plot-Tutorial-in-R/3cb83a04056a112854923509fb147d4fb1c11c25/Version%200.0.jpeg) + +In this plot, we can find that it is really hard to distinguish a trace of each virus so we can __split Version 0.0 into two figures based on `Virus` with the same layout__. This will require us to use a new function: + +- __`facet_grid()`__: this is used to form a matrix of panels defined by row and column faceting variables and it has some main arguments: + + * `cols = ` and `rows = `: variable's name will be first passed into `vars()` and then into these two arguments and they are used to determine how we want to layout panels. __For example, if we set `cols = var(Virus)`, we will see two side-by-side figures and each one represents a type of `Virus`.__ + + * `scales = `: it is used to determine if __scales should be shared across all facets__ (the default, "fixed"), or do they vary across rows ("free_x"), columns ("free_y"), or both rows and columns ("free"). +In this example, we will set up `cols = var(Virus)` and `scales = 'fixed' since X-axis and Y-axis for two figures share the same variables and ranges. + +```{r} +# Version 1.0 +ggplot(Dt, aes(x = Ct, y = Biomarker)) + + geom_line() + + facet_grid(cols = vars(Virus), scales = 'fixed') + + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), # these four are for the background and grid + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11), # there two are for texts in axes + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", size = 11, face = 'bold'), + legend.title = element_text(colour = "black", size = 11, face = 'bold'), + strip.text = element_text(size = 15), # set up size of title for each figure with 15 + strip.background = element_blank()) # remove the background of titles for figures +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Line-Plot-Tutorial-in-R/3cb83a04056a112854923509fb147d4fb1c11c25/Version%201.0.jpeg) + +The above example is just stratified pairs of `Ct`-`Biomarker` by types of `Virus` and for the next one, we can try to __stratify them with two variables__. __Namely, pairs of `Ct`-`Biomarker` will be stratified by types of `Virus` and levels of `Severity`.__ We will set up columns as `Virus` and rows as `Severity`. + +```{r} +# Version 2.0 +ggplot(Dt, aes(x = Ct, y = Biomarker)) + + geom_line() + + facet_grid(cols = vars(Virus), rows = vars(Severity), # stratified by Virus and Severity + scales = 'fixed') + + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), # these four are for the background and grid + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11), # there two are for texts in axes + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", size = 11, face = 'bold'), + legend.title = element_text(colour = "black", size = 11, face = 'bold'), + strip.text = element_text(size = 13, face = 'bold'), # set up size of title for each figure with 15 + strip.background = element_blank()) # remove the background of titles for figures +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Line-Plot-Tutorial-in-R/3cb83a04056a112854923509fb147d4fb1c11c25/Version%202.0.jpeg) + +In this section, we have seen how we can use basic line plot to visualize the development of a numerical biomarker along with cycle threshold values of viruses. Obviously, the basic line plot does not provide useful information for two variables so one may think about using more generalized methods to show their relationship, that is, a __smooth curve__. + +## 3. Smooth curve plot +Beyond the basic version of line plots (Version 0.0 and Version 1.0), smooth curve plot is a better choice to visualize the development of one numerical variable along with another numerical variable. The main function for smooth curve in `ggplot2` package is + +- `geom_smooth()`: it is used to fit different smooth curves and it has several common arguments that users often specify: + + * `method = `: it is used to determine smoothing method (function) to use + + + "lm" for linear regression + + + "glm" for generalized linear regression + + + "gam" for generalized additive method __(default for observations more than 1000 and fitted with cubic spline [`y ~ s(x, bs = 'cs')`])__ + + + "loess" for local polynomial regression __(default for observations less than 1000)__ + + * `se = `: it is used to choose __whether the plot displays confidence interval around smooth (T or F)__ + + * `formula = `: it is used to provide formula to use in smoothing function (e.g. y ~ x, y ~ poly(x, 2), y ~ log(x)) + + * `method.args = list()`: it is used to provide a list for additional arguments passed on to the modelling function defined by `method` + +In this section, we are going to __use smooth curves to visualize the development of the numerical `Biomarker` along with the `Ct` values for different `Virus`, stratified by `Severity` level and `Location` of clinics__. Hence, we will modify the Version 2.0 to meet our new requirements and this time we will show the border of each panel and the background of titles for columns and rows. + +```{r} +# Version 3.0 +ggplot(Dt, aes(x = Ct, y = Biomarker, col = Severity, fill = Severity)) + # set up colors/fill for Severity + geom_smooth(method = 'loess', # use local polynomial regression + alpha = 0.3, # size = 0.5 --> line width + size = 0.5) + # alpha = 0.3 --> color transparency + facet_grid(cols = vars(Virus), rows = vars(Location), # stratified by Virus and Location + scales = 'fixed') + + theme_bw() + # dark-on-light theme + theme(panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11), # there two are for texts in axes + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", size = 11, face = 'bold'), + legend.title = element_text(colour = "black", size = 11, face = 'bold'), + legend.text = element_text(colour = "black", size = 11), + strip.text = element_text(size = 13, face = 'bold')) # set up size of title for each figure with 13 +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Line-Plot-Tutorial-in-R/3cb83a04056a112854923509fb147d4fb1c11c25/Version%203.0.jpeg) + +Eventually we get the fancy one! Probably, it will be better to include borders and background of titles in columns and rows. + +## 4. Density plot/histogram +The last example we are going show is the density plot which almost conveys similar information as a histogram. A density plot is a specific version of smooth curve as well. The main functions we are going to use in here are + +- `geom_density()`: it is used to plot the density of a numerical variable; + +- `geom_histogram()`: it is used to plot the histogram of a numerical variable. + +In this example, we will show a normal density plot for `Ct` stratified by `Location` and a histogram for `Ct` stratified by `Location`. + +```{r} +# Version 4.0 +p1 <- + ggplot(Dt, + aes(x = Ct, col = Location, fill = Location)) + + geom_density(alpha = 0.3) + + xlab('Ct') + ylab('Density') + + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11), # there two are for texts in axes + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", size = 11, face = 'bold'), + legend.title = element_text(colour = "black", size = 11, face = 'bold'), + legend.text = element_text(colour = "black", size = 11)) + +# Histogram +p2 <- + ggplot(Dt, + aes(x = Ct, col = Location, fill = Location)) + + geom_histogram() + + xlab('Ct') + ylab('Count') + + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11), # there two are for texts in axes + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", size = 11, face = 'bold'), + legend.title = element_text(colour = "black", size = 11, face = 'bold'), + legend.text = element_text(colour = "black", size = 11)) +# Arrange plots +grid.arrange(p1, p2, nrow = 1) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Line-Plot-Tutorial-in-R/3cb83a04056a112854923509fb147d4fb1c11c25/Density.jpeg) + +Beside these plots, there are other functions related to line-base plots in `ggplot2` package and users can refer to other line plots such as + +- `geom_abline()`, `geom_hline()`, `geom_vline()`: these three provide __diagonal, horizontal, and vertical lines__; + +- `geom_segment()`, `geom_curve()`: these two provide __line segments and curves__. + +People can make different combinations of these lines and create a complex figure that fulfill their requirements. diff --git a/_posts/2022-07-29-Pie-Chart.md b/_posts/2022-07-29-Pie-Chart.md new file mode 100644 index 00000000000..860c478a086 --- /dev/null +++ b/_posts/2022-07-29-Pie-Chart.md @@ -0,0 +1,162 @@ +--- +title: "Pie Chart in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + +![Cover](https://raw.githubusercontent.com/YzwIsALaity/Pie-Chart-In-R/main/Multiple%20Pie%20Chart.png) + + + +In this tutorial, I will demonstrate how to create a __pie chart__ using the __`ggplot2` and `ggrepel` packages in R__. A pie chart is a type of chart that __displays numerical proportions of a variable in polar coordinates__, similar to a bar chart. However, unlike a bar chart, a pie chart focuses on displaying percentages rather than raw counts. To create the chart, I will use the publicly available [National Cancer Database (NCDB)](https://www.facs.org/quality-programs/cancer-programs/national-cancer-database/) to display the percentage of combinations of race and ethnicity with the type of cancer facility. + +# 1. Dataset +After data cleaning, the dataset I used to create pie charts includes four variables: + +- `Type`: this is a __categorical variable__ that is used to indicate the type of cancer clinic __[5 levels: 'Academic/Research Program', 'Community Cancer Program', 'Comprehensive Community Cancer Program', 'Integrated Network Cancer Program', 'Missing']__ (string) + +- `Origin`: this is a __categorical variable__ that is used to indicate race/ethnicity group __[3 levels: 'Black, NH', 'White, NH', 'Mexican']__ (string) + +- `Count`: this is a numerical variable that is used to indicate a __count for patients in the combination of `Type` and `Origin`__ (numerical) + +- `Percentage`: this is a numerical variable that is used to indicate a __percentage for patients in the combination of `Type` and `Origin`__ (numerical) + +![](https://raw.githubusercontent.com/YzwIsALaity/Pie-Chart-In-R/main/Dataset%201.png) + +I will use this simple dataset to first create a single pie chart for `Mexican` patients, and then multiple pie charts for the other patients. First, I will __create a variable to display percentage labels in a pie chart__. I will __use the `paste0()` function to concatenate the numerical value of the `Percentage` with a percentage sign (%), and store the result as a string variable__. And then I will set `Origin` and `Type` as factors. + +```{r} +# Percentage label +Dt.Plot$'Percentage.Label' <- paste0(Dt.Plot$Percentage, '%') + +# Factor for Origin and Type +Dt.Plot$Origin <- factor(Dt.Plot$Origin, levels = c("Mexican", 'White, NH', 'Black, NH')) +Dt.Plot$Type <- factor(Dt.Plot$Type, levels = c("Comprehensive Community Cancer Program", + "Academic/Research Program", + "Community Cancer Program", + "Integrated Network Cancer Program", + "Missing")) +``` + +In R, a pie chart maps a bar chart into polar coordinates. If a user wants to label percentages in a pie chart, they must provide coordinates for `Percentage.Label` in the polar system. I am going to use `arrange()`, `desc()`, `group_by()`, and `mutate()` functions from the `dplyr` package and `cumsum()` function in R base. + +- `arrange()`: it orders the rows of a data frame by the values of selected columns + +- `desc()`: it transforms a vector into a format that will be sorted in descending order + +__Here, I arranged the dataset by `Origin` and sorted it in decreasing order based on the values in `Type`__. + +- `group_by()`: it takes a data frame and one or more variables to group by + +- `mutate()`: it creates new columns that are functions of existing variables + +- `cumsum()`: it returns a vector whose elements are the cumulative sums + +__An original bar plot from `geom_bar()` or `geom_col()` stacks rectangles for stratified variables vertically, and I plan to place the `Percentage.Label` in the middle of each rectangle__. + +![](https://raw.githubusercontent.com/YzwIsALaity/Pie-Chart-In-R/main/Bar%20Chart.png) + +This can be done by first grouping the data frame by the `Origin` variable, and then subtracting half of the `Percentage` from the cumulative sum of `Percentage` for each `Origin`. + +```{r} +# Pie chart +Dt.PieChart <- + Dt.Plot %>% + arrange(Origin, desc(Type)) %>% # arrange it with Origin and decreasing order of Type + group_by(Origin) %>% # group by Origin + mutate(text_y = cumsum(Percentage) - Percentage/2) # the middle of each rectangle + +head(Dt.PieChart) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Pie-Chart-In-R/main/Dataset%202.png) + +`Dt.PieChart` will be used to create pie charts! + +# 2. Single pie +To create a single pie chart, three functions are needed to be used: + +- `geom_col()`: it uses the heights of the bars to represent values in the data (equivalent to `stat = 'identity'` in `geom_bar()`) + + + `width`: it controls the bar width + +- `coord_polar()`: it transforms a stacked bar chart into polar coordinates + + + `theta`: it is an argument that map angle to axes (x or y) + +- `geom_text_repel()`: it is from the `ggrepel` package and adds text directly to the plot and I will use some arguments below: + + + `size`: it controls the size of text/label + + + `show.legend`: it controls if we want to show legend of text/label + + + `nudge_x` or `nudge_y`: it is an argument to perform horizontal and vertical adjustments to nudge the starting position of each text label. + +I will extract a subset of `Dt.PieChart` for `'Mexican'` and create a pie chart to show the percentage of facility type for Mexican patients. + +```{r} +Dt.Mexican <- Dt.PieChart[which(Dt.PieChart$Origin == 'Mexican'), ] + +ggplot(Dt.Mexican, + aes(x = '', # We don't want to show x axis + y = Percentage, # y axis is numerical percentage + fill = Type)) + # color for each facility type + geom_col(width = 1) + # create a bar chart with width = 1 + coord_polar(theta = "y") + # transfer y axis into polar coordinate + scale_fill_brewer() + # color + geom_text_repel(aes(y = text_y, + label = Percentage.Label), # label text + size = 4, # size of text + show.legend = FALSE, # remove legend + nudge_x = 1.5) + # the distance between text and figure + theme(axis.text = element_blank(), + axis.ticks = element_blank(), + axis.title = element_blank(), + panel.grid = element_blank(), + panel.background = element_blank(), + plot.background = element_blank(), + legend.background = element_blank(), + legend.title = element_text(size = 13, face = 'bold'), + legend.text = element_text(size = 13), + strip.text = element_text(size = 13, face = 'bold'), + strip.background = element_blank()) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Pie-Chart-In-R/main/Single%20Pie%20Chart.png) + +# 3. Multiple pies +For multiple pie charts, we will need to use `facet_grid()` functions to form a matrix of panels defined by row and column faceting variables. + +```{r} +ggplot(Dt.PieChart, + aes(x = '', # We don't want to show x axis + y = Percentage, # y axis is numerical percentage + fill = Type)) + # color for each facility type + geom_col(width = 1) + # create a bar chart with width = 1 + coord_polar(theta = "y") + # transfer y axis into polar coordinate + scale_fill_brewer() + # color + geom_text_repel(aes(y = text_y, + label = Percentage.Label), # label text + size = 4, # size of text + show.legend = FALSE, # remove legend + nudge_x = 1.5) + # the distance between text and figure + facet_grid(cols = vars(Origin), # faceted by Origin variable + scales = 'fixed') + + theme(axis.text = element_blank(), + axis.ticks = element_blank(), + axis.title = element_blank(), + panel.grid = element_blank(), + panel.background = element_blank(), + plot.background = element_blank(), + legend.background = element_blank(), + legend.title = element_text(size = 13, face = 'bold'), + legend.text = element_text(size = 13), + strip.text = element_text(size = 13, face = 'bold'), + strip.background = element_blank()) + +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Pie-Chart-In-R/main/Multiple%20Pie%20Chart.png) + +Here we go! diff --git a/_posts/2022-09-13-Shaded-Area.md b/_posts/2022-09-13-Shaded-Area.md new file mode 100644 index 00000000000..27c7035ff59 --- /dev/null +++ b/_posts/2022-09-13-Shaded-Area.md @@ -0,0 +1,127 @@ +--- +title: "Shaded Area Plot in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + +![Shaded Area](https://raw.githubusercontent.com/YzwIsALaity/Shaded-Area-Plot-Tutorial-in-R/4a3120d1ac3cbfd5a5d7068b3f941a0867650eb0/Version%202.jpeg) + + +In this tutorial, we will go through __different types of plots with shaded areas in R with `ggplot2`__ and how we can __integrate a shaded area plot with other elements (i.e., bar plot)__. The shaded area plot can help us to better understand how different types of rate/prevalence/incidence/frequency/... change over time. + +## 1. Format of dataset +The dataset we used for the tutorial is a public dataset from [Seattle Flu Study](https://seattleflu.org/pathogens). After some basic data cleaning, we only keep 6 coloumns: + +- `Epi.Week`: this is an __epidemiological week__, a standardized method of counting weeks to allow for the comparison of data year after year __[36 unique epi weeks]__ (string); + +- `Date`: this is the __calendar time__ (Month-Year) (string); + +- `Count`: this is the __frequency of detected respiratory pathogens__ during each `Date` period (numerical); + +- `Prevalence`: this is the __proportion of persons who have been detected respiratory pathogens__ during each `Date` period (numerical); + +- `Virus`: this is the __label for a type of viral infection__ __[8 viruses: 'RSV', 'Flu', 'Adenovirus', 'Enterovirus', 'Human Coronavirus', 'Human Parainfluenza', 'Human Metapneumovirus', 'Rhinovirus']__(string); + +- `Index`: this is a variable used to __indicate the order of calendar time__ (numerical). + +![](https://raw.githubusercontent.com/YzwIsALaity/Shaded-Area-Plot-Tutorial-in-R/4a3120d1ac3cbfd5a5d7068b3f941a0867650eb0/Dataset.png) + +## 2. Shaded area plot +The basic functions in `ggplot2` for shaded area plots are `geom_ribbon()` and `geom_area()` and we are going to introduce two functions: + +- `geom_ribbon()`: this is used to display a interval of `y` at a given `x` and it is defined by `ymin` and `ymax` within `aes(ymin, ymax)`; + +- `geom_area()`: this is a special case of `geom_ribbon()` since it sets `ymin = 0`. + +The first version we want to show is for the stacked "shaded area". The x-axis will be the calendar time (`Date`), the y-axis will be the frequency of detected respiratory pathogens (`Count`), and "shaded area" for each type of `Virus` will be stacked vertically. We will use the `geom_area()` in this example and use and we will a extra package `ggthemes` for color palette. + +```{r} +# Version 1: stacked +ggplot(Dt, aes(x = Index)) + # set up x-axis as numerical Index + geom_area(aes(y = Count, # pass Count for y axis + fill = Virus)) + # fill = Virus to set up different colors for viruses + scale_x_continuous(labels = unique(Dt$Date), # set up x-axis label as unique calendar time + breaks = unique(Dt$Index)) + + ylab('Frequency of Detected Virus') + xlab('Calendar Time (Year-Month)') + + scale_fill_tableau("Tableau 10") + # we will use the color palette from Tableau + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), # these three are for the background and grid + panel.background = element_blank(), + panel.grid.major = element_blank(), + axis.line.x = element_line(), + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black", size = 11, # rotate text in x axis 45 degrees and move + angle = 45, hjust = 1), # text to the left side for 1 unit + axis.text.y = element_text(colour = "black", size = 11), + axis.ticks.x = element_line(), + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", size = 11, face = 'bold'), + legend.title = element_text(colour = "black", size = 11, face = 'bold')) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Shaded-Area-Plot-Tutorial-in-R/4a3120d1ac3cbfd5a5d7068b3f941a0867650eb0/Version%201.jpeg) + +In the above example, the frequency of detected virus is stacked vertically at each time point and the "shaded area" for each type of virus will be marked by a specific color (color palette is similar to __Tableau__). The next example is to integrate "shaded area" with a bar plot and we set "shaded area" for prevalence of detected viruses and bar for frequency of detected viruses. To simplify the figure, we will focus on the __prevalence and frequency of Adenovirus and Enterovirus__ to demonstrate the use of `geom_ribbon()`. Since the units of frequency and prevalence are different, we will implement the __second y axis__ in the plot to distinguish values for frequency and prevalence. + +```{r} +# Extract records for Adenovirus and Enterovirus +Dt.Sub <- Dt[which(Dt$Virus %in% c('Adenovirus', 'Enterovirus')), ] +# Check distribution of Count and Prevalence +summary(Dt.Sub[, c('Count', 'Prevalence')]) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Shaded-Area-Plot-Tutorial-in-R/4a3120d1ac3cbfd5a5d7068b3f941a0867650eb0/Distribution.png) + +In `ggplot2`, the __second y axis__ need to be tranformed from the __main y axis__ and we therefore need to check ranges of `Count` and `Prevalence`. We will set up `Count` as the __main y axis__ and transform the scale of `Prevalence` to match the scale of `Count` and we need to pass this process into `geom_ribbon()` and `scale_y_continuous()`: + +- `geom_ribbon()`: this function requires passing `ymin` and `ymax` into `aes()` [`geom_ribbon(aes(ymin = 0, ymax = Prevalence * 325 / 0.11))`: __max of `Count` is 325 and max of `Prevalence` is 0.11__]; + +- `scale_y_continuous()`: this function is used to modify the __second y axis__ through the argument `sec.axis = ` and the function `sec_axis()` [`scale_y_continuous(sec.axis = sec_axis(~ . * 0.11/325, name = 'Prevalence'))`] + + + `sec_axis()`: it is used to specify the __second y axis__ and `~.` can pass the scale of __main y axis__ with the transform function (`~ . * 0.11 / 325`). + +Noticed that __frequency and prevalence of virus__ come with different labels so we will create two types of labels for them and the below code trunk is used to create labels. + +```{r} +# Create labels for 'Count' and 'Prevalence' +Dt.Sub$Virus_Label <- factor(ifelse(Dt.Sub$Virus == 'Adenovirus', 'Diagnoses, Adenovirus', 'Diagnoses, Enterovirus'), + levels = c('Diagnoses, Adenovirus', 'Diagnoses, Enterovirus')) +Dt.Sub$Axis_Label <- factor(ifelse(Dt.Sub$Virus == 'Adenovirus', 'Prevalence, Adenovirus', 'Prevalence, Enterovirus'), + levels = c('Prevalence, Adenovirus', 'Prevalence, Enterovirus')) +``` + +After we set up all preprocessing for the subdataset, we are going to integrate a bar plot for frequency of detected viruses with "shaded areas" for prevalence of viruses. + +```{r} +# Version 2: bar + shaded area +## preselected color for different labels +Col <- c('gold2', 'purple', 'green4', 'plum3') +## figure +ggplot(Dt.Sub, aes(x = Index)) + + geom_bar(aes(y = Count, fill = Virus_Label), stat = 'identity', position = position_dodge()) + + geom_ribbon(aes(x = Index, + ymin = 0, ymax = Prevalence * 325 / 0.11, + fill = Axis_Label)) + + scale_y_continuous(sec.axis = sec_axis(~ . * 0.11/325, name = 'Prevalence')) + + scale_x_continuous(labels = unique(Dt$Date), # set up x-axis label as unique calendar time + breaks = unique(Dt$Index)) + + scale_fill_manual(values = c(Col[1:2], alpha(Col[3], 0.3), alpha(Col[4], 0.6))) + + ylab('No. of Cases') + xlab('Calendar Time (Year-Month)') + + guides(fill = guide_legend(title = "")) + + theme_bw() + + theme(axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.title.x = element_text(color = 'black', face = "bold"), + axis.title.y = element_text(color = 'black', face = "bold"), + axis.text.x = element_text(color = 'black', face = "bold", angle = 45, hjust = 1), + axis.text.y = element_text(color = 'black', face = "bold"), + legend.text = element_text(color = 'black', face = "bold"), + legend.title = element_text(face = "bold", color = 'black'), + plot.title = element_text(hjust = 0.5, face = "bold", color = 'black')) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Shaded-Area-Plot-Tutorial-in-R/4a3120d1ac3cbfd5a5d7068b3f941a0867650eb0/Version%202.jpeg) + +Here we go! diff --git a/_posts/2022-11-04-Survival-Plot.md b/_posts/2022-11-04-Survival-Plot.md new file mode 100644 index 00000000000..2dac78eaaa0 --- /dev/null +++ b/_posts/2022-11-04-Survival-Plot.md @@ -0,0 +1,142 @@ +--- +title: "Kaplan Meier Curve in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + +![Kaplan Meier](https://raw.githubusercontent.com/YzwIsALaity/Survival-Plot-Tutorial-in-R/d7afc4c734d6dd42cc2ea63bd52ff43ca5c127de/Version%203.jpg) + + +In this tutorial, we will go through **survival curves (e.g. Kaplan-Meier curve, Cox model curve)** with an extension package **`survminer`** from **`ggplots`**. + +## 1. Format of dataset +The dataset used for the tutorial includes follow-up time for patients with a specific type of cancer and their corresponding endpoint for the last follow-up visits. It includes clinical variables for patients as well. The detail is listed below: + +- `PTID`: this is an **unique identification** for each participants **[102345 participants]** (string); + +- `Stage`: this is the **stage of cancer** where participants visited **[4 levels: 'Stage I', 'Stage II', 'Stage III', 'Stage IV']** (string); + +- `Time`: this is the **last follow-up/right-censored time** for each patient (numerical); + +- `Histology`: this is the **types of microscopic anatomy [6 types: 'Squamous Cell Carcinoma', 'Adenocarcinoma', 'Unknown', 'Unspecified Carcinoma', 'Other', 'Sarcoma']** (string); + +- `Treatment`: this is the **types of treatments that patients took [4 levels: 'Group 1', 'Group 2', 'Group 3', 'Group 4']** (string); + +![](https://raw.githubusercontent.com/YzwIsALaity/Survival-Plot-Tutorial-in-R/cdd019f687706b20a49d042677ca253cd0db7cde/Dataset%20Table.png) + +## 2. Kaplan-Meier Curve +## (1). Version 1: Basic +We will draw a basic version of Kaplan-Meier curve with the aim of **`survminer`**. We first need to fit a **Kaplan-Meier estimator** for the cancer `Stage` variable and this will **require `survival` package**. In here, we will implement the following functions: + +- `Surv(Time, Status)`: this function is used to create survival subject in R and it requires two main arguments--follow-up time (`time = Time`)/repeated measurement of time (`time = start, time2 = end`) and the event indicator (`event = Status`). +- `survfit()`: this function is used to fit a Kaplan-Meier curve with an survival subject input. + +```{r} +# Fit a Kaplan-Meier curve for cancer stage +KM <- survfit(Surv(Time, Status) ~ Stage, data = Dt.Plot) +``` + +After we have a Kaplan-Meier estimator, we can plug into **the main function for our plot `ggsurvplot()`**. There are several arguments for the function and we are going to introduce them: + +- `fit =`: it is used to plug in a survival estimator (e.g. output from `survfit(Surv(time, time2, event))`); + +- `data =`: we need to use the same dataset as the Kaplan-Meier estimator; + +- `pval = T`: it is used to **show p-value** for the log rank test for the comparison between different strata (at least $\geq 2$); + +- `pval.size = 4`: it is used to control **the size of the font for p-value** display; + +- `pval.coord =`: it is used to control **the location of p-value** display and it should be corresponding to the X and Y axes in the plot (e.g. `c(X.loc, Y.loc)`); + +- `legend =`: it is used to control **the location of legend** and it provides five choices ("top", "bottom", "left", "right", "none" [do not show legend]); + +- `surv.median.line =` it is a character vector for **drawing a horizontal/vertical line at median survival** and provides four choices ("hv", "h", "v", "none" [do not show median survival]). + +```{r} +# Basic: survical curve with ggplot +ggsurvplot(fit = KM, data = Dt.Plot, # Kaplan-Meier estimator and dataset + pval = T, # show p-value + pval.size = 4, # font size of p-value + pval.coord = c(175, 1.0), # coordination of p-value (x, y) + legend = 'right', # location of legend + surv.median.line = 'hv') # show median survival line +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Survival-Plot-Tutorial-in-R/d7afc4c734d6dd42cc2ea63bd52ff43ca5c127de/Version%201.jpg) + +Since the sample size of the dataset is large, the Keplan-Meier curves look smooth and log-rank test shows significance for difference in trends among stages. + +## (2). Version 2: Facet plot +The second version is to draw a multipanel survival plot. We want to stratify the Kaplan-Meier curve into different combinations of treatments and stages so we create another Kaplan-Meier estimator stratified by both `Treatment` and `Stage`. We also introduce two other arguments for this version: + +- `facet.by =`: it indicates a categorical variable (the name of variable) that you use to **create multi-panel for survival plots**; + + + It also accepts a character vector with length 2 (i.e., `c('Treatment', 'Stage')`); + +- `legend.labs =`: this is a character vector used to create a new labels for the legend and it has to be the same length as the initial legend. + +```{r} +KM2 <- survfit(Surv(Time, Status) ~ Stage + Treatment, data = Dt.Plot) +ggsurvplot(KM2, data = Dt.Plot, + facet.by = 'Treatment', # create multipanel used variable "Treatment" + pval = TRUE, + legend = 'right', + legend.labs = c('Stage I', 'Stage II', 'Stage III', 'Stage IV')) # create a self-defined legend label +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Survival-Plot-Tutorial-in-R/d7afc4c734d6dd42cc2ea63bd52ff43ca5c127de/Version%202.jpg) + +We can a 2-by-2 multi-panel survival plot, grouped by `Treatment` and in each panel, it is stratified by cancer `Stage`. Tests on trends are performed for each panel as well. In the next version, we are going to explore how to the modification of non-graphical argument for survival plots (i.e., font size/family, etc.). + +## (3). Version 3: Modification of non-graphical components + +In here, we first introduce a self create function for modifying non-graphical components and this self defined function **uses `ggplot2` arguments**. + +```{r} +# Customize theme +Customized.Theme <- function(){ + theme_survminer() %+replace% # a ggplot2 function + theme(panel.border = element_blank(), + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(family = 'Times', colour = "black", size = 11), # there two are for texts in axes + axis.text.y = element_text(family = 'Times', colour = "black", size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(family = 'Times', colour = "black", size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(family = 'Times', colour = "black", size = 11, face = 'bold', angle = 90, vjust = 3), + legend.title = element_text(family = 'Times', colour = "black", size = 11, face = 'bold'), + legend.text = element_text(family = 'Times', colour = "black", size = 11), + strip.text.x = element_text(family = 'Times', colour = "black", size = 11)) +} +``` + +In the `Customized.Theme()` function, we set the font family for text as "Times" and size as 11 for axes, legends, and labels. Meanwhile, we remove the background and grid and use self defined color/palette for different cancer `Stage` as usual. + +```{r} +# Color/palette for cancer stage +Legend.Labs <- c('Stage I', 'Stage II', 'Stage III', 'Stage IV') # create labels for cancer stages +n.color <- length(Legend.Labs) # the lenght of vector for labels +Palette.New <- rainbow(n.color, alpha = 0.5) # obtain a vector of colors for labels + +# Kaplan-Meier estimator +KM2 <- survfit(Surv(Time, Status) ~ Stage + Treatment, data = Dt.Plot) + +# Survival plot +ggsurvplot(KM2, data = Dt.Plot, + facet.by = 'Treatment', # create multipanel used variable "Treatment" + pval = TRUE, + legend = 'right', # legend position + legend.labs = Legend.Labs, # self-defined legend label + palette = Palette.New, # self-defined color + xlab = 'Time (Month)', ylab = 'Survival Probability', # modify X-axis and Y-axis labels + ggtheme = Customized.Theme()) # use modified non-graphical components +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Survival-Plot-Tutorial-in-R/d7afc4c734d6dd42cc2ea63bd52ff43ca5c127de/Version%203.jpg) + +Here we go! For other non-graphical components, we can change them in the `Customized.Theme()` and follow arguments in `ggplot2`. diff --git a/_posts/2022-12-02-CIF-Plot.md b/_posts/2022-12-02-CIF-Plot.md new file mode 100644 index 00000000000..2d386f71ca1 --- /dev/null +++ b/_posts/2022-12-02-CIF-Plot.md @@ -0,0 +1,112 @@ +--- +title: "Cumulative Incidence Function Plots in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + +![CIF Plot](https://raw.githubusercontent.com/YzwIsALaity/CIF-Plot-in-R/main/Cumulative%20Incidence%20Function.jpeg) + + +In this tutorial, we will explore how to create __plots for the cumulative incidence function__ in R using `ggplot2`, `survminer`, and `tidycmprsk`. The cumulative incidence function (CIF) is frequently used in survival analysis when __competing risks__ are present in the dataset. In traditional survival data, the survival label $\delta$ only has two classes: 0 or 1 (often referred to as alive or deceased). However, in the presence of competing risks, the survival outcome may involve more than two states, such as 0, 1, or 2 (often referred to as healthy, progressing, or deceased). + +# 1. Format of dataset +We will be using the monoclonal gammopathy data from the `survival` package, which consists of the natural history of 1341 sequential patients with monoclonal gammopathy of undetermined significance (__MGUS__). This dataset involves survival outcomes with three possible states: __censored, progression to plasma cell malignancy disease (PCM), and death due to MGUS__. After data preprocessing, there are 7 columns in the dataset: + +- `Sex`: this is a variable for sex __[2 levels (factor): "Female" and "Male"]__ (string); + +- `Status`: this is a combined event variable for three states __[3 levels (factor): "Censor", "Plasma Cell Malignancy", "Death"] (string)__; + +- `Time`: this is a __combined survival time (year)__ until any event happened or censored (numerical); + +- `PCM`: this is an event indicator for whether a patient progressed to PCM __[2 levels (factor): "No", "Yes"] (string)__; + +- `PCM_Time`: this is a __survival time (year)__ until the `PCM` event happened or censored (numerical); + +- `Death`: this is an event indicator for whether a patient died __[2 levels (factor): "No", "Yes"] (string)__; + +- `Death_Time`: this is a __survival time (year)__ until the `Death` event happened or censored (numerical). + +![](https://raw.githubusercontent.com/YzwIsALaity/CIF-Plot-in-R/main/Dataset.jpeg) + +# 2. Cumulative incidence function plot +## (1). Cumulative incidence function +Before creating a figure, we need to construct a model for the cumulative incidence function in order to __examine the differences in survival among the three states, stratified by the sex of the patients__ and we need to use `cuminc()` function from the `tidycmprsk` package to build a CIF model. + +```{r} +# Cumulative incidence functioin +require(tidycmprsk) +CIF <- cuminc(Surv(Time, Status) ~ Sex, data = Dt.Plot) +``` + +`cuminc()` function requires two main arguments: + +- `formula`: it should be a `Surv()` on LHS and covariates on RHS; + +- `data`: it is the dataset used to estimate cumulative incidence. + +After we fitted a cumulative incidence function, we can pass it into functions for creating figures. + +## (2). Customized theme +To more effectively __customize non-graphical components__ in `ggplot2` for survival data, it is recommended to create a function that allows for theme customization. + +```{r} +# Customize theme of kaplan meier curve +Customized.Theme <- function(){ + theme_survminer() %+replace% # a ggplot2 function + theme(panel.border = element_blank(), + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(size = 11), # there two are for texts in axes + axis.text.y = element_text(size = 11), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(size = 11, face = 'bold', vjust = -1), + axis.title.y = element_text(size = 11, face = 'bold', angle = 90, vjust = 3), + legend.title = element_text(size = 11, face = 'bold'), + legend.text = element_text(size = 11), + legend.position = 'right', + plot.title = element_text(hjust = 0.5)) +} +``` + +In the `Customized.Theme()` function, we set the font weight to bold and font size to 11 for the x-axis label, y-axis label, and main title. Additionally, we remove the background and grid lines. + +## (3). Plot +Now, we will create a figure to visualize the cumulative incidence function (`CIF`). To do this, we will use the `ggcuminc()` function from the `ggsurvfit` package, which requires three arguments: + +- `x`: this argument represents a cumulative incidence function model from either `cuminc()` or `survfit2()`; + +- `outcome`: this is an argument for indicating which outcome(s) to include in plot and default is to include the first competing event; + +- `theme`: this is an argument for a `survfit` theme and default is `theme_ggsurvfit_default()`. + +```{r} +# Create title and outcome +Title <- paste0('Cumulative Incidence of Death from \n MGUS or Progression to PCM Stratified by ', 'Sex') +Oucome <- c("Plasma Cell Malignancy", "Death") + +# Create CIF plot +CIF_Plot <- ggcuminc(x = CIF, # the above CIF function + outcome = Oucome, # the label for competing events + theme = Customized.Theme()) + # customize theme + labs(x = 'Time (Years Post Diagnosis)', # modify xlab, ylab, and main title + y = 'Cumulative Incidence', + title = Title) +CIF_Plot +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/CIF-Plot-in-R/main/Cumulative%20Incidence%20Function.jpeg) + +Here we go! + +## (4). Comparison between Kaplan-Meier curve +In contrast to the cumulative incidence function, the __Kaplan-Meier curve__ estimates the failure rate for each competing event separately. When the event of interest is __progression to PCM__, the __death due to MGUS__ will be considered as censored events, in addition to the usual censored observations. __On the other hand, the cumulative incidence function uses the overall survival function, which takes into account events from both the competing events and the events of interest__. + +![](https://raw.githubusercontent.com/YzwIsALaity/CIF-Plot-in-R/main/Comparison%20of%20Curves.jpeg) + +It is clear that the Kaplan-Meier curve for progression to PCM treats events that are due to death as censored at later years post-diagnosis, which cannot be accurately represented. However, this situation can be visualized using the cumulative incidence function. + diff --git a/_posts/2023-01-28-PCA-Visualization.md b/_posts/2023-01-28-PCA-Visualization.md new file mode 100644 index 00000000000..2ff72a48bfe --- /dev/null +++ b/_posts/2023-01-28-PCA-Visualization.md @@ -0,0 +1,302 @@ +--- +title: "Visualization of PCA in R: Scatter & Trejactory" +mathjax: true +layout: post +categories: media +--- + +![Cover](https://raw.githubusercontent.com/YzwIsALaity/PCA-Visualization-Tutorial-in-R/main/The%20Cover.jpeg) + + + +In this tutorial, we will explore how to __visualization of data with principle component analysis (PCA)__ in R, using `ggplot2` and `plotly.` When dealing with __high dimensional data analysis (# of variables is much greater than # of obs.)__, PCA is one of the most popular choices for dimension reduction. As an unsupervised learning model, PCA does not require users to provide an outcome variable (y). Instead, it performs singular value decomposition or eigenvalue decomposition on a high-dimensional input matrix (X) to improve the interpretability of the data. It is often useful to visualize the results from a PCA model, and users typically focus on the first two or three principal components (PC1, PC2, and PC3) since they may explain a significant proportion of the variance in the input data (X). + +# 1. Dataset +With the development of technology for molecular-level observation, many new types of data have emerged, such as genomics, RNA-seq, proteomics, and neural signals. In this example, we used a simulated RNA-seq dataset to visualize the results of principal component analysis (PCA). The dataset assumes a cohort of __five unique patients__ who have been diagnosed with a disease and have __take an experimental medicine for 24 months__. During the first 12 months, the patients did not take the medicine, and during the later 12 months, they did take the medicine. There are 500 different RNA-seq values recorded each month. The dataset includes __503 columns and 120 observations__: + +- `Patient_ID`: this is an __unique identification__ for each participants [5 participants] (string); + +- `Time`: this is the __time in months__ for each patient (numerical); + +- `Treatment`: this is a __binary indicator__ for if a patient take the medicine at each `Time` point __[2 levels: 'No', 'Yes']__ (string); + +- `RNA_1`, `RNA_2`,..., `RNA_500`: these are all __numerical values for different RNA-seq__ (numerical). + +![](https://raw.githubusercontent.com/YzwIsALaity/PCA-Visualization-Tutorial-in-R/b914277f23d70ba3831a69bf0428310099d7b1dc/Dataset.jpeg) + +It is clear that the dataset is high-dimensional, as the number of variables (_p = 500_) is much greater than the number of observations (_n = 120_). PCA will be performed on 500 different RNA-seq variables. + +# 2. Principle component analysis +In this section, we will build up a PCA model with `prcomp()` function and the design matrix for 500 RNA-seq. It mainly requires two arguments: + +- `x`: a numeric or complex matrix/data frame for PCA + +- `scale`: a logical value indicating whether the variables should be scaled to have unit variance before the analysis takes place (in general, we need to scale data before putting into PCA). + +```{r} +# Extract columns for RNA-seq +X <- Dt[, -c(1:3)] +# Perform PCA +pca <- prcomp(x = X, scale = T) +# Extract first 3 PCs for each point in X +X_pca <- pca$x[, c('PC1', 'PC2', 'PC3')] +# Combine the value of rotated data with time and treatment info +Dt.Plot <- cbind(Dt[, 1:3], X_pca) +head(Dt.Plot) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/PCA-Visualization-Tutorial-in-R/b914277f23d70ba3831a69bf0428310099d7b1dc/Dataset%20after%20PCA.jpeg) + +There are many different packages in R that can perform various types of principal component analysis. Regardless of which one users choose to implement, the result will include a matrix that represents a rotation of the original dataset, and the visualization will be based on this matrix. After passing our RNA-seq data into a PCA model, we need to obtain a __matrix of the rotated data values__ via the argument `pca$x`. __This matrix represents a mapping of the original row observations into the space represented by the principal components__. Mostly, we want to visualize its results in 2D or 3D spaces so we extract first three principle components. In the next sections, we are going to show how we can __visualize PCA results with 2D and 3D spaces with__ `ggplot2` __and__ `plotly` __packages__. + +# 3. Basic: scatter plot in 2D or 3D +The standard method for visualizing the results of Principal Component Analysis (PCA) involves plotting a 2D scatter plot with the reduced dataset, with the x-axis representing `PC1` and the y-axis representing `PC2`. In our case, we will create two versions of this scatter plot, one with stratification by unique patient IDs and one without. +## (1). 2D scatter plot without stratification by IDs + +```{r} +require(ggplot2) +p_2D_PCA <- +ggplot(Dt.Plot, aes(x = PC1, y = PC2, col = Treatment)) + + geom_point(alpha = 0.8) + + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black"), # there two are for texts in axes + axis.text.y = element_text(colour = "black"), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", face = 'bold'), + legend.title = element_text(colour = "black", face = 'bold'), + legend.text = element_text(colour = "black")) +p_2D_PCA +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/PCA-Visualization-Tutorial-in-R/b914277f23d70ba3831a69bf0428310099d7b1dc/2D%20PCA%20Scatter.jpeg) + +The first scatter plot of PCA results is a 2D representation of the dataset without stratification by unique patient ID, but with points colored according to treatment. Despite performing dimension reduction, we did not observe any clear indication that treatments differed in RNA-seq levels. + +## (2). 2D scatter plot with stratification by IDs +Occasionally, users may be interested in examining how a PCA model performs on each individual subject. To enable this analysis, we generate a 2D scatter plot with stratification by subject IDs. We implement the `facet_grid()` function to arrange the stratification in a column orientation. + +```{r} +p_2D_PCA_ID <- +ggplot(Dt.Plot, aes(x = PC1, y = PC2, col = Treatment)) + + geom_point(alpha = 0.8) + + facet_grid(cols = vars(Patient_ID)) + + theme_bw() + # dark-on-light theme + theme(panel.border = element_blank(), + panel.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black"), # there two are for texts in axes + axis.text.y = element_text(colour = "black"), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", face = 'bold'), + legend.title = element_text(colour = "black", face = 'bold'), + legend.text = element_text(colour = "black")) +p_2D_PCA_ID +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/PCA-Visualization-Tutorial-in-R/b914277f23d70ba3831a69bf0428310099d7b1dc/2D%20PCA%20Scatter%20with%20ID.jpeg) + +After stratifying the data by patient ID, we observed that the effect of treatments on RNA-seq varied among patients. Specifically, patients P_4 and P_5 showed minimal differences in RNA-seq in the 2D space created by principle components, while patient P_3 exhibited clear separation in treatments within the same 2D space. + +## (3). 3D scatter plot +To visualized the PCA results in 3D space, we need to implement `plotly` package and we are going introduce some functions in the package: + +- `plot_ly()`: it is similar to `ggplot()` function `ggplot2` package, which provides abstractions for creating figures from the package and it requires some arguments: + + - `data`: a matrix/data frame for plot + + - `x = ~ PC1`, `y = ~ PC2`, `z = ~ PC3`: they are used to define each axis in 3D space and need to column names in `data` with tilde `~` + + - `color`: it is used to put values mapped to relevant 'fill-color' attributes + + - `colors`: it is a vector/array used to specify what colors will be used in the figure + + - `marker = list(size = 5)`: it is used to specify the size of points in 3D space and it requires to provide a `list()` type of inputs + +- `add_markers()`: it is used to indicate that the trace type of the figure is a scatter plot + +- `layout()`: it is used to modify non-graphical components in a figure so we use it to modify titles for axes + +After we specify these functions based on our style, we can create a scatter plot in 3D space. + +```{r} +require(plotly) +p_3D_PCA <- plot_ly(data = Dt.Plot, # dataset + x = ~ PC1, y = ~ PC2, z = ~ PC3, # specify x/y/z axes + color = ~ Treatment, colors = c('#D6604D', '#4393C3'), # colors + marker = list(size = 5)) # point size +p_3D_PCA <- p_3D_PCA %>% add_markers() # specify scatter plot +p_3D_PCA <- p_3D_PCA %>% layout(scene = list(xaxis = list(title = 'PC1'), # axes title + yaxis = list(title = 'PC2'), + zaxis = list(title = 'PC3'))) +p_3D_PCA +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/PCA-Visualization-Tutorial-in-R/b914277f23d70ba3831a69bf0428310099d7b1dc/3D%20PCA%20Scatter.png) + +Unfortunately, although we represented the original data with three principal components, we did not observe any significant separation between the high-dimensional RNA-seq data after dimension reduction based on the effect of treatment. + +# 4. Advanced: aggregated trajectory plot in 2D or 3D +An alternative visualization method for __presenting PCA results of longitudinal RNA-seq data aggregated at the patient level is to represent it as a trajectory over time__, as opposed to using scatter plots. Currently, high-dimensional data is often measured repeatedly over time, and it has been observed that this data preserves its time order. As a result, the trajectory of the data over time in PCA results can also preserve the order of time. + +## (1). Aggregated PCA results +Instead of showing the PCA trajectory at the subject level, we first aggregate the data and calculate the mean value for each principal component at each time point. This allows us to show the PCA trajectory at an aggregated level. Therefore, we need to implement `dplyr` package to perform aggregation and mainly use three functions: + +- `%>%`: it pipes an object forward into a function or call expression + +- `group_by()`: it takes an existing data frame and converts it into a grouped data frame where operations are performed "by group" and it requires to specify a variable to group by + +- `summarise()`: it returns one row for each combination of grouping variables and it will contain one column for each grouping variable and one column for each of the summary statistics that you have specified + +```{r} +require(tidyr) +require(dplyr) +# Aggregated +Dt.Plot.Aggregated <- Dt.Plot %>% group_by(Time) %>% summarise(PC1_Aggregated = mean(PC1), + PC2_Aggregated = mean(PC2), + PC3_Aggregated = mean(PC3)) +Dt.Plot.Aggregated$'Treatment' <- factor(c(rep('No', 12), rep('Yes', 12)), + levels = c('No', 'Yes')) +head(Dt.Plot.Aggregated) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/PCA-Visualization-Tutorial-in-R/b914277f23d70ba3831a69bf0428310099d7b1dc/Dataset%20for%20aggregate%20PCA.jpeg) + +The aggregated dataset consists of five columns: time points, the means of principal components aggregated by patient level at different time points, and a treatment variable. After obtaining the aggregated dataset, we will implement smoothing functions, such as cubic splines, to obtain trajectories in 2D or 3D spaces. To do this, we will first use the aggregated PC values to fit a spline regression over time in each dimension using the `splinefun()` function. Then, we will use a series of consecutive points with equal length to make interpolations of the fitted spline. To specify `x` and `y` in the `splinefun()` function, we will set `x` as time and `y` as aggregated PC values. + +```{r} +# Interprolated points +Time <- Dt.Plot.Aggregated$Time +Interprolate <- seq(from = min(Time), to = max(Time), len = 200) +# Spline regression on each PC dimension +PC1 <- splinefun(Time, Dt.Plot.Aggregated$PC1_Aggregated)(Interprolate) +PC2 <- splinefun(Time, Dt.Plot.Aggregated$PC2_Aggregated)(Interprolate) +PC3 <- splinefun(Time, Dt.Plot.Aggregated$PC3_Aggregated)(Interprolate) +# Combine with treatment variable +Treatment <- factor(ifelse(Interprolate <= 12, 'No', 'Yes'), # 1-12 month: no treatment, 13-24 month: treatment + levels = c('No', 'Yes')) +Dt.Plot.Traj <- data.frame(Treatment, 'PC1' = PC1, 'PC2' = PC2, 'PC3' = PC3) +head(Dt.Plot.Traj) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/PCA-Visualization-Tutorial-in-R/b914277f23d70ba3831a69bf0428310099d7b1dc/Dataset%20after%20PCA.jpeg) + +Since we used 200 consecutive points, `Dt.Plot.Traj` data frame has 200 rows and 4 columns. + +## (2). 2D trajectory plot +To create a figure for 2D PCA trajectory, we mainly use the `geom_path()` function which is used to connect the observations in __the order in which they appear in the data__ and in our data this order is corresponding to time. To better highlight the order of time, we will use an argument for arrow inside `geom_path()` with `arrow()` function which is required to specifiy: + +- `angle`: the angle of the arrow head in degrees + +- `ends`: indicating which ends of the line to draw arrow heads ("last", "first", or "both") + +- `type`: indicating whether the arrow head should be a closed triangle ("open" or "closed") + +```{r} +p_2D_PCA_Traj <- + ggplot(Dt.Plot.Traj, aes(x = PC1, y = PC2, col = Treatment)) + + geom_path(arrow = arrow(angle = 10, + ends = "last", # arrow at the end of a curve + type = "closed")) + # closed triangle for arrow head + theme_bw() + # dark-on-light theme + theme(axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black"), # there two are for texts in axes + axis.text.y = element_text(colour = "black"), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", face = 'bold'), + legend.title = element_text(colour = "black", face = 'bold'), + legend.text = element_text(colour = "black")) +p_2D_PCA_Traj +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/PCA-Visualization-Tutorial-in-R/b914277f23d70ba3831a69bf0428310099d7b1dc/2D%20PCA%20Trejactory.jpeg) + +In the 2D version, the effect of treatment over time in 2D PC space shows different trends. Without treatment, the trajectory tends to move towards the upper-left corner over time, whereas after treatment is applied, it tends to move towards the lower-right corner. + +## (3). 3D trajectory plot +Compared to 2D version, 3D version can better visualize the development of trajectory and we still use `plot_ly()` function but need to specify other arguments: + +- `type`: it is used to specify the trace type (we need to specify `type` as `"scatter3d"` to serve as a base for line plot in 3D) + +- `mode`: it is used to specify what users want to show in a figure + +- `line`: it is used to control graphic components for lines + +```{r} +p_3D_PCA_Traj <- plot_ly(Dt.Plot.Traj, + x = ~ PC1, y = ~ PC2, z = ~ PC3, + color = ~ Treatment, colors = c('#D6604D', '#4393C3'), + type = 'scatter3d', # trace is 'scatter3d' + mode = 'lines', # show lines + line = list(width = 4)) # Control line width +p_3D_PCA_Traj <- p_3D_PCA_Traj %>% layout(scene = list(xaxis = list(title = 'PC1'), + yaxis = list(title = 'PC2'), + zaxis = list(title = 'PC3'))) +p_3D_PCA_Traj +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/PCA-Visualization-Tutorial-in-R/b914277f23d70ba3831a69bf0428310099d7b1dc/3D%20PCA%20Trejactory.jpeg) + +Here we go! + +## (4). 3D trajectory in subject level +For trajectory in subject level, we first need to fit separated smoothers for each PC dimension and each patient. + +```{r} +id <- unique(Dt.Plot$Patient_ID) +n_id <- length(id) +Dt.Plot.Subject <- data.frame() +for(i in 1:n_id){ + Loca <- which(Dt.Plot$Patient_ID == id[i]) + Trans <- Dt.Plot[Loca, ] + # Interprolated points + Time <- Trans$Time + Interprolate <- seq(from = min(Time), to = max(Time), len = 200) + # Spline regression on each PC dimension + PC1 <- splinefun(Time, Trans$PC1)(Interprolate) + PC2 <- splinefun(Time, Trans$PC2)(Interprolate) + PC3 <- splinefun(Time, Trans$PC3)(Interprolate) + # Combine treatment variable + Treatment <- factor(ifelse(Interprolate <= 12, 'No', 'Yes'), # 1-12 month: no treatment, 13-24 month: treatment + levels = c('No', 'Yes')) + # Result + Result <- data.frame('Patient_ID' = rep(id[i], 200), + 'Treatment'= Treatment, + 'PC1' = PC1, + 'PC2' = PC2, + 'PC3' = PC3) + Dt.Plot.Subject <- rbind(Dt.Plot.Subject, Result) +} +``` + +To create a figure for 3D PCA trajectory in subject level, we just need to use `Patient_ID` in `color` argument. + +```{r} +p_3D_PCA_Traj <- plot_ly(Dt.Plot.Subject, + x = ~ PC1, y = ~ PC2, z = ~ PC3, + color = ~ Patient_ID, + type = 'scatter3d', # trace is 'scatter3d' + mode = 'lines', # show lines + line = list(width = 4)) # Control line width +p_3D_PCA_Traj <- p_3D_PCA_Traj %>% layout(scene = list(xaxis = list(title = 'PC1'), + yaxis = list(title = 'PC2'), + zaxis = list(title = 'PC3'))) +p_3D_PCA_Traj +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/PCA-Visualization-Tutorial-in-R/b914277f23d70ba3831a69bf0428310099d7b1dc/3D%20PCA%20Trajectory%20Subject%20Level.jpeg) diff --git a/_posts/2023-03-06-Regular-Raster-Plot.md b/_posts/2023-03-06-Regular-Raster-Plot.md new file mode 100644 index 00000000000..a7678d7c8ec --- /dev/null +++ b/_posts/2023-03-06-Regular-Raster-Plot.md @@ -0,0 +1,70 @@ +--- +title: "Regular Raster Plot in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + +![Cover](https://raw.githubusercontent.com/YzwIsALaity/Regular-Raster-Plot-In-R/main/2D%20Raster%20Plot.jpg) + + +This is a brief tutorial on how to create __a raster plot in Python using the__ `imshow()` __function from the `matplotlib` library__. The focus is on displaying __1-dimensional discrete time series data (i.e., neural signal recordings of spikes over time)__ using the `ggplot2` package. While there are no functions in `ggplot2` that perform exactly the same operation as `imshow()`, we can still use alternative functions to achieve the same result. + +# 1. Dataset +The simulated dataset is pretty simple which only includes two columns: + +- `Time`: this is the __time in seconds__ [1 - 1800 seconds] (numerical) + +- `State`: this is a __categorical variable indicating if there is a spike [4 levels: "Baseline 1", "Signal 1", "Baseline 2", "Signal 2"]__ (string) + +![](https://raw.githubusercontent.com/YzwIsALaity/Regular-Raster-Plot-In-R/main/Dataset.jpeg) + +Our objective is to display this 1-dimensional data over time! + +# 2. Plot +To create this figure, we first need to create a numerical y axis as a new column `Level` with value one, and then set `State` variable as a factor with four levels. + +```{r} +Dt.Plot$'Level' <- 1 +Dt.Plot$State <- factor(Dt.Plot$State, + levels = c('Baseline 1', 'Signal 1', 'Baseline 2', 'Signal 2')) +``` + +The core function to create this plot is `geom_bin_2d()` which is used to divide the plane into rectangles, count the number of cases in each rectangle, and then (by default) map the number of cases to the rectangle's fill. It also requires to set up an argument `stat = "identity"` to display the sum of values in the points column. After that, we want to minimize the gap between axes and bar with two functions: + +- `scale_x_continuous()` and `scale_y_continuous()`: it is used to modify the x/y axis and two core arguments need to be passed into these two functions + + + `limits`: it is used to provide a limit of the axis (noticed that the default __length of the bar in y axis is 0.5-1.5__) + + + `expand`: it is used to provide a vector of range expansion constants used to add some padding around the data to ensure that they are placed some distance away from the axes and we need to __set up as `expand = c(0, 0)`__ + +Now we can create this figure! + +```{r} +ggplot(Dt.Plot, aes(x = Time, + y = Level, + fill = State)) + + geom_bin_2d(stat = "identity") + + scale_x_continuous(limits = c(0, max(Dt.Plot$Time)), # the range of x axis: 0-1800s + expand = c(0, 0)) + # there is no gap between bar and x axis + scale_y_continuous(limits = c(0.5, 1.5), # the range of y axis: 0.5-1.5 (default length of bar) + expand = c(0, 0)) + # there is no gap between bar and x axis + labs(x = 'Time (s)', # set up labels for x and y axes + y = 'States of Neural Behavior') + + theme_bw() + # dark-on-light theme + theme(panel.border = element_rect(colour = "black"), # keep border of figure + panel.grid = element_blank(), + axis.text.x = element_text(colour = "black", size = 11), # text to the left side for 1 unit + axis.text.y = element_blank(), + axis.ticks.x = element_line(), + axis.ticks.y = element_blank(), # remove ticks in y axis + axis.title.x = element_text(colour = "black", size = 11, face = 'bold'), + axis.title.y = element_text(colour = "black", size = 11, face = 'bold'), + legend.title = element_blank()) # remove title of legend + + +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Regular-Raster-Plot-In-R/main/2D%20Raster%20Plot.jpg) + +Here we go! diff --git a/_posts/2023-04-29-Bump-Chart.md b/_posts/2023-04-29-Bump-Chart.md new file mode 100644 index 00000000000..ce3274d0be7 --- /dev/null +++ b/_posts/2023-04-29-Bump-Chart.md @@ -0,0 +1,95 @@ +--- +title: "Bump Chart in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + + +![Cover](https://raw.githubusercontent.com/YzwIsALaity/Bump-Chart-Tutorial-in-R/09e5ddb7a89f9ca80013383e2ef6fee2b5666ffb/Bump%20Chart%202.jpeg) + + +In this tutorial, I aim to demonstrate how to effectively visualize the changes in ranks over time using a __bump chart__ created with `ggplot2` and `ggbump` package. A bump chart is a specialized type of line plot specifically designed to __display the relative ranks or orders of subjects as they evolve over time__. __Unlike an alluvium plot__, which showcases the actual values or metrics for each subject over time, a bump chart focuses solely on the ranking aspect. To accomplish this, I will utilize the publicly available dataset for Covid-19 surveillance provided by the [Seattle Flu Study](https://seattleflu.org/pathogens). This dataset comprises the recorded number of specimens received in various collection channels in Seattle, WA, for each month. Consequently, it offers both the actual values and the corresponding ranks of specimens within each channel. + +## Dataset +After data preprocessing, we first look at the dataset, which comprises 4 columns:- + +- `collection_channel`: it indicated the channel of specimen collection __[7 levels: Childcare, City testing sites, Clinical residuals, Other, Public space, Shelter, Swab-n-send]__ (string); + +- `Year`: it indicated the calendar year (numerical); + +- `Total`: it indicated the __total number of collected specimens in a given year__ (numerical); + +- `Rank`: it indicated the rank of `Total` (numerical). + +![](https://raw.githubusercontent.com/YzwIsALaity/Bump-Chart-Tutorial-in-R/617b9e60ab59ebbfea3c94034e6ccf46b0c2cb9f/Dataset.png) + +We basically need to use `collection_channel`, `Year`, and `Rank` variables for creating bump charts. + +## Version 1 +To generate a bump chart, we primarily utilize the `geom_bump()` function from the `ggbump` package. This function creates a smooth line based on the rank of subjects over time and it mainly requires two arguments: + +- `linewidth`: it is used to indicate the width of the smooth line, with a larger number indicating a wider line.; + +- `smooth`: it is used to determine the degree of smoothness for the curve, with a higher value indicating a steeper smooth curve. + +To complete the chart, we should assign the `Year` variable to the x-axis and the `Rank` variable to the y-axis. Additionally, it is important to assign a distinct color to each type of `collection_channel.` +```{r} +p <- +ggplot(Dt, + aes(x = Year, # x axis: Year + y = Rank, # y axis: Rank + col = collection_channel)) + # Each collection channel has a distinct color + geom_bump(linewidth = 2, # Set line width as 2 + smooth = 8) # Set smoothness as 8 +p +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Bump-Chart-Tutorial-in-R/617b9e60ab59ebbfea3c94034e6ccf46b0c2cb9f/Bump%20Chart%201.jpeg) + +In this preliminary bump chart, it is obvious that __the first rank, representing the highest count of collected specimens, is positioned at the bottom of the y-axis__. However, some people may __prefer to have the first rank at the top of the chart__. To accommodate this preference, we need to reverse the order of the y-axis scale. + +## Version 2 +In addition to that, we would like to make a few more modifications to enhance the figure. Firstly, we want to display only rounded values for the `Year` on the x-axis. Secondly, we aim to replace the legend for `collect_channel` by __placing the corresponding label on the left and right sides of the chart__. To achieve a cleaner appearance, we intend to __remove the background, border, and grid lines__ in the bump chart. Finally, we would like to utilize a different color palette, distinct from the default one, by employing the `scale_color_tableau()` function from the `ggthemes` package. +```{r} +p <- +ggplot(Dt, aes(x = Year, + y = Rank, + col = collection_channel)) + + geom_bump(linewidth = 2, + smooth = 8) + + geom_point(size = 5) + # Add points + geom_text(data = Dt[which(Dt$Year == 2019), ], # Left: label of collection_channel + aes(x = Year - 0.02, # Move label to left a little bit + y = Rank, + label = collection_channel), # Label names + size = 5, # Size of label + hjust = 1) + # Adjust horizontal location + geom_text(data = Dt[which(Dt$Year == 2021), ], # Right: label of collection_channel + aes(x = Year + 0.02, # Move label to right a little bit + y = Rank, + label = collection_channel), + size = 5, + hjust = 0) + # Adjust horizontal location + scale_y_reverse(limits = c(7, 1), # Reverse the y scale + breaks = rev(seq(1, 7, 1))) + + scale_x_continuous(limits = c(2018.9, 2021.1), # Round x scale + breaks = seq(2019, 2021, 1)) + + scale_color_tableau(palette = 'Color Blind') + # Use Tableau color palette + labs(x = NULL, # Remove title for x axis + y = 'Rank') + + theme_bw() + + theme(axis.ticks = element_blank(), # Remove ticks in axes + panel.border = element_blank(), # Remove border + panel.grid = element_blank(), # Remove grid lines + panel.background = element_blank(), # Remove background + plot.background = element_blank(), + legend.position = "none", + axis.text = element_text(size = 15, face = 'bold'), + axis.title.y = element_text(size = 15, face = 'bold')) +p +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Bump-Chart-Tutorial-in-R/09e5ddb7a89f9ca80013383e2ef6fee2b5666ffb/Bump%20Chart%202.jpeg) + +Here we go! diff --git a/_posts/2023-06-04-Maps.md b/_posts/2023-06-04-Maps.md new file mode 100644 index 00000000000..c2d783eebea --- /dev/null +++ b/_posts/2023-06-04-Maps.md @@ -0,0 +1,216 @@ +--- +title: "Map in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + + +![Cover](https://raw.githubusercontent.com/YzwIsALaity/Map-Tutorial-in-R/ded9575dbb1358e79cafd249af654759aabadd01/Map%20Version%201.jpeg) + + + +In this tutorial, I will be demonstrating how to __plot geographical information in R using the__ `ggplot2` __package__. In addition to the core package, we will also need to load `ggthemes`, `ggrepel`, and `tigris` for visualization, labeling, and text on maps. Generally, __a 2D map is represented using a latitude and longitude coordinate system, with longitude typically placed on the x-axis and latitude on the y-axis__. For this tutorial, we will utilize two COVID-19 surveillance datasets obtained from the [SCAN program](https://seattleflu.org/history/scandashboards) in King County, Washington and the [Washington Department of Health](https://doh.wa.gov/emergencies/covid-19/data-dashboard). In the `ggplot2` package, map visualization primarily involves two types of functions: `geom_polygon()` for __plotting polygons__ and `geom_sf()` for __plotting simple feature (sf) objects__, which are commonly used for mapping data with latitude and longitude coordinates. First, we will cover the usage of `geom_polygon()`, followed by an exploration of `geom_sf()`. + +# 1. Polygon Version +## (1). Data preprocessing +We used a dataset to plot the polygon version, which contains records of COVID-19 cases confirmed by PCR tests in Washington state, and is aggregated at the county level, pertaining to the year 2021. It can be accessed through the website for [Washington Department of Health](https://doh.wa.gov/emergencies/covid-19/data-dashboard). The dataset is very simple and only includes two columns: + +- `County`: a categorical __list of counties__ in the Washington state __[39 unique counties]__ (string); + +- `Frequency`: a numerical frequency of confirmed COVID-19 cases for each county (numerical). + +To obtain geographical information such as latitude and longitude coordinates for each county in Washington state, we needed to extract this data since it was not originally included in the dataset. + +![](https://raw.githubusercontent.com/YzwIsALaity/Map-Tutorial-in-R/ded9575dbb1358e79cafd249af654759aabadd01/First%20Covid%20Dataset.png) + +The `ggplot2` provides a convenient function `map_data()` that easily turns data from the `maps` package into a data frame suitable for plotting with `ggplot2`, and it mainly requires two arguments: + +- `map`: name of map provided by the `maps` package. These include `maps::county()`, `maps::france()`, `maps::italy()`, `maps::nz()`, `maps::state()`, `maps::usa()`, `maps::world()`, `maps::world2()`; + +- `region`: name(s) of subregion(s) to include. Defaults to `.` which includes all subregions. + +In `Map_WA` dataframe, it includes three columns: + +- `lon` and `lat`: the numerical latitude and longitude coordinates (numerical); + +- `id`: the name of counties (string). + +```{r} +# Obtain a map for the Washington state and it is in county level +Map_WA <- map_data(map = 'county', region = 'washington') %>% select(lon = long, lat, id = subregion) +head(Map_WA) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Map-Tutorial-in-R/ded9575dbb1358e79cafd249af654759aabadd01/map_data%20Dataset.png) + +To __plot each county in this dataframe as a polygon__, we assign multiple pairs of latitude and longitude coordinates to describe the county's boundary. By using the `geom_polygon()` function, we can connect these coordinate pairs to create the polygon representation. To maintain consistency with the `Dt` dataframe, it is necessary to change the column name from `id` to `County` for the `Map_WA` dataframe and capitalize the name of each county. We can achieve this by utilizing the `str_to_title()` function from the `stringr` package. Once these modifications are made, we will merge the dataset for COVID-19 frequency and `Map_WA` dataframes to obtain the final dataframe `Dt_Plot` for generating figures. In the first plot, we aim to __highlight the top three counties with the highest number of confirmed COVID-19 cases in 2021, namely King, Pierce, and Snohomish counties__. Additionally, we intend to __mark four major cities (Seattle, Bellevue, Kirkland, and Redmond) within King county as points on the map through latitude and longitude coordinates__ since there are lots of software engineers lol. + +```{r} +# Convert first letter of every word to uppercase +require(stringr) +Map_WA$id <- str_to_title(Map_WA$id) + +# Change "id" to "County" +colnames(Map_WA)[3] <- 'County' + +# Join geographical data with COVID-19 data +Dt_Plot <- left_join(Map_WA, Dt, by = 'County') + +# Highlight counties with confirmed cases > 50000 +HighlightCounty <- Dt_Plot[which(Dt_Plot$County %in% c('King', 'Pierce', 'Snohomish')), ] + +# Mark location of Seattle, Bellevue, Kirkland with +PointedCity <- data.frame('City' = c('Seattle', 'Bellevue', 'Kirkland', 'Redmond'), + 'lon' = c(-122.335167, -122.200676, -122.20715, -122.121513), + 'lat' = c(47.608013, 47.610378, 47.676607, 47.673988)) +``` + +After completing the data preprocessing, we are now moving forward with the creation of a map to visualize the total number of COVID-19 cases in Washington state at the county level. + +## (2). Figure 1: Polygon +As we mentioned above, __a 2D map is represented using a latitude and longitude coordinate system, with longitude typically placed on the x-axis and latitude on the y-axis__. To create a better 2D map, we need to take care of procedures below: + +- Based on the `Dt_Plot` dataframe, our primary implementation involves using the `geom_polygon()` function. This function __connects all the points used to describe a polygon, and the interior of the polygon is colored based on the `fill` argument__. Since our goal is to visualize the frequency of COVID-19 cases, we will __set `fill = Frequency` to create a continuous scale of colors ranging from light to dark__. This will effectively __represent the magnitude of case counts__. __The `group` argument determines which points are connected to form a polygon__. In our case, we __treat each county as a unique group__, so we will set `group = County`. __The `coord_sf()` is used to ensures that all layers use a common Coordinate Reference Systems which is used to define, with the help of coordinates, how the two-dimensional, projected map is related to real locations on the earth__. + +- To __visualize selected counties with different colored boundaries__, we can __pass the `HighlightCounty` dataframe into an additional `geom_polygon()` function alongside the main one__. Since our goal is to highlight the boundaries without affecting the visualization of the COVID-19 case frequencies, __there is no need to `fill` in any colors. We can achieve this by setting `fill = NA`__. + +- To __visualized selected cities with points__, we can __pass the `PointedCity` dataframe into the `geom_point()` function where x and y axes are longitude and latitude__. Since these __cities are all in King county__, we need to __set `group = 'King'` to match the main `geom_polygon()` function__. To __label these cities__, we need to __use the `geom_text_repel()` function from the `ggrepel` package. The details of using this function can be found in my show case for__ [Pie Chart in R](https://yzwisalaity.github.io/Pie-Chart/). + +- For different color gradient, we will use the `scale_fill_gradient2_tableau()` function from the `ggthemes` package. We also change the legend title for continuous color scale with the `guide_colorbar()` function. We also remove x and y axes in displaying the figure. + +Now we are going to combine them together to get a 2D map! + +```{r} +require(ggrepel) +require(ggthemes) +# Figure ################################################################################################### +p1 <- + ggplot(Dt_Plot, aes(x = lon, # x-axis: longitude + y = lat)) + # y-axis: latitude + geom_polygon(aes(group = County, # Draw polygon for each county + fill = Frequency), # Shade of color represented for magnitude of case counts + alpha = 0.8, # Contrl transparency of color + color = "grey70") + # Color for boundary line + geom_polygon(data = HighlightCounty, # Use the new dataset to draw highlighted counties + aes(col = County), # Boudary line for each county with different color + linewidth = 0.6, # Control line width + fill = NA) + # Do not fill any color for interior in this function + geom_point(data = PointedCity, # Use another dataset for marked cities + aes(x = lon, y = lat, # x-axis: longitude | y-axis: latitude + group = 'King'), # These cities are in King county + pch = 20) + # Control point type + geom_text_repel(data = PointedCity, + aes(x = lon, y = lat, + label = City, + group = 'King'), + fontface = "bold", # Make labels bold + nudge_x = c(-0.5, 0.5, 1, 1), + nudge_y = c(0.5, -0.5, 0.5, -0.5)) + + coord_sf() + + scale_fill_gradient2_tableau(palette = 'Gold-Purple Diverging') + + theme_bw() + + guides(fill = guide_colorbar(title = c("COVID-19 Cases"))) + # Modify legend title for color scale + ggtitle('Overall Confirmed COVID-19 Cases in \nWashington State in 2021') + + theme(panel.border = element_blank(), + panel.grid = element_blank(), + axis.text.x = element_blank(), + axis.text.y = element_blank(), + axis.ticks.x = element_blank(), + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_blank(), + legend.title = element_text(color = 'black', face = 'bold'), + plot.title = element_text(hjust = 0.5, face = 'bold')) # Center main title +p1 +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Map-Tutorial-in-R/ded9575dbb1358e79cafd249af654759aabadd01/Map%20Version%201.jpeg) + +Now we can visualize the overall frequency of confirmed COVID-19 cases in Washington state at the county level with a 2D map! + +# 2. Simple Feature (sf) Version +## (1). Data preprocessing +Unlike the use of polygons to represent a 2D map, the concept of simple feature (sf) pertains to a formal standard that outlines how real-world objects can be accurately represented in computers, with particular focus on the spatial geometry of these objects. Consequently, geographical information such as latitude and longitude coordinates are commonly stored as `sf` objects. For additional details about `sf` objects, you can refer to the documentation provided in the [`sf` package](https://r-spatial.github.io/sf/articles/sf1.html). To illustrate how we can use `sf` object to visualize maps in R, we plan to use the COVID-19 survelliance data from the [SCAN program](https://seattleflu.org/history/scandashboards) in King County, Washington. As we noticed in the previous section, King county had the biggest amount of overall confirmed COVID-19 cases in 2021, we then want to investigate how this number was distributed in cities belonged to King County. The SCAN program provide a comprehensive record of COVID-19 cases. + +To demonstrate the utilization of `sf` objects for visualizing maps in R, our plan is to utilize the COVID-19 surveillance data obtained from the [SCAN program](https://seattleflu.org/history/scandashboards) in King County, Washington. As highlighted in the preceding section, King County had the highest number of confirmed COVID-19 cases in 2021. Therefore, our objective is to explore the distribution of these cases across cities within King County. The SCAN program offers a comprehensive record of COVID-19 cases and one of its public dataset categorized by zip codes has 8 columns: + +- `geo_id`: a character list of zip codes for cities in King County (string); + +- `Population`: the number of population in each city (numerical); + +- `Frequency`: a numerical frequency of confirmed COVID-19 cases for each city (numerical); + +- `lat` and `lon`: the numerical latitude and longitude coordinates (numerical); + +- `City` and `State`: the name of city and state (string). + +It is important to note that each city may encompass multiple zip codes, and the latitude and longitude coordinates provided for a zip code only represent its center point. Therefore, the complete shape of each zip code area cannot be accurately described by a limited set of latitude and longitude coordinates, and we need to obtain a better record to describe each zip code area. + +![](https://raw.githubusercontent.com/YzwIsALaity/Map-Tutorial-in-R/ded9575dbb1358e79cafd249af654759aabadd01/Second%20Covid%20Dataset.png) + +In order to obtain comprehensive geographical information for each zip code area, we can utilize the `zctas()` function provided by the `tigris` package. This function requires the specification of several arguments: + +- `cb`: if cb is set to TRUE, download a generalized (1:500k) ZCTA file. Defaults to FALSE (the most detailed TIGER/Line file). If `cb = TRUE`, a small zip code dataset will be downloaded, and if `cb = FALSE`, a large full dataset will be downloaded; + +- `starts_with`: character vector specifying the beginning digits of the ZCTAs you want to return; + +- `year`: the data year; + +- `class`: desired class of return object. + +```{r} +require(tigris) +Zip <- zctas(cb = T, starts_with = c("98"), year = 2020, class = "sf") +Zip +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Map-Tutorial-in-R/ded9575dbb1358e79cafd249af654759aabadd01/Zip%20Code.png) + +Our objective is to extract two specific columns from the data: `ZCTA5CE20` (5-Digit ZIP Code Tabulation Area) and `geometry` (as an sf object) for the purpose of plotting a 2D map. Once we have retrieved these columns, we will merge them with the COVID-19 surveillance dataset. Additionally, we intend to emphasize the boundaries of four cities (Seattle, Bellevue, Kirkland, Redmond) by assigning them distinct colors. + +```{r} +Zip <- as.data.frame(Zip) # Change the dataset to dataframe +Zip <- Zip[, c('ZCTA5CE20', 'geometry')] # Obtain zip code and sf object columns +colnames(Zip)[1] <- 'geo_id' # Change the column name for zip code + +Dt_Plot <- left_join(Dt, Zip, by = 'geo_id') # Left join + +# Highlight cities +HighlightCity <- Dt_Plot[which(Dt_Plot$City %in% c('Seattle', 'Bellevue', 'Kirkland', 'Redmond')), ] +``` + +## (2). Figure 2: sf +After obtaining the `Dt_Plot` dataframe for creating a figure, we will utilize the `ggplot()` function. __The primary component we will utilize is `geom_sf()`, which renders different geometric objects based on the presence of sf objects in the data__. __The colors for the interior and exterior of a polygon (i.e., zip code area) are controlled by the `fill` and `col` arguments within the `aes()` function. In the first `geom_sf()` function, we set `fill = Frequency` to represent a gradient color scale based on the number of confirmed COVID-19 cases. In the second `geom_sf()` function, we set `col = City` to assign distinct colors to selected cities. However, we do not want to modify the color of the polygon's interior, so we pass `fill = NA` outside the `aes()` function in the second `geom_sf()`. It is important to note that we must specify `mapping = aes()` in the second `geom_sf()` function to avoid encountering an error__. + +```{r} +# Figure ################################################################################################### +p1 <- + ggplot(Dt_Plot) + + geom_sf(aes(fill = Frequency, + geometry = geometry), + alpha = 0.7) + + geom_sf(HighlightCity, + mapping = aes(geometry = geometry, col = City), + linewidth = 0.6, + fill = NA) + + scale_fill_gradient2_tableau(palette = 'Gold-Purple Diverging') + + theme_bw() + + guides(fill = guide_colourbar(title = c("COVID-19 Cases"))) + + ggtitle('Overall Confirmed COVID-19 Cases in \nKing County, WA up to May 24, 2023') + + theme(panel.border = element_blank(), + panel.grid = element_blank(), + axis.text.x = element_blank(), + axis.text.y = element_blank(), + axis.ticks.x = element_blank(), + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_blank(), + legend.title = element_text(color = 'black', face = 'bold'), + plot.title = element_text(hjust = 0.5, face = 'bold')) +p1 +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Map-Tutorial-in-R/ded9575dbb1358e79cafd249af654759aabadd01/Map%20Version%202.jpeg) + +Here we go! diff --git a/_posts/2023-07-14-Mosaic-Plots.md b/_posts/2023-07-14-Mosaic-Plots.md new file mode 100644 index 00000000000..d491aaa956f --- /dev/null +++ b/_posts/2023-07-14-Mosaic-Plots.md @@ -0,0 +1,188 @@ +--- +title: "Mosaic Plot in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + + +![Cover](https://raw.githubusercontent.com/YzwIsALaity/Mosaic-Plot-Tutorial-in-R/cdf4a2f47ce10d71e8821be97eeec1d3375d8b9b/Plot%204.jpeg) + + +In this tutorial, we will introduce the mosaic plot using `ggplot2` and the extension package `ggmosaic.` The __mosaic plot is a widely used visualization technique for displaying two or more categorical variables__ in a dataset. It is typically presented as a stacked percentage bar plot. In the case of a single categorical variable, it can be considered as a stacked bar plot representing counts or percentages. To demonstrate the capabilities of ggmosaic, we will utilize the [__National Cancer Database (NCDB)__](https://www.facs.org/quality-programs/cancer-programs/national-cancer-database/), which is a publicly available data resource. + +# 1. Dataset +The NCDB is a large database, but for visualization purposes, we only require a small portion of it. Specifically, we will focus on the dataset dedicated to cervical cancer, as the NCDB provides separate datasets for different types of cancer. The sample dataset after data preprocessing only includes 5 columns: + +- `Primary.Treatment`: it is a categorical variable that indicates the __type of primary (first) treatment__ received by a patient __[4 levels: Surgery, Radiation, Chemotherapy, Unknown]__ (string); + +- `Income.Level`: it is a categorical variable that indicates the __income quantile of the neighborhood__ for each patient __[5 levels: < $40,227, $40,227 - $50,353, $50,354 - $63,332, >= $63,333, Unknown]__ (string); + +- `Insurance.Type`: it is a categorical variable that indicates the __type of insurance__ for each patient __[5 levels: Private Insurance, Uninsured, Medicaid, Medicare, Other Government/Unknown]__ (string); + +- `Facility.Type`: it is a categorical variable that indicates the __type of medical facility__ for each patient __[3 levels: Community Cancer Program, Academic/Research Program, Integrated Network Cancer Program]__ (string); + +- `Status`: it is a categorical variable that indicates the __survival status of each patient at the last follow-up time [2 levels: Death, Alive]__ (string). + +![](https://raw.githubusercontent.com/YzwIsALaity/Mosaic-Plot-Tutorial-in-R/cdf4a2f47ce10d71e8821be97eeec1d3375d8b9b/Dataset.png) + +A mosaic plot is an ideal option for understanding the interactive relationship among these four categorical variables. In the next section, we will demonstrate the relationships among these variables. + +# 2. Mosaic plot +## (1). Income level and insurance type +The first example we are going to create is between `Income.Level` and `Facility.Type.` To create this plot, we will introduce two core functions: `geom_mosaic()` and `product()`. These functions are necessary for creating a mosaic plot. The core function for creating a mosaic plot is `geom_mosaic()`, which is similar to other `geom`-based functions in `ggplot2` package. It requires several arguments to be specified: + +- `mapping`: it is the same as the other `geom`-based functions and need to pass `aes()` function into it. + + + __However, when using `geom_mosaic()` in the `ggmosaic` package, the `aes()` function requires a specific function called `product()` to be used for the x-axis argument. This means that you need to specify `aes(x = product(Var1, Var2, ...)`) when using `geom_mosaic()`__. + + + `product(Var1, Var2, ...)`: to clarify, when using `product()` as a wrapper for a list, `Var1` will be treated as a variable on the y-axis, while `Var2` will be treated as a variable on the x-axis. This allows you to define the relationships between these variables in the mosaic plot. + +- `offset`: it is used to set the space between each stacked box, also known as the `spine` in a mosaic plot. By adjusting this argument, you can control the spacing between the individual categories in the plot. By default, `offset = 0.01`. + +Once we have specified the required arguments for the `geom_mosaic()` function, we can create the first version of the mosaic plot. By providing the necessary arguments, we can generate the initial representation of the plot. First, we will set up variables as factors to better order levels in categorical variables. + +```{r} +require(ggplot2) +require(dplyr) +require(ggmosaic) +require(ggthemes) +require(gridExtra) +# Set variables as factors +Dt$Income.Level <- factor(Dt$Income.Level, levels = c('Unknown', '$50,354 - $63,332', '$40,227 - $50,353', '< $40,227', '>= $63,333')) + +Dt$Facility.Type <- ifelse(Dt$Facility.Type == 'Integrated Network Cancer Program', 'Integrated Network \n Cancer Program', + ifelse(Dt$Facility.Type == 'Community Cancer Program', 'Community Cancer \n Program', + ifelse(Dt$Facility.Type == 'Academic/Research Program', 'Academic/Research \n Program', Dt$Facility.Type))) +Dt$Facility.Type <- factor(Dt$Facility.Type, levels = c('Integrated Network \n Cancer Program', 'Community Cancer \n Program', 'Academic/Research \n Program')) + +# Version 1.1: without legend +p1.1 <- +ggplot(data = Dt) + + geom_mosaic(aes(x = product(Income.Level, # y-axis: Income level + Facility.Type), # x-axis: Facility type + fill = Income.Level), # Color of "box" specified by income level + show.legend = FALSE) + # Text in y-axis has already showed label so don't need legend + xlab('Facility Type') + ylab('Neighborhood \n Income Level') + + scale_fill_tableau() + # Use better color palette + theme_mosaic() + # Specific theme for mosaic plot + theme(axis.ticks.x = element_blank(), # Remove ticks in x and y axes + axis.ticks.y = element_blank(), + axis.title.x = element_text(color = 'black', face = 'bold'), + axis.title.y = element_text(color = 'black', vjust = 3, face = 'bold'), + axis.text.x = element_text(color = 'black', angle = 45, hjust = 1), + axis.text.y = element_text(color = 'black')) + +# Version 1.2: with legend +p1.2 <- +ggplot(data = Dt) + + geom_mosaic(aes(x = product(Income.Level, # y-axis: Income level + Facility.Type), # x-axis: Facility type + fill = Income.Level)) + # Color of "box" specified by income level + xlab('Facility Type') + ylab('Neighborhood \n Income Level') + + guides(fill = guide_legend(title = "Neighborhood \n Income Level")) + # Modify legend title + scale_fill_tableau() + # Use better color palette + theme_mosaic() + # Specific theme for mosaic plot + theme(axis.ticks.x = element_blank(), # Remove ticks in x and y axes + axis.ticks.y = element_blank(), + axis.title.x = element_text(color = 'black', face = 'bold'), + axis.title.y = element_blank(), + axis.text.x = element_text(color = 'black', angle = 45, hjust = 1), + axis.text.y = element_blank(), + legend.title = element_text(color = 'black', face = 'bold')) + +grid.arrange(p1.1, p1.2, nrow = 1) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Mosaic-Plot-Tutorial-in-R/cdf4a2f47ce10d71e8821be97eeec1d3375d8b9b/Plot%201.jpeg) + +In this figure, we can find that __each level in the `Income.Level` variable will be filled by a color__. One of the advantages of a mosaic plot is that it allows for the comparison between two categorical variables in both horizontal and vertical orientations, using a percentage scale. + +For example, the majority of patients lived in neighborhoods that had an income level greater than \$63,333 and sought treatment in an integrated network cancer program, followed by academic/research programs and community cancer programs. Patients seeking cancer treatment in academic/research programs mostly lived in neighborhoods that had an income level less than \$40,227, followed by neighborhoods with income levels greater than \$63,333, \$40,227 - \$50,353, \$50,353 - \$63,332, and unknown income levels. + +After making a mosaic plot for two categorical variables, in the next section, we will __explore the visualization of more than two categorical variables in a single mosaic plot__. This will allow us to gain insights and understand the relationships among multiple categorical variables simultaneously. + +## (2). Income level, insurance type, and survival status +To incorporate the categorical variable for survival status, we can utilize the `facet_grid()` function to create a grid for multiple levels in the `Status` variable. The basic settings for this plot are similar to the previous examples. However, in this case, we will introduce an additional function, `facet_grid( ~ Status)`, to facilitate the creation of the grid. Additionally, we can modify the space between two boxes by setting `offset = 0.02`. + +```{r} +# Factor for Status variable +Dt$Status <- factor(Dt$Status, levels = c('Death', 'Alive')) + +# Version 2 +p2 <- +ggplot(data = Dt) + + geom_mosaic(aes(x = product(Income.Level, # y-axis: Income level + Facility.Type), # x-axis: Facility type + fill = Income.Level), # Color of "box" specified by income level + offset = 0.02) + # More space between boxes + facet_grid( ~ Status) + # Facet by survival status + xlab('Facility Type') + + guides(fill = guide_legend(title = "Neighborhood \n Income Level")) + # Modify title of legend + scale_fill_tableau() + # Use better color palette + theme_mosaic() + # Specific theme for mosaic plot + theme(axis.ticks.x = element_blank(), # Remove ticks in x and y axes + axis.ticks.y = element_blank(), + axis.title.x = element_text(color = 'black', face = 'bold'), + axis.title.y = element_blank(), + axis.text.x = element_text(color = 'black', angle = 45, hjust = 1), + axis.text.y = element_blank(), + legend.title = element_text(color = 'black', face = 'bold'), + strip.text = element_text(size = 10, color = 'black', face = 'bold'), + strip.background = element_blank()) +p2 +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Mosaic-Plot-Tutorial-in-R/cdf4a2f47ce10d71e8821be97eeec1d3375d8b9b/Plot%202.jpeg) + +After stratifying the mosaic plot by the survival status, we observed that there wasn't an obvious difference in the distribution of neighborhood income level and facility type based on survival status. Consequently, we can combine the facet version with the mosaic plot, considering all survival statuses together. This will allow us to visualize the relationships between multiple categorical variables in a comprehensive manner. + +![](https://raw.githubusercontent.com/YzwIsALaity/Mosaic-Plot-Tutorial-in-R/cdf4a2f47ce10d71e8821be97eeec1d3375d8b9b/Plot%203.jpeg) + +Here we go! + +In addition to using `facet_grid()` to stratify a mosaic plot, there is an alternative embedded method using the `conds` argument within `aes()` in `geom_mosaic()`. __However, this approach may result in unattractive labels on the x-axis_. To address this, we can manually modify the labels using the `annotate()` function. This requires specifying the numerical location in (x, y) coordinates to annotate the text and replace the initial labels on the x-axis__. Therefore, we will introduce two implementations that we need to use here: + +- `conds`: this argument, `conds`, is used within `aes()` and requires passing a variable through `product()`. For instance, if we want to stratify the plot by `Status`, we need to set `conds = product(Status)`. This allows us to specify the variable for stratification and create separate panels for each level of that variable within the mosaic plot + +- `annotate()`: this is a function used to add `geoms` (geometric objects) to a plot, and it requires specifying certain arguments. The specific arguments depend on the type of `geom` being added: + + + `geom`: the name of `geom` to use for annotation; + + + `x`, `y`: numerical location in (x, y) coordinates; + + + `label`: the label of the annotation; + + + `fontface`: the font face of label. + +In our case, we need to use `annotate()` to manually create the label and text for the x-axis. The following code snippet displays the final figure. + +```{r} +# Version 3 +p3 <- +ggplot(data = Dt) + + geom_mosaic(aes(x = product(Income.Level, # y-axis: Income level + Facility.Type), # x-axis: Facility type + fill = Income.Level, + conds = product(Status)), # Color of "box" specified by income level + offset = 0.025) + # More space between boxes + annotate(geom = "text", x = 0.25, y = -0.1, label = "Death") + # Change text for x axis + annotate(geom = "text", x = 0.75, y = -0.1, label = "Alive") + + annotate(geom = "text", x = 0.50, y = -0.2, label = "Survival Status", fontface = "bold") + # x-axis label + ylab('Facility Type') + + guides(fill = guide_legend(title = "Neighborhood \n Income Level")) + # Modify title of legend + scale_fill_tableau() + # Use better color palette + theme_mosaic() + # Specific theme for mosaic plot + theme(axis.ticks.x = element_blank(), # Remove ticks in x and y axes + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_text(color = 'black', face = 'bold', vjust = 5), + axis.text.x = element_blank(), + axis.text.y = element_text(color = 'black'), + legend.title = element_text(color = 'black', face = 'bold')) +p3 +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Mosaic-Plot-Tutorial-in-R/cdf4a2f47ce10d71e8821be97eeec1d3375d8b9b/Plot%204.jpeg) + +Here we go! diff --git a/_posts/2023-09-01-Contour-Plots.md b/_posts/2023-09-01-Contour-Plots.md new file mode 100644 index 00000000000..2e79d5e67c2 --- /dev/null +++ b/_posts/2023-09-01-Contour-Plots.md @@ -0,0 +1,192 @@ +--- +title: "Contour Plot in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + + +![Cover](https://raw.githubusercontent.com/YzwIsALaity/Contour-Plot-Tutorial-in-R/adb30487e4d7c721b504c8440e1fa46ad59242c0/Cover.jpeg) + + + +This tutorial guides you through the process of creating __contour plots__ using `ggplot2`. A contour plot is a type of visualization used to represent a 3-dimensional surface by projecting constant `z` values onto a 2-dimensional space. It provides a straightforward way to visualize complex 3-dimensional datasets in a simplified 2-dimensional format. To illustrate the usage of contour plots, we will use the `volcano` dataset from the `rgl` package. + +# 1. Contour plot: line +To illustrate the usage of contour plots, we will use the `volcano` dataset from the `rgl` package. The `volcano` dataset can be thought of as representing the __longitudinal, latitudinal, and height data of a volcano__. The longitudinal and latitudinal coordinates are presented by the `x` and `y` axes, respectively, while the height is represented by the `z` values in the matrix. This dataset consists of 87 rows and 61 columns, describing 5307 observations of the volcano's topography. After data preprocessing, we will work with a subset of the dataset, including 3 columns for `x`, `y`, and `z` dimensions. + +![](https://raw.githubusercontent.com/YzwIsALaity/Contour-Plot-Tutorial-in-R/adb30487e4d7c721b504c8440e1fa46ad59242c0/Dataset.png) + +The data structure is simple, and we will first explore the line version of contour plots. In `ggplot2`, we primarily rely on two different functions: `geom_contour()` for contour plots. + + - `geom_contour()`: This function is used for creating raw contour plots, providing a straightforward way to visualize the data + + + `bins = `: It is used to control the number of bins. + + + `binwidth = `: This argument controls the width of the bins used for calculating contours. Smaller `binwidth` values result in more detailed contours, while larger values produce smoother contours. will be overridden by `bins`. Therefore, users only need to specify one of them. + +In the basic version of a contour plot, we have the ability to manage the number of bins to effectively visualize distinct contours within the same dataset. Each figure in this representation incorporates contour lines that faithfully reflect the underlying 3-dimensional data structure. + +```{r} +# Version 1 +## bins = 15 +p1 <- +ggplot(Dt, aes(x, y, z = z)) + + geom_contour(bins = 15) + # Create contour plot with 15 bins + ggtitle('bins = 15') + + scale_x_continuous(expand = c(0, 0)) + + scale_y_continuous(expand = c(0, 0)) + + theme_bw() + # dark-on-light theme + theme(panel.background = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black"), # there two are for texts in axes + axis.text.y = element_text(colour = "black"), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", face = 'bold'), + plot.title = element_text(face = 'bold'), + legend.title = element_text(colour = "black", face = 'bold'), + legend.text = element_text(colour = "black")) + +## bins = 30 +p2 <- +ggplot(Dt, aes(x, y, z = z)) + + geom_contour(bins = 30) + # Create contour plot with 30 bins + ggtitle('bins = 30') + + scale_x_continuous(expand = c(0, 0)) + + scale_y_continuous(expand = c(0, 0)) + + theme_bw() + # dark-on-light theme + theme(panel.background = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black"), # there two are for texts in axes + axis.text.y = element_blank(), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", face = 'bold', vjust = -1), + axis.title.y = element_blank(), + plot.title = element_text(face = 'bold'), + legend.title = element_text(colour = "black", face = 'bold'), + legend.text = element_text(colour = "black")) + +# Arrange 2 figures +Layout.Mat <- matrix(c(rep(1, 9), rep(2, 8)), nrow = 1) +grid.arrange(p1, p2, layout_matrix = Layout.Mat) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Contour-Plot-Tutorial-in-R/adb30487e4d7c721b504c8440e1fa46ad59242c0/Contour%20(Version%201).jpeg) + +We observe that increasing the number of `bins` results in a denser figure. Although the transformation from 3-dimensional data to a 2-dimensional representation provides a general shape of the data, it doesn't fully capture the original range of continuous variable `z`. Therefore, in the upcoming version, our aim is to visualize the magnitude of the continuous `z` using a contour plot. In the default configuration of `geom_contour()`, two intermediate values are calculated: `level_mid` (a numerical value corresponding to the midpoint of the contour levels) and `level` (an ordered factor representing the contour range). Hence, it becomes necessary to __utilize the `after_stat()` function__ in `ggplot2`. This function enables us to __extract and utilize variables that have been calculated by the `stat`__. + +```{r} +# Version 2 +p3 <- +ggplot(Dt, aes(x, y, z = z)) + + geom_contour(aes(colour = after_stat(level)), bins = 30) + # Create contour plot with 30 bins and set color of level + scale_color_continuous(type = 'viridis') + + ggtitle('bins = 30') + + scale_x_continuous(expand = c(0, 0)) + + scale_y_continuous(expand = c(0, 0)) + + theme_bw() + # dark-on-light theme + theme(panel.background = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black"), # there two are for texts in axes + axis.text.y = element_text(colour = "black"), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", face = 'bold'), + plot.title = element_text(face = 'bold'), + legend.title = element_text(colour = "black", face = 'bold'), + legend.text = element_text(colour = "black")) + +# Modify legend title +p3$labels$colour <- 'Level of z' +p3 +``` + +![Contour (Version 2)](https://raw.githubusercontent.com/YzwIsALaity/Contour-Plot-Tutorial-in-R/adb30487e4d7c721b504c8440e1fa46ad59242c0/Contour%20(Version%202).jpeg) + +In this version, the contour line values are color-coded using a continuous color scale. Bright colors signify higher magnitudes of `z`, while darker colors represent lower magnitudes of `z`. Instead of visualizing contours solely with lines, we can also represent them using color-filled areas or tiles which we will explore in the next section. + +# 2. Contour plot: tile +To create a contour plot with color-filled areas or tiles, we mainly utilize two functions: `geom_contour_filled()` or `geom_raster()`. In this section, we'll introduce the usage of `geom_contour_filled()`, which closely resembles `geom_contour()`. However, unlike controlling the number of `bins` with `geom_contour_filled()`, we modify the tile version of the contour plot by adjusting the number of `breaks` within the range of `z` values. The __`breaks` argument, when supplied to `geom_contour_filled()`, serves the purpose of defining a numerical vector of breaks to segment the `z` range into distinct sub-intervals, each represented by a unique color__. To enhance the creation of a divergent color palette, we will employ a function, namely `brewer.pal()`, from the `RColorBrewer` package. The `brewer.pal()` function take two arguments: + + - `n = `: it is used to set up the number of different colors in the palette + + - `name = `: it is used to choose a palette name + +Once we've chosen a color palette, we can apply it by passing it to `scale_fill_manual()` in order to manually set the color palette for the color-filled tiles within the contour plot. The generated color palette is assigned to the `values` argument, and we use `drop = F` to ensure that unused factor levels are not dropped by the function. Additionally, we utilize the `guide_colorsteps()` function within the `guides()` function to ensure the color legend is displayed vertically (`guide_colorsteps(direction = "vertical")`). In general, `guides()` function is used to modify different scales (i.e., color, fill, legend, etc.) in `ggplot2`. + +```{r} +# Create selected palette +require(RColorBrewer) +Breaks <- c(-Inf, seq(150, 400, length.out = 9), Inf) # set up the number of breaks +Spectral.colors <- brewer.pal(n = length(Breaks), name = "Spectral") # generate color palette + +# Version 3 +p4 <- +ggplot(Dt, aes(x, y, z = z)) + + geom_contour_filled(breaks = Breaks) + # pass Breaks (approximately bins = 11) + scale_fill_manual(values = rev(Spectral.colors), # set up color palette manually and using decrasing order for values + drop = F) + # avoid dropping unused factors + guides(fill = guide_colorsteps(direction = "vertical")) + # vertically display legend + scale_x_continuous(expand = c(0, 0)) + + scale_y_continuous(expand = c(0, 0)) + + theme_bw() + # dark-on-light theme + theme(panel.background = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black"), # there two are for texts in axes + axis.text.y = element_text(colour = "black"), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", face = 'bold'), + plot.title = element_text(face = 'bold'), + legend.title = element_text(colour = "black", face = 'bold'), + legend.text = element_text(colour = "black")) + +# Modify legend title +p4$labels$fill <- 'Level of z' +p4 +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Contour-Plot-Tutorial-in-R/adb30487e4d7c721b504c8440e1fa46ad59242c0/Contour%20(Version%203).jpeg) + +In this version of the contour plot, each area represented by contour lines from `geom_contour()` is now filled with different colors from a continuous color scale. When compared to the line version of the contour plot, the tile version is likely to be more intuitive. Another alternative to the `geom_contour_filled()` function is to combine the use of `geom_raster()` and `geom_contour()`. The `geom_raster()` function is particularly useful when all the tiles are of the same size, offering high performance. In this context, each tile in a contour plot corresponds to a rectangle or square, and colors are applied to fill each tile to represent the mapping of the 'z' values onto a 2-dimensional grid. + +```{r} +# Version 4 +p5 <- +ggplot(Dt, aes(x, y, z = z)) + + geom_contour(bins = 11) + # create contour plot with 30 bins and set color of level + geom_raster(aes(fill = z)) + # fill grid + scale_fill_continuous(type = 'viridis') + + ggtitle('bins = 11') + + scale_x_continuous(expand = c(0, 0)) + + scale_y_continuous(expand = c(0, 0)) + + theme_bw() + # dark-on-light theme + theme(panel.background = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_line(), + axis.text.x = element_text(colour = "black"), # there two are for texts in axes + axis.text.y = element_text(colour = "black"), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_line(), + axis.title.x = element_text(colour = "black", face = 'bold', vjust = -1), + axis.title.y = element_text(colour = "black", face = 'bold'), + plot.title = element_text(face = 'bold'), + legend.title = element_text(colour = "black", face = 'bold'), + legend.text = element_text(colour = "black")) +# Modify legend title +p5$labels$fill <- 'Level of z' +p5 +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Contour-Plot-Tutorial-in-R/adb30487e4d7c721b504c8440e1fa46ad59242c0/Contour%20(Version%204).jpeg) + +This version of the contour plot conveys the same level of information as the one mentioned earlier. The key distinction in this version lies in the discretization of the continuous x and y axes into a grid of uniform size. Each grid cell is then filled with a color corresponding to the value of `z`. diff --git a/_posts/2023-10-16-Ridgeline-Plots.md b/_posts/2023-10-16-Ridgeline-Plots.md new file mode 100644 index 00000000000..78f39fd4f77 --- /dev/null +++ b/_posts/2023-10-16-Ridgeline-Plots.md @@ -0,0 +1,203 @@ +--- +title: "Ridgeline Plot in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + + +![Cover](https://raw.githubusercontent.com/YzwIsALaity/Ridgeline-Plot-Tutorial-in-R/2a324d0c39139ff786a3f0ddf6960053578386cc/Fake%20Unknown%20Pleasure.jpeg) + + +In this tutorial, we'll introduce the use of __ridgeline plots__ in R using the __`ggplot2` and `ggridges` packages__. These plots display a sequence of histogram or density plots stacked vertically, making it a useful tool for __visualizing changes in the distribution of a numerical variable across different time points or categories__. + +## 1. Dataset +We will use a simulated dataset for this visualization. Suppose there is a 3-year cohort consisting of 200 participants who initially contracted severe Covid-19 at enrollment. We collected cytokine data at 0, 30, 90, 180, 360, 540, and 720 days (7 time points). Additionally, we hypothesized that the __TNF-alpha cytokine__ levels (pg/mL) would increase during the initial 90 days and then gradually decrease as the acute symptoms subsided. The ridgeline plot is an excellent tool for __visualizing longitudinal changes in TNF-alpha__ data. The dataset is very simple and includes 4 variables only: + +- `PTID`: this is an __unique identification__ for each participants __[200 participants]__ (string); + +- `Time`: this variable represents the __time point of sample collection__ (numerical); + +- `TNF_alpha`: this variable represents the __value of TNF-alpha cytokine__ in pg/mL (numerical); + +- `Numerical_PTID`: This is equivalent to `PTID`, but it is represented numerically (numerical). + +![](https://raw.githubusercontent.com/YzwIsALaity/Ridgeline-Plot-Tutorial-in-R/2a324d0c39139ff786a3f0ddf6960053578386cc/Dataset.png) + +At each time point, we collected 200 observations of TNF-alpha cytokine values in pg/mL. Since we hypothesize that this cytokine is linked to pro-inflammatory processes, monitoring its values can serve as a key indicator of inflammation. We aim to investigate whether inflammatory symptoms gradually decrease after the acute period. + +## 2. Ridgeline plot +The ridgeline plot is generated using the `ggridges` extension package for `ggplot2.` We primarily employ three functions: `geom_ridgeline()`, `geom_density_ridges()`, and `geom_density_ridges_gradient()`, to create various types of ridgeline plots. In the basic version, we will cover the usage of `geom_ridgeline()` and `geom_density_ridges()`, while the comprehensive version will include a detailed walkthrough of `geom_density_ridges_gradient()`. + +### (1). Basic version +In the basic version, we will begin by introducing two functions: `geom_ridgeline()` and `geom_density_ridges()`. These functions are used for visualizing the raw ridgeline of a numerical variable and the density of a numerical variable, respectively. The `geom_ridgeline()` function is utilized to create lines with filled areas beneath them, and it necessitates certain arguments: + +- `height`: it is used to __define the height of the ridgeline__, measured from the corresponding `y` value, where `y` is regarded as a grouping or categorical variable for stratification. It needs to be passed into `aes()` in `geom_ridgeline()`. + +- `scale`: it acts as a scaling factor to __adjust the height of the ridgelines__. A __higher value indicates a larger scaling of height__ relative to the height of each group. + +- `rel_min_height`: it is used to __set a percentage cutoff in relation to the highest point of any density curve__. A __smaller value signifies a more extended tail on both sides of the density curve__. + +__Quantiles__ can be highlighted using the following arguments: + +- `quantile_lines`: it is a binary logical argument, which can be set to either `TRUE` or `FALSE.` By default, it is set to `FALSE`, and setting it to `TRUE` will result in the quantile line being displayed in the area under the curve/line. + +- `quantiles`: it is used to define a vector of quantiles (i.e., 0.25, 0.5, etc.) + +__Jittered points__ (added a small amount of random variation to the location of each point) can be showed using the following arguments: + +- `jittered_points`: it is a binary logical argument, which can be set to either `TRUE` or `FALSE.` By default, it is set to `FALSE`, and setting it to `TRUE` will result in visualizing jittered points. + +- `position`: it is a string argument used to set up position of jittered points in a ridgeline plot. + + + `position = "points_jitter"`: jittered points are randomly shifted up and down and/or left and right. + + + `position = "points_sina"`: jittered points are randomly distributed to fill the entire shaded area representing the data density. + + + `position = "raincloud"`: jittered points lie all underneath the baseline of each individual ridgeline. + +In addition to the arguments within `geom_ridgeline()` and `geom_density_ridges()`, the package also offers a function for customizing the theme using `theme_ridges()`. By default, it does not center labels on the x and y axes, so you can use the `center_axis_labels = TRUE` argument to center the axis labels. + +```{r} +require(ggplot2) +require(ggthemes) +require(ggridges) + +# Basic ridgeline +p1 <- +ggplot(Dt, aes(x = Numerical_PTID, + y = Time)) + + geom_ridgeline(aes(height = TNF_alpha), + scale = 0.001, + fill = "lightblue") + + ggtitle('Basic ridgeline') + + scale_x_continuous(expand = c(0, 0)) + + scale_y_discrete(expand = c(0, 0)) + + theme_ridges(center_axis_labels = TRUE) + +# Ridgeline for density +p2 <- +ggplot(Dt, aes(x = TNF_alpha, y = Time)) + + geom_density_ridges(scale = 5, + rel_min_height = 0.001, + fill = "lightblue", + quantiles = c(0.25, 0.5, 0.75), + quantile_lines = TRUE) + + ggtitle('Density ridgeline') + + scale_x_continuous(expand = c(0, 0)) + + scale_y_discrete(expand = c(0, 0)) + + theme_ridges(center_axis_labels = TRUE) + +# Ridgeline for density with jittered points +p3 <- +ggplot(Dt, aes(x = TNF_alpha, y = Time)) + + geom_density_ridges(scale = 5, + rel_min_height = 0.0001, + fill="lightblue", + jittered_points = TRUE, + position = "raincloud") + + ggtitle('Density ridgeline with jittered points') + + scale_x_continuous(expand = c(0, 0)) + + scale_y_discrete(expand = c(0, 0)) + + theme_ridges(center_axis_labels = TRUE) + +# Ridgeline in binline version +p4 <- +ggplot(Dt, aes(x = TNF_alpha, y = Time)) + + geom_density_ridges(stat = "binline", + scale = 3, + rel_min_height = 0.0001, + fill="lightblue") + + ggtitle('Ridgeline in bins') + + scale_x_continuous(expand = c(0, 0)) + + scale_y_discrete(expand = c(0, 0)) + + theme_ridges(center_axis_labels = TRUE) + +grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Ridgeline-Plot-Tutorial-in-R/2a324d0c39139ff786a3f0ddf6960053578386cc/Ridgeline%20(Basic).jpeg) + +In the __Basic ridgeline__ plot (top left), `geom_ridgeline()` simply maps TNF-alpha values at each time point according to numerical patient IDs. In contrast, the __Density ridgeline__ plot (top right) and the __Density ridgeline with jitter points__ plot (bottom left) offer more informative visuals, displaying the density of TNF-alpha value distributions grouped by time points. If users prefer a binned version (histogram), they can opt for the __Ridgeline in bins__ plot (bottom right). + +### (2). Comprehensive version +For a more enhanced visualization of the TNF-alpha value distribution density, you can employ the `geom_density_ridges_gradient()` function, which __displays gradient colors under the density curve__ for each time point. To achieve this, you need to utilize the `after_stat(x)` function within the `aes()` function of the main `ggplot()` function. This allows you to map the transformed TNF-alpha values from `x` to the `fill` argument. + +```{r} +# Texts for y axis +yText <- c('At enrollment', paste0(c(30, 90, 180, 360, 540, 720), '-day')) + +p5 <- +ggplot(data = Dt, aes(x = TNF_alpha, # x-axis: value of TNF-alpha + y = Time, # y-axis: time point + fill = after_stat(x))) + + geom_density_ridges_gradient(scale = 4) + + scale_fill_gradient2_tableau() + + scale_y_discrete(expand = c(0, 0), + breaks = c(0, 30, 90, 180, 360, 540, 720), + label = yText) + + scale_x_continuous(expand = c(0, 0)) + + labs(x = expression("Mean TNF"*alpha ~ '(pg/mL)'), # x-axis label + fill = expression("TNF"*alpha ~ '(pg/mL)'), # legend title + title = expression('Density of' ~ "TNF"*alpha ~ '(pg/mL) at Different Time Points')) + + theme_bw() + + theme(panel.border = element_blank(), + panel.grid.major.x = element_blank(), # Remove vertical gridlines + panel.grid.minor.x = element_blank(), + axis.line.x = element_line(), # these two are for the axis line + axis.line.y = element_blank(), + axis.text.x = element_text(colour = "black"), # there two are for texts in axes + axis.text.y = element_text(colour = "black"), + axis.ticks.x = element_line(), # these two are for ticks in axes + axis.ticks.y = element_blank(), + axis.title.x = element_text(colour = "black", vjust = -1), + axis.title.y = element_text(colour = "black"), + legend.title = element_text(colour = "black"), + legend.text = element_text(colour = "black"), + plot.title = element_text(hjust = 0.5)) +p5 +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Ridgeline-Plot-Tutorial-in-R/2a324d0c39139ff786a3f0ddf6960053578386cc/Ridgeline%20(Comprehensive).jpeg) + +In this figure, we have eliminated all extraneous elements and focused solely on highlighting the density curve for TNF-alpha values. The intensity of TNF-alpha is represented by a gradient color scale. + +__The inspiration for the ridgeline plot design comes from the cover of the debut studio album by the rock band Joy Division, and we can also experiment with mimicking that cover.__ + +```{r} +nObs <- 1000 +nTime <- 100 + +Mean <- 100 +SD <- 1 + +Dt_Plot <- data.frame() +for(i in 1:nTime){ + NoiseForMean <- rnorm(1, mean = 5, sd = 2) + NoiseForSD <- rnorm(1, mean = 2, sd = 0.5) + MeanForGenerate <- Mean + NoiseForMean + SDForGenerate <- SD + NoiseForSD + Value <- rnorm(n = nObs, mean = MeanForGenerate, sd = SDForGenerate) + Trans <- data.frame('Row' = paste0('X', i), 'Value' = Value) + Dt_Plot <- rbind(Dt_Plot, Trans) +} + +p <- +ggplot(data = Dt_Plot, aes(x = Value, y = Row)) + + geom_density_ridges(scale = 10, fill = 'white') + + theme_bw() + + theme(# panel.background = element_rect(fill = 'black'), + panel.border = element_blank(), + panel.grid = element_blank(), + axis.text.x = element_blank(), + axis.text.y = element_blank(), + axis.ticks.x = element_blank(), + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_blank(), + legend.title = element_text(color = 'black', face = 'bold'), + plot.title = element_text(hjust = 0.5, face = 'bold')) +p +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Ridgeline-Plot-Tutorial-in-R/2a324d0c39139ff786a3f0ddf6960053578386cc/Fake%20Unknown%20Pleasure.jpeg) diff --git a/_posts/2023-11-30-Gene-Plot.md b/_posts/2023-11-30-Gene-Plot.md new file mode 100644 index 00000000000..00240062fa1 --- /dev/null +++ b/_posts/2023-11-30-Gene-Plot.md @@ -0,0 +1,181 @@ +--- +title: "Gene Map Plot in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + + +![Cover](https://raw.githubusercontent.com/YzwIsALaity/Gene-Map-Tutorial-in-R/0e7db2a2436dadda942687893249bd16f51e9a79/Figure%202.jpeg) + + +This tutorial introduces the visualization of __gene maps with arrows in R using the `ggplot2` and `gggenes` packages__. It provides a linear representation of gene strain segments in genomes, commonly employed in scientific publications for gene description. We'll generate a synthetic dataset to simulate a gene map resembling Figure 1E in a recent paper detailing a [mouse model for triple-negative breast cancer](https://www.nature.com/articles/s41467-023-40841-6). This synthetic dataset comprises three distinct genomes, each containing different gene segments. With a straightforward structure, the dataset consists of four columns: + + - `Molecule`: it is a categorical variable denoting the __names of the genomes [3 unique genomes]__ (string); + + - `Gene`: it is a categorical variable denoting the __names of the gene segments [10 unique genes]__ (string); + + - `Start` and `End`: they are used to denote the __starting and ending location of a gene segment__ within a genome (numerical). + +![](https://raw.githubusercontent.com/YzwIsALaity/Gene-Map-Tutorial-in-R/0e7db2a2436dadda942687893249bd16f51e9a79/Dataset%201.png) + +We will initially demonstrate how to visualize a single genome and then extend the discussion to multiple genomes. Two core functions are used to generate a gene map: + + - `geom_gene_arrow()`: coming from the gggenes package, it is utilized to __draw genes as arrows__, offering flexibility with various optional arguments: + + + `arrowhead_width` and `arrowhead_height`: these optional arguments can be leveraged to control the __width/height of the arrowhead__. Users often use the `unit()` function from the `ggplot2` package to specify the size in most cases. + + - `theme_genes()`: it is utilized to specify the theme for gene map. + + - `geom_gene_label()`: it is utilized to label genes with texts and __must be called after `geom_gene_arrow()`__. The argument `align` can be used to specify the location of texts ("left", "centre", "right"). + +In addition to these functions, it's essential to define the `xmin` (start location), `xmax` (end location), `y` (genome), and `fill` (gene segments) arguments within the `aes()` function. By specifying the `fill` argument, each gene within a genome will be categorized with a distinct color and displayed in the legend. To label each gene in the figure and remove the legend, users should include an additional argument, `label`, in the `aes()` function within the `ggplot()` function. + +```{r} +SingleGenome <- Dt[which(Dt$Molecule == 'Rosa LSL-Myc'), ] +# Version 1: default +p1 <- +ggplot(SingleGenome, + aes(xmin = Start, xmax = End, # Specify start/end location for a gene + y = Molecule, # Specify genome + fill = Gene)) + # Specify color for genes + geom_gene_arrow() + # Draw gene map as arrows + scale_fill_brewer(palette = "Set3") + # Specify color palette + theme_genes() + # Specify theme for gene map + theme(axis.text.x = element_blank(), # Remove x axis + axis.ticks.x = element_blank(), + axis.title.x = element_blank(), + axis.line.x = element_blank(), + axis.title.y = element_blank(), # Remove title in y axis + axis.text.y = element_text(face = 'bold', color = 'black'), + legend.title = element_text(face = 'bold', color = 'black')) +p1 + +# Version 2: modify the shape of arrowhead +p2 <- +ggplot(SingleGenome, + aes(xmin = Start, xmax = End, # Specify start/end location for a gene + y = Molecule, # Specify genome + fill = Gene)) + # Specify color for genes + geom_gene_arrow(arrowhead_height = unit(3, "mm"), # Modify height and width + arrowhead_width = unit(1, "mm")) + + scale_fill_brewer(palette = "Set3") + # Specify color palette + theme_genes() + # Specify theme for gene map + theme(axis.text.x = element_blank(), # Remove x axis + axis.ticks.x = element_blank(), + axis.title.x = element_blank(), + axis.line.x = element_blank(), + axis.title.y = element_blank(), # Remove title in y axis + axis.text.y = element_text(face = 'bold', color = 'black'), + legend.title = element_text(face = 'bold', color = 'black')) +p2 + +# Version 3: add label to genes and remove legend +p3 <- +ggplot(SingleGenome, + aes(xmin = Start, xmax = End, # Specify start/end location for a gene + y = Molecule, # Specify genome + fill = Gene, # Specify color for genes + label = Gene)) + + geom_gene_arrow(arrowhead_height = unit(3, "mm"), # Modify height and width + arrowhead_width = unit(1, "mm")) + + geom_gene_label() + # Specify labels + scale_fill_brewer(palette = "Set3") + # Specify color palette + theme_genes() + # Specify theme for gene map + theme(axis.text.x = element_blank(), # Remove x axis + axis.ticks.x = element_blank(), + axis.title.x = element_blank(), + axis.line.x = element_blank(), + axis.title.y = element_blank(), # Remove title in y axis + axis.text.y = element_text(face = 'bold', color = 'black'), + legend.position = 'none') +p3 + +# Show figure +grid.arrange(p1, p2, p3, ncol = 1) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Gene-Map-Tutorial-in-R/0e7db2a2436dadda942687893249bd16f51e9a79/Figure%201.jpeg) + +The initial plot represents the default version, the second plot reflects a slight modification to the arrowhead, and in the last one, we remove the legend while adding text labels to the genes. After visualizing a single genome, the next step involves creating a collective representation of multiple genomes in a single figure. Here, the `facet_wrap()` function from the `ggplot2` package becomes essential. Given that genes in genomes occupy distinct locations, __it's crucial to include the argument `scales = "free"` in the function__. This enables stacking various gene maps together using `ncol = 1`. + +```{r} +p4 <- +ggplot(Dt, + aes(xmin = Start, xmax = End, # Specify start/end location for a gene + y = Molecule, # Specify genome + fill = Gene, # Specify color for genes + label = Gene)) + + geom_gene_arrow(arrowhead_height = unit(3, "mm"), # Modify height and width + arrowhead_width = unit(1, "mm")) + + geom_gene_label() + # Specify labels + facet_wrap(~ Molecule, scales = "free", ncol = 1) + + scale_fill_brewer(palette = "Set3") + # Specify color palette + theme_genes() + # Specify theme for gene map + theme(axis.text.x = element_blank(), # Remove x axis + axis.ticks.x = element_blank(), + axis.title.x = element_blank(), + axis.line.x = element_blank(), + axis.title.y = element_blank(), # Remove title in y axis + axis.text.y = element_text(face = 'bold', color = 'black'), + legend.position = 'none') +p4 +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Gene-Map-Tutorial-in-R/0e7db2a2436dadda942687893249bd16f51e9a79/Figure%202.jpeg) + +In this figure, each gene map for every genome is vertically stacked, with each gene designated a unique color. In this synthetic dataset for genomes, the genes are not shared across genomes. However, at times, the same genes are shared across different genomes, but located in different positions. Hence, we'll introduce the `make_alignment_dummies()` function from the `gggenes` package to assist in aligning genes across genomes. We will use the sample dataset provided by the `gggenes` package to show how we can align genes across genomes. The `make_alignment_dummies()` function takes several arguments: + +- `data`: it is used to specify the dataset for genes + +- `mapping`: it is the same as any `geom` function and used to specify `aes()`. For example, `aes(xmin = start, xmax = end, y = molecule, id = gene)` (it is required to specify an additional argument for `id`). + +- `on`: it is used to __specify the name of gene to be visually aligned across facets__. + +- `side`: it is used to specify the visual alignment be of the `"left"` (default) or `"right"`. + +After passing the gene dataset into the `make_alignment_dummies()` function, it's essential to store the result and subsequently input it into the `geom_blank()` function from the `ggplot2` package when creating the figure. Let's start by examining the sample dataset (`example_genes`) provided by the `gggenes` package, which comprises 6 columns: + +- `molecule`: the names of genomes + +- `gene`: the names of genes + +- `start` and `end`: the start/end positions of the gene + +- `strand`: the strand of genes + +- `orientation`: the orientation of the gene (1 for left, 0 for right) + +![](https://raw.githubusercontent.com/YzwIsALaity/Gene-Map-Tutorial-in-R/0e7db2a2436dadda942687893249bd16f51e9a79/Dataset%202.png) + +As the sample dataset includes gene orientations, indicating the arrowhead orientation for each gene, this information can be integrated into the figure using an additional argument, `aes(forward = orientation)`, in the `geom_gene_arrow()` function. Please note that the `forward` argument can only accept a vector with numerical values of 0 or 1. We can incorporate all this information to generate a comprehensive gene map plot: + +```{r} +dummies <- make_alignment_dummies(example_genes, + aes(xmin = start, xmax = end,y = molecule,id = gene), + on = "genE") + +p5 <- +ggplot(example_genes, + aes(xmin = start, xmax = end, # Specify start/end location for a gene + y = molecule, # Specify genome + fill = gene))+ # Specify color for genes + geom_gene_arrow(aes(forward = orientation), + arrowhead_height = unit(3, "mm"), # Modify height and width + arrowhead_width = unit(1, "mm")) + + geom_blank(data = dummies) + + facet_wrap(~ molecule, scales = "free", ncol = 1) + + scale_fill_brewer(palette = "Set3") + # Specify color palette + theme_genes() + # Specify theme for gene map + theme(axis.text.x = element_blank(), # Remove x axis + axis.ticks.x = element_blank(), + axis.title.x = element_blank(), + axis.line.x = element_blank(), + axis.title.y = element_blank(), # Remove title in y axis + axis.text.y = element_text(face = 'bold', color = 'black')) +p5 +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Gene-Map-Tutorial-in-R/0e7db2a2436dadda942687893249bd16f51e9a79/Figure%203.jpeg) + +In this gene map, we can find that the figure was aligned in the `geneE`. diff --git a/_posts/2024-01-15-ROC-Curve.md b/_posts/2024-01-15-ROC-Curve.md new file mode 100644 index 00000000000..fef94d9a243 --- /dev/null +++ b/_posts/2024-01-15-ROC-Curve.md @@ -0,0 +1,200 @@ +--- +title: "ROC Curve in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + +![Cover](https://raw.githubusercontent.com/YzwIsALaity/ROC-Curve-Tutorial-in-R/2bd2adb8eef665384facc3581ca748abd44a6045/ROC%20Plot%20(Version%203).jpeg) + + +In this tutorial, we will explore the application of the `ggplot2` and `plotROC` packages for __visualizing Receiver Operating Characteristic (ROC) curves in R__. ROC curves are commonly examined when assessing machine learning models for binary classification. They are closely associated with the __evaluation metric Area Under the Curve (AUC)__, which quantifies the area beneath a ROC curve in two-dimensional space. While typically used for machine learning models, ROC curves are also relevant for assessing medical tests, particularly those involving continuous biomarkers. In essence, __operating characteristics__ are characterized by a pair of values: __False Positive Fraction (FPF) and True Positive Fraction (TPF), denoted as (FPF, TPF)__. The ROC curve illustrates the complete range of operating characteristics. In general, a pair of operating characteristics can be obtained in a 2-by-2 table as below + +- __Confusion matrix__ (classification model): + +| | Actual label = 0 | Actual label = 1 | +| :---------------: | :-------------------: | :-----------------: | +| __Predicted label = 0__ | True negative (TN) | False negative (FN) | +| __Predicted label = 1__ | False positive (FP) | True positive (TP) | + +- __Contingency table__ (medical test performance): + +| | Disease status = 0 | Disease status = 1 | +| :---------------------: | :-------------------: | :-----------------: | +| __Medical test = 0__ | True negative (TN) | False negative (FN) | +| __Medical test = 1__ | False positive (FP) | True positive (TP) | + +Hence, we can derive three key summary statistics from the tables above: true positive fraction (sensitivity), true negative fraction (specificity), and false positive fraction (1 - specificity). Note that __FPF__ is also known as __type I error__, and __FNF__ is referred to as __type II error__. + +Assuming that all abbreviations in the above tables represent the number of observations in their respective cells, we can estimate them as follows: + +$$ \hat{TPF} = \frac{TP}{TP + FN}, \quad \hat{FPF} = \frac{FP}{FP + TN}, \quad \hat{TNF} = \frac{TN}{TN + FP}. $$ + +If observations/predictions are independent, __confidence intervals for the aforementioned summary statistics can be obtained using exact binomial or asymptotic inference__. Essentially, a ROC curve is depicted in a two-dimensional space defined by the pair $$ (FPF, TPF) $$, with __FPF on the x-axis and TPF on the y-axis__. To bridge the concept of the 2-by-2 tables with the ROC curve, let's assume predicted probabilities or values of a continuous medical test as $$ Y $$, the true label as $$ D $$, and a continuous threshold value $$ c $$. We can mathematically express TPF, FPF, and the ROC curve as follows: + +$$ TPF(c) = P(Y \geq c | D = 1), \quad FPF(c) = P(Y \geq c | D = 0), \quad ROC(*) = \{(FPF(c), TPF(c)), c \in (-\infty, \infty) \}. $$ + +It's important to note that a __ROC curve is a monotone increasing function__ that maps the interval $$ (0, 1) $$ onto $$ (0, 1) $$. In the case of an __uninformative classification__, where the predicted $$ Y $$ is unrelated to the true label $$ D $$, it is represented by $$ TPF(c) = FPF(c) $$ for any arbitrary threshold value $$ c $$. This scenario corresponds to __a line with a unit slope in the ROC curve figure__. In contrast, __a perfect ROC curve__ demonstrates a scenario where, for some threshold $$ c $$, we have $$ TPF(c) = 1 $$ and $$ FPF(c) = 0 $$. This corresponds to the __left and upper borders of the positive unit quadrant__. In a more rigorous formulation, we can view a ROC curve as mapping $$ t $$ to $$ TPF(c) $$, where $$ c $$ is the threshold corresponding to $$ t = FPF(c) $$. Denoting the __survivor function__ for $$ Y $$ in the actual label or disease status = 1 and in the actual label or disease status = 0 as + +$$ S_1(y) = P(Y \geq y | D = 1), \quad S_0(y) = P(Y \geq y | D = 0), $$ + +respectively, the ROC model is represented as follows: + +$$ ROC(t) = S_1(S_0^{-1}(t)), \quad t \in (0, 1). $$ + +The initial appearance of this equation might not be very intuitive, but it essentially aligns with the definition of the ROC curve. If we let $$ c = S_0^{-1}(t) $$, noting that $$ c $$ is the threshold corresponding to false positive fraction $$ t $$ then + +$$ t = FPF(c) = P(Y \geq c | D = 0). $$ + +The corresponding TPF is + +$$ TPF(c) = P(Y \geq c | D = 1) = S_1(c). $$ + +Therefore, we can have + +$$ ROC(t) = S_1(c) = S_1(S_0^{-1}(t)). $$ + +__You might wonder why we can take the inverse of__ $$ S_0(y) $$ __in this context__. (Hint: consider the nature of a survival function and its interpretation--__monotonically decreasing__). The content appears rather technical and theoretical, and more comprehensive explanations of the mathematical properties related to ROC curves can be found elsewhere. For now, let's skip this material lol. + +## 1. Dataset +Now let's start to talk about the visualization of ROC curve with `ggplot2` and `plotROC` (an extension of `ggplot2`) packages. We will use a dataset from a study investigated by [Wieand et al.](https://academic.oup.com/biomet/article-abstract/76/3/585/298253), demonstrating nonparametric statistics for comparing diagnostic markers. The dataset is straightforward, including three columns: + +- `CA_19.9`: it is a protein found in human blood known as cancer antigen 19-9, measured as a __continuous value in units of U/mL__ (numerical); + +- `CA_125`: it is a protein found in human blood known as cancer antigen 125, measured as a __continuous value in units of U/mL__ (numerical); + +- `Pancreatic_Cancer`: it serves as a __binary indicator__ for a patient, distinguishing between non-disease (`Pancreatic_Cancer` = 0) and disease groups (`Pancreatic_Cancer` = 1) (numerical). + +![](https://raw.githubusercontent.com/YzwIsALaity/ROC-Curve-Tutorial-in-R/2bd2adb8eef665384facc3581ca748abd44a6045/Dataset.png) + +Hence, we intend to illustrate the utilization of these two continuous biomarkers for visualizing ROC curves in the subsequent sections. + +## 2. ROC curve +In essence, we will employ the __`geom_roc()` function__, with the option to adjust certain display parameters: + +- `n.cuts`: a __numerical value__ should be provided to __control the number of cut points__ displayed along each curve (by default = 10). + +- `cutoffs.at`: a __vector of user supplied cutoffs__ to plot as points should be provided to __display specific cut points__. + +Additionally, the aesthetic mappings created by `aes()` differ from the initial ggplot, and it necessitates the provision of two arguments: + +- `d`: it is utilized to input an __actual binary label or the true disease status__. + +- `m`: it is employed to provide __continuous biomarkers or predicted probabilities__ from a well-trained model. + +Additional parameters like `col` or `fill` can still be passed into the `aes()` function. As mentioned earlier, __confidence intervals for both TPF and FPF__ can also be obtained and displayed along with a ROC curve at a given cutoff point. This requires using the function __`geom_rocci()`__, which can be configured with the following arguments: + +- `sig.level`: it is used to __specify the significance level__ for the confidence regions (by default = 0.05). + +- `ci.at`: it is used to specify a __vector of user supplied values__ in the range of the biomarker (by default = NULL) + +The final optional function for __adjusting plot style__ is `style_roc()`, designed to incorporate guides and annotations to a ROC curve. For example, users can customize x/y-axis labels using arguments such as `xlab` or `ylab` within this function, or alter the theme function via the `theme` argument. + +### (1). Continuous medical test performance +Now, let's proceed to visualize the initial basic version of a ROC curve for the CA 19-9 biomarker. + +```{r} +require(plotROC) +require(ggplot2) +# Version 1 +p_ROC1 <- + ggplot(Dt, aes(d = Pancreatic_Cancer, # true disease label + m = CA_19.9)) + # continuous biomarker + geom_roc(labelsize = 3) + # draw ROC curve + geom_abline(slope = 1, intercept = 0) + # add a unit slope line + style_roc(xlab = 'False Positive Fraction', # modify x/y-axis labels + ylab = 'True Positive Fraction') + + theme(axis.text.x = element_text(color = 'black', size = 11), + axis.text.y = element_text(color = 'black', size = 11), + axis.title.x = element_text(color = 'black', size = 11, face = 'bold'), + axis.title.y = element_text(color = 'black', size = 11, face = 'bold')) +p_ROC1 +``` +![](https://raw.githubusercontent.com/YzwIsALaity/ROC-Curve-Tutorial-in-R/2bd2adb8eef665384facc3581ca748abd44a6045/ROC%20Plot%20(Version%201).jpeg) + +In this basic version of the ROC curve, the `geom_roc()` function, by default, exhibits 10 cutoff points along with their associated values, displayed as points on the curve. Note that the value for each cutoff point is indeed the value of the continuous biomarker CA 19-9 in U/mL. Upon initial observation of the figure, __a reasonable cutoff level for CA 19-9 appears to be 21.8 U/mL, where FPF is only 25%, while the associated TPF is slightly greater than 75%. Based on the dataset, this suggests that if a patient without pancreatic cancer has a CA 19-9 level below this cutoff, there's a 25% chance of a false cancer diagnosis. On the other hand, for patients with pancreatic cancer, the test using this cutoff level can detect 75% of true cancer cases__. In the next version, we will illustrate the usage of a 95% confidence region for the selected cutoff point (CA 19-9 = 21.8 U/mL) and display only 5 cutoff points in the ROC curve. + +```{r} +# Version 2 +p_ROC2 <- + ggplot(Dt, aes(d = Pancreatic_Cancer, # true disease label + m = CA_19.9)) + # continuous biomarker + geom_roc(n.cuts = 5) + # draw ROC curve + geom_rocci(ci.at = 21.8) + # draw 95% CI for a selected point + geom_abline(slope = 1, intercept = 0) + # add a unit slope line + style_roc(xlab = 'False Positive Fraction', # modify x/y-axis labels + ylab = 'True Positive Fraction') + + theme(axis.text.x = element_text(color = 'black', size = 11), + axis.text.y = element_text(color = 'black', size = 11), + axis.title.x = element_text(color = 'black', size = 11, face = 'bold'), + axis.title.y = element_text(color = 'black', size = 11, face = 'bold')) +p_ROC2 +``` +![](https://raw.githubusercontent.com/YzwIsALaity/ROC-Curve-Tutorial-in-R/2bd2adb8eef665384facc3581ca748abd44a6045/ROC%20Plot%20(Version%202).jpeg) + +Note that __the confidence region is asymmetric, despite its rectangular shape__. This asymmetry arises because if $$ (FPF_L, TPF_L) $$ and $$ (FPF_U, TPF_U) $$ represent $$ 1-\alpha^* $$ univariate confidence intervals for FPF and TPF, where $$ \alpha^* = 1 - (1-\alpha)^{1/2} $$, then the rectangle $$ R = (FPF_L, TPF_L) \times (FPF_U, TPF_U) $$ forms a $$ 1 - \alpha $$ confidence interval for a pair of (FPF, TPF). (Hint: __statistically, FPF is independent of TPF if data is independent__.) In the next version, we will illustrate how to plot two ROC curves together to compare the performance of two different continuous biomarkers for the diagnosis or prognosis of diseases. We will use the `melt_roc()` function from the `plotROC` package to transform a wide dataset into a long version. By implementing this function, we can create a long dataset and subsequently plot two ROC curves for CA 19-9 and CA 125, each with a distinct color. + +```{r} +require(ggthemes) +# Obtain the long version of dataset and modify columns/labels +Dt_long <- melt_roc(Dt, d = 'Pancreatic_Cancer', c('CA_19.9', 'CA_125')) +colnames(Dt_long) <- c('Pancreatic Cancer', 'Level', 'Biomarker') +Dt_long$Biomarker <- ifelse(Dt_long$Biomarker == 'CA_19.9', 'CA 19-9', 'CA 125') + +# Version 3 +p_ROC3 <- + ggplot(Dt_long, aes(d = `Pancreatic Cancer`, # true disease label + m = Level, # continuous biomarker level + col = Biomarker)) + # color for different biomarkers + geom_roc(pointsize = 0, labels = F) + # draw ROC curve and remove labels/points + geom_rocci(ci.at = 21.8) + + geom_abline(slope = 1, intercept = 0) + # add a unit slope line + style_roc(xlab = 'False Positive Fraction', # modify x/y-axis labels + ylab = 'True Positive Fraction') + + scale_color_tableau() + + theme(axis.text.x = element_text(color = 'black', size = 11), + axis.text.y = element_text(color = 'black', size = 11), + axis.title.x = element_text(color = 'black', size = 11, face = 'bold'), + axis.title.y = element_text(color = 'black', size = 11, face = 'bold')) +p_ROC3 +``` +![](https://raw.githubusercontent.com/YzwIsALaity/ROC-Curve-Tutorial-in-R/2bd2adb8eef665384facc3581ca748abd44a6045/ROC%20Plot%20(Version%203).jpeg) + +In this figure, CA 125 and CA 19-9 are represented by blue and orange, respectively. When controlling FPF to be around 0.25, it is evident that TPF for CA 19-9 exceeds 0.75, while the TPF for CA 125 is approximately 0.50. This observation indicates that, based on this dataset, the performance of CA 19-9 is superior to that of CA 125. + +### (2). Classification model performance +In addition to visualizing continuous biomarkers, these functions can be employed to visualize the ROC curve for evaluating classification models. This necessitates the model to output __predicted probabilities__ from a classification model. Meanwhile, users may desire to evaluate the ROC curve with an AUC score. To achieve this, we will use the `roc()` function from the `pROC` package and implement a logistic regression for CA 19-9. + +```{r} +# Using pROC package to calculate AUC +require(pROC) +# Classification model: logistic regression +model <- glm(Pancreatic_Cancer ~ CA_19.9, family = binomial(), data = Dt) +# Predicted probability +predProb <- predict(model, newdata = Dt, type = 'response') +# Combined prob with real labels +Result <- data.frame('Predicted_Probability' = predProb, + 'Pancreatic_Cancer' = Dt$Pancreatic_Cancer) +# Calculate AUC +AUC <- round(auc(response = Result$Pancreatic_Cancer, + predictor = Result$Predicted_Probability), 2) + +# Version 4 +p_ROC4 <- + ggplot(Result, aes(d = Pancreatic_Cancer, # true disease label + m = Predicted_Probability)) + # continuous biomarker + geom_roc(n.cuts = 0) + # draw ROC curve + geom_abline(slope = 1, intercept = 0) + # add a unit slope line + annotate("text", x = .75, y = .25, # add AUC score + label = paste("AUC =", AUC)) + + style_roc(xlab = 'False Positive Fraction', # modify x/y-axis labels + ylab = 'True Positive Fraction') + + theme(axis.text.x = element_text(color = 'black', size = 11), + axis.text.y = element_text(color = 'black', size = 11), + axis.title.x = element_text(color = 'black', size = 11, face = 'bold'), + axis.title.y = element_text(color = 'black', size = 11, face = 'bold')) +p_ROC4 +``` +![](https://raw.githubusercontent.com/YzwIsALaity/ROC-Curve-Tutorial-in-R/2bd2adb8eef665384facc3581ca748abd44a6045/ROC%20Plot%20(Version%204).jpeg) + +The AUC score for the logistic regression model assessing the binary pancreatic cancer outcome based on the continuous biomarker CA 19-9 is 0.86. This value suggests that the performance of the logistic regression model is decent. diff --git a/_posts/2024-03-02-Baseline-Hazard-Plot.md b/_posts/2024-03-02-Baseline-Hazard-Plot.md new file mode 100644 index 00000000000..fd03b30e7b6 --- /dev/null +++ b/_posts/2024-03-02-Baseline-Hazard-Plot.md @@ -0,0 +1,208 @@ +--- +title: "Estimated Baseline Hazard Function in R with ggplot2" +mathjax: true +layout: post +categories: media +--- + +![Cover](https://raw.githubusercontent.com/YzwIsALaity/Baseline-Hazard-Plot-Tutorial-in-R/256df4a29230228e7a0effc11944d042fd84a528/Figure%202%20(Smooth).jpeg) + + +In this tutorial, we will delve into depicting the estimated __(cumulative) baseline hazard functions__ in R using the `ggplot2` package. As we observe, a Cox proportional hazard regression comprises two components: __a non-specific baseline hazard function of time and a linear predictor in exponential form__: + +$$ \lambda_{i}(t) = \lambda_{0}(t)e^{X_{i}(t) \beta}, $$ + +where $$ i $$ is the index for a unique patient, $$ t $$ represents a specific timepoint, $$ \lambda_{0}(t) $$ is the non-specific baseline hazard function over time, $$ X_{i}(t) $$ includes covariates for patient $$ i $$ at time $$ t $$, and $$ \beta $$ is a vector of coefficients. Typically, estimated coefficients can be obtained using the `coxph()` function in the `survival` package. However, procedures to obtain $$ \lambda_{0}(t) $$ are less commonly discussed. This is because __Cox regression is a semi-parametric model__ that incorporates __a non-parametric component for the baseline hazard function__, making it unspecified. Additionally, the estimation of the baseline hazard function is often challenging, and its statistical inference remains mysterious in the cutting-edge research of survival analysis. In comparison, estimating the cumulative (or integrated) baseline hazard function is generally easier than estimating the baseline hazard function. Therefore, we will start our illustration by focusing on the estimation of the cumulative baseline hazard function. + +## 1. Estimator + +We begin with a non-parametric estimator, the __Nelson-Aalen estimator for the cumulative baseline hazard function__. It can be expressed as: + +$$ \hat{\Lambda}(t) = \sum_{i: t_{i} \leq t} \frac{d_i}{r_{i}}, $$ + +where $$ d_i $$ is the number of events at time $$ t_{i} $$, and $$ r_{i} $$ is the number of patients "at risk" (i.e., under observation) at time $$ t_{i} $$. More specifically, under counting-process-type expression, it can also be expressed as: + +$$ \hat{\Lambda}(t) = \int_{0}^{t} \frac{d \bar{N}(s)}{ \sum_{i} \bar{Y}_{i}(s) }, $$ + +where $$ d \bar{N}(s) $$ is the number of events occurring precisely at time $$ s $$, and $$ \bar{Y}_{i}(s) $$ is the number of patients "at risk" at time $$ s $$. Heuristically, the __Nelson-Aalen estimator is a method of moments estimator__ and can be reasonably considered as a Poisson process at a short time period $$ t $$. Therefore, its estimated variance can be obtained as + +$$ \hat{Var}(\hat{\Lambda}) = \sum_{i: t_{i} \leq t} \frac{d_i}{r_{i}^2}, $$ + +where $$ r_{i} $$ is the number of patients "at risk" (i.e., under observation). The detailed procedure involves measure theory and stochastic process and is quite technical, so we will omit it here. In addition to the Nelson-Aalen estimator, the Breslow estimator for the cumulative baseline hazard function is often used after fitting Cox regression. Essentially, the Breslow estimator aims to estimate a survival function $$ S(t) $$ and is intricately connected with the cumulative baseline hazard function $$ S(t) = e^{- \Lambda(t)} $$. As the exponential function is continuous and monotonic, the crucial aspect of the Breslow estimator lies in obtaining the estimate of $$\Lambda(t) $$. In general, the Breslow estimator can be expressed as + +$$ \hat{\Lambda}(t) = \sum_{i: t_{i} \leq t} \frac{d_i}{\sum_{i: t_{i} \leq t} e^{X_{i} \hat{\beta}}}, $$ + +where $$ \hat{\beta} $$ can be obtained from the maximum partial likelihood estimator. __Interestingly, the Breslow estimator is equivalent to the Nelson-Aalen estimator when all covariates are set to zero__. Considering the counting-process-type expression, we can rewrite it as follows: + +$$ \hat{\Lambda}(t) = \sum_{i = 1}^{n} [\int_{0}^{t} \frac{dN_{i}(s)}{\sum_{i = 1}^{n}(Y_{i}(s) e^{X_{i} \hat{\beta}})}], $$ + +where $$ dN_{i}(s) $$ indicates if an event happened at time $$ s $$ for a patient $$ i $$, $$ Y_{i}(s) $$ indicates if a patient $$ i $$ is still under observation at time $$ s $$, $$ n $$ is the total number of patients. Therefore, we noticed that + +$$ \bar{N}(s) = \sum_{i = 1}^{n} N_{i}(s), \quad \bar{Y}_{i}(s) = \sum_{i = 1}^{n} Y_{i}(s). $$ + +The detailed proof involves the use of Fubini's theorem to enable the interchangeability of integration and summation, but we will omit it here. Since both estimators are designed for the cumulative (integrated) baseline hazard, if we aim to derive the baseline hazard function $$ \lambda_{0}(t) $$ from the cumulative baseline hazard $$ \Lambda(t) $$, we need to take the __stepwise discrete-time derivative for the cumulative (integrated) baseline hazard__. It is worth noting that, while __this approximation may not be very precise, the associated statistical inference (i.e., asymptotics) remains somewhat mysterious. Nevertheless, it serves as an informative tool for visualizing the baseline risk of event occurrence__. Sometimes, understanding the behavior of a Cox model in this context can be useful and may be used for comparisons with the incidence rate over time in lots of infectious diseases studies. Particularly, when applying time-dependent covariate or time-varying coefficient Cox models, the visualization of the baseline hazard can convey more information about the underlying behavior of the model. Now, we are moving on to the visualization part. + +## 2. Dataset + +The simulated dataset is designed to investigate the risk of secondary infection with various respiratory viruses after the primary rhinovirus infections, representing a type of viral interference. It comprises six columns: + +- `PID`: this is an **unique identification** for each participants (string); + +- `start` and `stop`: these two columns represent the __counting-process-type of data__, where they together __define the observation time interval__ (`start`, `end`] (numerical); + +- `rhinovirusInitial`: this is a __binary indicator for whether a rhinovirus infection is detected__ at the end of an observation interval (i.e., 1 for detected and 0 for not detected) (numerical); + +- `secondInfection`: this is a __binary indicator for whether other viral infection is detected__ at the end of an observation interval (i.e., 1 for detected and 0 for not detected) and it is the outcome variable (numerical); + +- `afterInitialInfection`: this is a __time-dependent covariate__ that categorizes an observational interval into four types: no secondary infections, detected secondary infection less than or equal to 90 days after the primary infection, detected secondary infection greater than 90 days but less than or equal to 180 days after the primary infection, and detected secondary infection greater than 180 days but less than or equal to 365 days after the primary infection [4 levels: `No`, `Day <= 90`, `90 < Day <= 180`, `180 < Day <= 365`] (string). + +![](https://raw.githubusercontent.com/YzwIsALaity/Baseline-Hazard-Plot-Tutorial-in-R/256df4a29230228e7a0effc11944d042fd84a528/Dataset.png) + +We also assumed that these patients are part of a __prospective longitudinal cohort study__, ensuring that each patient had at least two observations over the study period. To incorporate the __calendar time scale__ into the modeling procedure, the `start` and `stop` time points are adjusted relative to the study's initialized calendar date. + +## 3. Stepwise discrete-time derivative + +Given the awareness that only the cumulative (integrated) baseline hazard can be estimated with valid statistical inference, results from a Cox model in R provide the estimated cumulative baseline hazard function using the Breslow estimator, which is also equivalent to the Nelson-Aalen estimator when all covariates are set to zero. Therefore, we need to create a manual function to calculate the stepwise discrete-time derivative and use it to manually perform the derivative for the estimated cumulative baseline hazard function. The following code snippet is an example of performing the stepwise discrete-time derivative. + +```{r} +finite_differences <- function(x, y) { + n <- length(x) + derivativeFunction <- vector(length = n) + for (i in 2:n) { + derivativeFunction[i-1] <- (y[i-1] - y[i]) / (x[i-1] - x[i]) + } + derivativeFunction[n] <- (y[n] - y[n - 1]) / (x[n] - x[n - 1]) + return(derivativeFunction) +} +``` + +Since there are multiple code structures that can compute the stepwise discrete-time derivative, we present a self-created version. One may choose any type of stepwise discrete-time derivative function to calculate the derivative. + +## 4. Baseline hazard function + +To obtain the estimated baseline hazard function, we first need to construct a Cox model in R. Essentially, we need to utilize the `Surv()` and `coxph()` functions from the `survival` package to construct a Cox model: + +- `Surv()`: it is a function used to __create a survival object__, typically employed as a response variable in a model formula. It requires to specify several arguments: + + - `time` and `time2`: they are used to specify an observation time interval, denoted as (`time`, `time2`] = (start, stop]); + + - `event`: it is used to specify the status indicator, representing a binary event outcome. + +- `coxph()`: it is used to __fit a Cox proportional hazards regression model__. + +We will utilize the `Surv()` function to set up a survival outcome and employ `coxph()` to fit a model. + +```{r} +require(survival) +# Fit cox regression +fomula <- Surv(start, stop, secondInfection) ~ afterInitialInfection # construct formula +model <- coxph(fomula, cluster = PID, data = Dt) # due to repeated measurement over time using robust variance (set cluster = PID) +Summary <- summary(model) # Obtain model summary + +# Create coefficient table +resultTable <- cbind(Summary$coefficients, Summary$conf.int[, c('lower .95', 'upper .95')]) +round(resultTable, 4) +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Baseline-Hazard-Plot-Tutorial-in-R/256df4a29230228e7a0effc11944d042fd84a528/Coefficient%20table.png) + +After fitting the Cox model, we obtained estimated hazard ratios for the time-dependent covariate to evaluate the risk of secondary infections of other viruses after the primary rhinovirus infection. We found that **the risk of secondary infections of other viruses within 90 days (HR: 0.58, 95% CI: 0.43, 0.77) and after 180 days but before 365 days (HR: 0.27, 95% CI: 0.18, 0.42) after the primary rhinovirus infection was significantly decreased compared to patients without a primary rhinovirus infection**. Although the risk of secondary infections of other viruses after 90 days but before 180 days was not statistically significant, it was still estimated as a decreased risk. These results are consistent with previous research related to the antiviral state after human rhinovirus infection. + +Now that we have obtained the fitted Cox model, we are going to derive the estimated cumulative baseline hazard function from this model using the `basehaz()` function. Essentially, this function is used to __compute the cumulative hazard at each time point. By default, all covariates in the fitted Cox model are set to 0__. Additionally, it is an alias for the `survfit()` function, which creates survival curves from survival models. It includes two columns: `hazard` represents the estimated cumulative hazard, and `time` corresponds to the timepoints for the cumulative hazard. + +```{r} +# Obtain cumulative baseline hazard function from Breslow estimator (same as Nelson-Aalen estimator by default) +bhest <- basehaz(model) +bhest +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Baseline-Hazard-Plot-Tutorial-in-R/256df4a29230228e7a0effc11944d042fd84a528/Cumulative%20hazard%20function.png) + +Given that the estimated cumulative hazard is a function of time (i.e., $$ \hat{\Lambda}(t) $$), we can manually derive the stepwise discrete-time derivative for the estimated cumulative baseline hazard function (`hazard`) with respect to `time`. Utilizing our self-created derivative function from the prior section, we then applied a non-parametric smooth curve to both the estimated cumulative baseline hazard function and baseline hazard function. There are various smoothing techniques available, and for this analysis, we employed the `ss()` function from the `npreg` package. We specified different degrees of freedom (`df` argument) to create smooth splines for the two estimated baseline hazards. + +```{r} +# Calculate numerical derivative +deri <- finite_differences(x = bhest$time, y = bhest$hazard) +baselineHazard <- cbind(bhest, deri) +colnames(baselineHazard) <- c('CumulativeBaselineHazard', 'Time', 'BaselineHazard') + +# Non-parametric smooth curve for estimating baseline hazard function over time +smoothBH <- ss(x = baselineHazard$Time, y = baselineHazard$BaselineHazard, df = 4) + +# Non-parametric smooth curve for estimating cumulative baseline hazard function over time +smoothCBH <- ss(x = baselineHazard$Time, y = baselineHazard$CumulativeBaselineHazard, df = 6) + +Dt_Plot <- data.frame('Time' = baselineHazard$Time, + 'BH' = baselineHazard$BaselineHazard, + 'smoothBH' = smoothBH$y, + 'CBH' = baselineHazard$CumulativeBaselineHazard, + 'smoothCBH' = smoothCBH$y) +``` + +After combining the smooth curves with the original estimated (cumulative) baseline hazard functions using the `Time` steps, we will use this combined dataset to visualize these two functions. Initially, we visualize the raw estimated (cumulative) baseline functions without applying smoothers, using different colors. Here, since the __cumulative baseline hazard__ ($$ \Lambda(t) \geq 0 $$) and __baseline hazard__ ($$ 0 \leq \lambda_{0}(t) \leq 1 $$) are on different scales, we __create a second y-axis for the cumulative baseline hazard__. Therefore, it is necessary to __construct a scale factor__ to map values from the scale of cumulative hazard to the scale of baseline hazard. + +```{r} +# Set up color for smooth curves +colors <- c("Baseline hazard" = "blue", "Cumulative baseline\nhazard" = "#FFA500") + +# Create scale factor for second y-axis +scaleFactor <- max(Dt_Plot$BH) / max(Dt_Plot$CBH) + +# Create x and y labels +xlab <- seq(0, 1000, length.out = 11) +ylab <- seq(0, 0.04, length.out = 5) + +# Create figure +p <- + ggplot(Dt_Plot, aes(x = Time)) + + geom_point(aes(y = BH, color = 'Baseline hazard'), size = 0.75, alpha = 0.3) + + geom_path(aes(y = BH, color = 'Baseline hazard')) + + geom_path(aes(y = CBH * scaleFactor, color = 'Cumulative baseline\nhazard')) + + geom_point(aes(y = CBH * scaleFactor, color = 'Cumulative baseline\nhazard'), size = 0.75, alpha = 0.3) + + labs(x = 'Time', y = 'Raw baseline hazard') + + scale_x_continuous(expand = c(0, 0.01), limits = c(0, 1000), breaks = xlab, labels = xlab) + + scale_y_continuous(expand = c(0, 0.01), + sec.axis = sec_axis(trans = ~ ./scaleFactor, name = "Raw cumulative baseline hazard")) + + scale_color_manual(values = colors) + + theme_bw() + + theme(panel.background = element_blank(), + axis.line.x = element_line(), + axis.line.y = element_line(), + axis.text.x = element_text(color = "black", size = 7), + axis.text.y = element_text(color = 'black', size = 7), + legend.title = element_blank()) +p +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Baseline-Hazard-Plot-Tutorial-in-R/256df4a29230228e7a0effc11944d042fd84a528/Figure%201%20(Raw).jpeg) + +In this figure, the estimated baseline hazard points are depicted in blue using the left y-axis, the estimated cumulative baseline hazard is depicted in orange using the right y-axis, and the x-axis represents the time. __It is noteworthy that the relationship between the two estimated functions is such that the slope of the cumulative baseline hazard function at each time point actually reflects the value of the baseline hazard function at that specific time point__. The raw estimated cumulative baseline hazard function appears smoother than the estimated baseline hazard function. Now we are going to visualize the smoothed version of these two functions. + +```{r} +# Create scale factor for second y-axis +scaleFactor <- max(Dt_Plot$smoothBH) / max(Dt_Plot$smoothCBH) + +# Create figure +p <- + ggplot(Dt_Plot, aes(x = Time)) + + geom_path(aes(y = smoothBH, color = 'Baseline hazard')) + + geom_path(aes(y = smoothCBH * scaleFactor, color = 'Cumulative baseline\nhazard')) + + labs(x = 'Time', y = 'Baseline hazard') + + scale_x_continuous(expand = c(0, 0), limits = c(0, 1000), breaks = xlab, labels = xlab) + + scale_y_continuous(expand = c(0, 0), limits = c(0, 0.04), breaks = ylab, labels = round(ylab, 3), + sec.axis = sec_axis(trans = ~ ./scaleFactor, name = "Cumulative baseline hazard")) + + scale_color_manual(values = colors) + + theme_bw() + + theme(panel.background = element_blank(), + axis.line.x = element_line(), + axis.line.y = element_line(), + axis.text.x = element_text(color = "black", size = 7), + axis.text.y = element_text(color = 'black', size = 7), + legend.title = element_blank()) +p +``` + +![](https://raw.githubusercontent.com/YzwIsALaity/Baseline-Hazard-Plot-Tutorial-in-R/256df4a29230228e7a0effc11944d042fd84a528/Figure%202%20(Smooth).jpeg) + +In the smoothed version, the estimated cumulative baseline hazard function monotonically increases, while the estimated baseline hazard function reflects the baseline risk of secondary viral infections when there are no factors adjusted for. The risk of getting a secondary viral infection in the environment decreases initially, increases in the second stage, and eventually decreases again approaching the end of the study period. Since this is simulated data, the estimated baseline risk may not truly reflect respiratory viral interference in the environment. Sometimes, this __estimated baseline risk can be verified by the smooth curve for the estimated incidence rate calculated from the same study__. + diff --git a/index.html b/index.html index 0b3329cdd2f..f874ca42a94 100644 --- a/index.html +++ b/index.html @@ -1,10 +1,10 @@ --- layout: default -title: "Home" +title: "Blog" --- {% if site.show_excerpts %} {% include home.html %} {% else %} - {% include archive.html title="Posts" %} + {% include archive.html title="posts" %} {% endif %}