Text Analysis

Text to analyze:

Lexical diversity options:

Readability options:

Word length (letters)
Word length (syllables)
Summary
Details
                      
Summary
Details
                      
Note

This web application is developed with Shiny.


List of Packages Used
library(shiny)
library(koRpus)

Code

Source code for this application is mostly from koRpus: An R package for text analysis.

The code for this web application is available at GitHub.

If you want to run this code on your computer (in a local R session), run the code below:
library(shiny)
runGitHub("corpus","mizumot")


Citation in Publications

Mizumoto, A. (2015). Langtest (Version 1.0) [Web application]. Retrieved from http://langtest.jp


Article

Mizumoto, A., & Plonsky, L. (2015). R as a lingua franca: Advantages of using R for quantitative research in applied linguistics. Applied Linguistics, Advance online publication. doi:10.1093/applin/amv025


Recommended

To learn more about R, I suggest this excellent and free e-book (pdf), A Guide to Doing Statistics in Second Language Research Using R, written by Dr. Jenifer Larson-Hall.

Also, if you are a cool Mac user and want to use R with GUI, MacR is definitely the way to go!


Author

Atsushi MIZUMOTO, Ph.D.
Professor of Applied Linguistics
Faculty of Foreign Language Studies /
Graduate School of Foreign Language Education and Research,
Kansai University, Osaka, Japan



Code for "Text Analysis"
by Atsushi Mizumoto

show with app
library(shiny)
library(koRpus)

# Server logic for the "Text Analysis" app.
#
# The submitted text is tokenized and hyphenated once in two shared reactives;
# every table, plot and printout below is derived from those reactives (the
# word list is the exception and works on the raw input string directly).
shinyServer(function(input, output) {

  # Tokenized form of the string held in input$text (lang is fixed to "en").
  tagged_text <- reactive({
    tokenize(input$text, format = "obj", lang = "en")
  })

  # Hyphenated version of the tokens, used by the syllable statistics and
  # passed on to readability().
  hyphenated_text <- reactive({
    # Hint: when hosting on a shiny server, activate the following line to
    # enable hyphenation caching.
    # NOTE(review): it reads input$lang, which the visible UI does not
    # define -- adjust before enabling.
    #set.kRp.env(hyph.cache.file=file.path("/var","shiny-server","cache","koRpus",paste("hyph.cache.",input$lang,".rdata", sep="")))
    hyphen(tagged_text(), quiet = TRUE)
  })

  # Word frequency list: lower-case the raw text, split on runs of whitespace
  # or punctuation, drop empty pieces, and tabulate in descending frequency.
  output$word.list <- renderTable({
    lowered <- tolower(input$text)
    pieces <- unlist(strsplit(lowered, split = "[[:space:]]+|[[:punct:]]+"))
    tokens <- pieces[pieces != ""]
    # Naming the argument keeps the first column header as "Word".
    counts <- as.data.frame(table(Word = tokens))
    counts[order(counts$Freq, decreasing = TRUE), ]
  })

  # Plot of the tokenized text's letter statistics.
  output$letter.plot <- renderPlot({
    plot(tagged_text(), what = "letters")
  })

  # Descriptive statistics, shown as a single "Value" column with one row per
  # measure. Each label vector maps a describe() element name to its label.
  output$desc <- renderTable({
    basic_labels <- c(
      all.chars        = "All characters",
      normalized.space = "Normalized space",
      chars.no.space   = "Characters (no space)",
      letters.only     = "Characters (letters only)",
      lines            = "Lines",
      punct            = "Punctuation",
      digits           = "Digits",
      words            = "Words",
      sentences        = "Sentences",
      avg.sentc.length = "Avg. sentence length",
      avg.word.length  = "Avg. word length"
    )
    syllable_labels <- c(
      num.syll      = "Syllables",
      avg.syll.word = "Avg. syllable per word"
    )

    basic_stats <- as.data.frame(
      describe(tagged_text())[names(basic_labels)]
    )
    syllable_stats <- as.data.frame(
      describe(hyphenated_text())[names(syllable_labels)]
    )
    colnames(basic_stats) <- unname(basic_labels)
    colnames(syllable_stats) <- unname(syllable_labels)

    all_stats <- cbind(basic_stats, syllable_stats)
    rownames(all_stats) <- "Value"
    # Transpose so the measures run down the rows.
    t(all_stats)
  })

  # Distribution of word lengths measured in letters (transposed for display).
  output$desc.lttr.disrib <- renderTable({
    letter_distribution <- describe(tagged_text())[["lttr.distrib"]]
    t(letter_distribution)
  })

  # Distribution of word lengths measured in syllables (transposed for display).
  output$syll.disrib <- renderTable({
    syllable_distribution <- describe(hyphenated_text())[["syll.distrib"]]
    t(syllable_distribution)
  })

  # Lexical diversity measures, parameterised by the LD.* sidebar inputs.
  lexdiv_results <- reactive({
    lex.div(
      tagged_text(),
      segment = input$LD.segment,
      factor.size = input$LD.factor,
      min.tokens = input$LD.minTokens,
      rand.sample = input$LD.random,
      window = input$LD.window,
      case.sens = input$LD.caseSens,
      detailed = FALSE,
      char = c(),
      quiet = TRUE
    )
  })
  output$lexdiv.sum <- renderTable({
    summary(lexdiv_results())
  })
  output$lexdiv.res <- renderPrint({
    lexdiv_results()
  })

  # Readability indices selected in the RD.indices checkbox group.
  readability_results <- reactive({
    readability(
      tagged_text(),
      hyphen = hyphenated_text(),
      index = input$RD.indices,
      quiet = TRUE
    )
  })
  output$readability.sum <- renderTable({
    summary(readability_results())
  })
  output$readability.res <- renderPrint({
    readability_results()
  })

})
library(shiny)

# User interface for the "Text Analysis" app.
#
# Layout: a sidebar with the text box and two tab-specific option panels, and
# a main panel with one tab per analysis plus an "About" tab.
#
# The option panels are conditionalPanel()s whose JavaScript conditions read
# `input.tab`. That input only exists when the tabsetPanel is given
# id = "tab"; without it the conditions are never true and the lexical
# diversity / readability options stay hidden. The id is therefore set
# explicitly on the tabsetPanel below.
shinyUI(
  pageWithSidebar(
    headerPanel("Text Analysis"),

    sidebarPanel(
      # limit the maximum amount of text to be analyzed
      # NOTE(review): the contents of maxlength.html are not visible here.
      includeHTML("./maxlength.html"),
      h4("Text to analyze:"),
      # The onblur/onfocus handlers emulate a placeholder: the hint text is
      # cleared when the box gains focus and restored if it is left empty.
      tags$textarea(id="text", rows=30, cols=35, maxlength=10000,
        onblur="if(this.value==\"\") this.value=\"(Paste your text here. Text limit is 10000 characters, but should at least have 100 words.)\";",
        onfocus="if(this.value==\"(Paste your text here. Text limit is 10000 characters, but should at least have 100 words.)\") this.value=\"\";",
        "(Paste your text here. Text limit is 10000 characters, but should at least have 100 words.)"),
      # Shown only while the tab with value "chkLexdiv" is selected.
      conditionalPanel("input.tab == 'chkLexdiv'",
        h4("Lexical diversity options:"),
        numericInput("LD.segment", "MSTTR segment size:", 100),
        sliderInput("LD.factor", "MTLD/MTLD-MA factor size:", min=0, max=1, value=0.72),
        numericInput("LD.minTokens", "MTLD-MA min. tokens/factor:", 9),
        numericInput("LD.random", "HD-D sample size:", 42),
        numericInput("LD.window", "MATTR moving window:", 100),
        checkboxInput("LD.caseSens", "Case sensitive", FALSE)
      ),
      # Shown only while the tab with value "chkReadability" is selected.
      conditionalPanel("input.tab == 'chkReadability'",
        h4("Readability options:"),
        # choices maps the displayed label to the index name sent to the
        # server; selected lists the indices ticked by default.
        checkboxGroupInput("RD.indices", label="Measures to calculate",
          choices=c("ARI"="ARI",
            "ARI (NRI)"="ARI.NRI",
            "ARI (simplified)"="ARI.simple",
            "Coleman-Liau"="Coleman.Liau",
            "Danielson-Bryan"="Danielson.Bryan",
            "Dickes-Steiwer"="Dickes.Steiwer",
            "ELF"="ELF",
            "Farr-Jenkins-Paterson"="Farr.Jenkins.Paterson",
            "Farr-Jenkins-Paterson (Powers-Sumner-Kearl)"="Farr.Jenkins.Paterson.PSK",
            "Flesch"="Flesch",
            "Flesch (Powers-Sumner-Kearl)"="Flesch.PSK",
            "Flesch (DE, Amstad)"="Flesch.de",
            "Flesch (ES, Fernandez-Huerta)"="Flesch.es",
            "Flesch (FR, Kandel-Moles)"="Flesch.fr",
            "Flesch (NL, Douma)"="Flesch.nl",
            "Flesch-Kincaid"="Flesch.Kincaid",
            "FOG"="FOG",
            "FOG (Powers-Sumner-Kearl)"="FOG.PSK",
            "FOG (NRI)"="FOG.NRI",
            "FORCAST"="FORCAST",
            "FORCAST (reading grade level)"="FORCAST.RGL",
            "Fucks Stilcharakteristik"="Fucks",
            "Linsear-Write"="Linsear.Write",
            "LIX"="LIX",
            "Neue Wiener Sachtextformeln"="nWS",
            "RIX"="RIX",
            "SMOG"="SMOG",
            "SMOG (DE, Bamberger-Vanecek)"="SMOG.de",
            "SMOG (formula C)"="SMOG.C",
            "SMOG (simplified)"="SMOG.simple",
            "Strain"="Strain",
            "TRI"="TRI",
            "Wheeler-Smith"="Wheeler.Smith",
            "Wheeler-Smith (DE, Bamberger-Vanecek)"="Wheeler.Smith.de"),
          selected=c("ARI",
            "Coleman.Liau",
            "Danielson.Bryan",
            "Dickes.Steiwer",
            "ELF",
            "Farr.Jenkins.Paterson",
            "Flesch",
            "Flesch.Kincaid",
            "FOG",
            "FORCAST",
            "Fucks",
            "Linsear.Write",
            "LIX",
            "RIX",
            "SMOG",
            "Strain",
            "Wheeler.Smith"))
      )
#      submitButton("Update View")
    ),



    mainPanel(
      # id = "tab" publishes the selected tab as input$tab / input.tab, which
      # the sidebar's conditionalPanel() conditions depend on. Tabs without an
      # explicit `value` report their title.
      tabsetPanel(id = "tab",
        tabPanel("Descriptive statistics",
          tableOutput("desc"),
          h5("Word length (letters)"),
          tableOutput("desc.lttr.disrib"),
          h5("Word length (syllables)"),
          tableOutput("syll.disrib"),
          plotOutput("letter.plot")
        ),
        tabPanel("Word list",
          tableOutput("word.list")
        ),
        tabPanel("Lexical diversity",
          h5("Summary"),
          tableOutput("lexdiv.sum"),
          h5("Details"),
          pre(textOutput("lexdiv.res")),
          value="chkLexdiv"
        ),
        tabPanel("Readability",
          h5("Summary"),
          tableOutput("readability.sum"),
          h5("Details"),
          pre(textOutput("readability.res")),
          value="chkReadability"
        ),


        # Static credits, links and citation information.
        tabPanel("About",
          strong('Note'),
            p('This web application is developed with',
            a("Shiny.", href="http://www.rstudio.com/shiny/", target="_blank"),
            ''),
          br(),

          strong('List of Packages Used'), br(),
            code('library(shiny)'),br(),
            code('library(koRpus)'),br(),

          br(),

          strong('Code'),
            p('Source code for this application is mostly from',
            a('koRpus: An R packge for text analysis.', href='http://reaktanz.de/?c=hacking&s=koRpus', target="_blank")),

            p('The code for this web application is available at',
            a('GitHub.', href='https://github.com/mizumot/corpus', target="_blank")),

            p('If you want to run this code on your computer (in a local R session), run the code below:',
            br(),
            code('library(shiny)'),br(),
            code('runGitHub("corpus","mizumot")')
            ),

          br(),

          strong('Citation in Publications'),
            p('Mizumoto, A. (2015). Langtest (Version 1.0) [Web application]. Retrieved from http://langtest.jp'),

          br(),

          strong('Article'),
            p('Mizumoto, A., & Plonsky, L. (2015).', a("R as a lingua franca: Advantages of using R for quantitative research in applied linguistics.", href='http://applij.oxfordjournals.org/content/early/2015/06/24/applin.amv025.abstract', target="_blank"), em('Applied Linguistics,'), 'Advance online publication. doi:10.1093/applin/amv025'),

          br(),

          strong('Recommended'),
            p('To learn more about R, I suggest this excellent and free e-book (pdf),',
            a("A Guide to Doing Statistics in Second Language Research Using R,", href="http://cw.routledge.com/textbooks/9780805861853/guide-to-R.asp", target="_blank"),
              'written by Dr. Jenifer Larson-Hall.'),

            p('Also, if you are a cool Mac user and want to use R with GUI,',
              a("MacR", href="https://sites.google.com/site/casualmacr/", target="_blank"),
              'is defenitely the way to go!'),

          br(),

          strong('Author'),
            p(a("Atsushi MIZUMOTO,", href="http://mizumot.com", target="_blank"),' Ph.D.',br(),
            'Professor of Applied Linguistics',br(),
            'Faculty of Foreign Language Studies /',br(),
            'Graduate School of Foreign Language Education and Research,',br(),
            'Kansai University, Osaka, Japan'),

          br(),

          a(img(src="http://i.creativecommons.org/p/mark/1.0/80x15.png"), target="_blank", href="http://creativecommons.org/publicdomain/mark/1.0/"),

          p(br())

        )

      ))
))
Code license: GPL-3