# Companion file to "The Indian exception: complex prepositions in the
# Kolhapur Corpus"
# Author: Guillaume Desagulier
# Last updated: December 20th, 2017
# If you use this script for a talk or a publication, please cite this
# article as: Guillaume Desagulier, "The Indian exception: complex
# prepositions in the Kolhapur Corpus," in Around the word, 20/12/2017,
# https://corpling.hypotheses.org/284.
#
# Overview of what the script does:
#   1. defines a regular expression listing simple and complex prepositions;
#   2. merges the per-corpus preposition tables and derives the corpus name
#      and text category from the corpus file name;
#   3. cross-tabulates prepositions by corpus and by text category;
#   4. runs and plots a correspondence analysis (CA) with FactoMineR.

# Dependencies ----
# Install a package only when it is missing, instead of reinstalling it on
# every run. The package name must be a quoted string.
if (!requireNamespace("stringi", quietly = TRUE)) {
  install.packages("stringi")
}
if (!requireNamespace("FactoMineR", quietly = TRUE)) {
  install.packages("FactoMineR")
}
library(stringi)
library(FactoMineR)

# Preposition search expression ----
# One alternation group listing every preposition of interest. It is split
# over several string chunks only to keep the lines short; paste0() glues
# them back into a single pattern.
# This pattern is not used further down in this script.
# NOTE(review): several short alternatives precede longer ones that start
# with the same words (e.g. "after" before "after the fashion of"); whether
# that matters depends on the regex engine used by the caller -- confirm.
expr.match <- paste0(
  "(a cut above|a la|abaft|aboard|about|above|absent|according to|across|",
  "afore|after|after the fashion of|against|agin|ahead of|all for|all over|",
  "along|along with|alongside|amid|amidst|among|anent|anti|apart from|",
  "apropos|apropos of|around|as|as far as|as for|as from|as of|as regards|",
  "as to|as well as|aside from|aslant|astraddle|astride|at|at a range of|",
  "at the hand of|at the hands of|at the heels of|athwart|atop|back of|bar|",
  "bare of|barring|because of|before|behind|below|beneath|beside|besides|",
  "between|betwixt|beyond|but|but for|by|by courtesy of|by dint of|",
  "by force of|by means of|by reason of|by the hand of|by the hands of|",
  "by the name of|by virtue of|by way of|care of|chez|circa|come|",
  "complete with|concerning|considering|contrary to|counting|courtesy of|",
  "cum|dehors|depending on|despite|down|due to|during|ere|ex|except|",
  "except for|excepting|excluding|exclusive of|failing|following|for|",
  "for all|for the benefit of|forbye|fore|fornent|frae|from|give or take|",
  "given|gone|having regard to|in|in accord with|in addition to|",
  "in advance of|in aid of|in back of|in bed with|in behalf of|in case of|",
  "in common with|in company with|in connection with|in consideration of|",
  "in contravention of|in default of|in excess of|in face of|in favor of|",
  "in favour of|in front of|in honor of|in honour of|in keeping with|",
  "in lieu of|in light of|in line with|in memoriam|in need of|in peril of|",
  "in place of|in proportion to|in re|in reference to|in regard to|",
  "in relation to|in respect of|in sight of|in spite of|in terms of|",
  "in the course of|in the face of|in the fashion of|in the grip of|",
  "in the light of|in the matter of|in the midst of|in the name of|",
  "in the pay of|in the person of|in the shape of|in the teeth of|",
  "in the throes of|in token of|in view of|in virtue of|including|",
  "inclusive of|inside|inside of|instead of|into|irrespective of|less|like|",
  "little short of|mid|midst|minus|mod|modulo|more like|near|near to|neath|",
  "next door to|next to|nigh|nothing short of|notwithstanding|of|",
  "of the name of|of the order of|off|on|on a level with|on a par with|",
  "on account of|on behalf of|on pain of|on the order of|on the part of|",
  "on the point of|on the score of|on the strength of|on the stroke of|",
  "on top of|onto|opposite|other than|out of|out of keeping with|",
  "out of line with|outboard of|outside|outside of|outta|outwith|over|",
  "over against|over and above|overtop|owing to|pace|past|pending|per|plus|",
  "preparatory to|previous to|prior to|pro|pursuant to|qua|re|regarding|",
  "regardless of|relative to|respecting|round|round about|sans|save|saving|",
  "short for|short of|since|subsequent to|than|thanks to|this side of|",
  "through|throughout|thru|thwart|till|to|to the accompaniment of|",
  "to the tune of|together with|touching|toward|towards|under|",
  "under cover of|under pain of|under sentence of|under the heel of|",
  "underneath|unlike|until|unto|up|up against|up and down|up before|up for|",
  "up to|upon|upside|upward of|upwards of|versus|via|vice|vis-a-vis|while|",
  "with|with reference to|with regard to|with respect to|",
  "with the exception of|within|within sight of|without)"
)

# Merge the three corpora ----
# The workspace is deliberately NOT wiped with rm(list = ls()) anymore: that
# call destroyed expr.match right after its definition and deleted any
# unrelated objects of the person running the script.

# load each data frame
kolhapur <- read.table("~/kolhapur.preps.txt", header = TRUE, sep = "\t")
brown <- read.table("~/brown.preps.txt", header = TRUE, sep = "\t")
lob <- read.table("~/lob.preps.txt", header = TRUE, sep = "\t")

# combine the data frames with rbind()
all <- rbind(kolhapur, brown, lob)

# split the corpus name and the text category identifier:
# the part of corpus.file before the first "_" is the corpus, the part after
# it (minus the ".txt" extension) is the category identifier.
# vapply() returns plain character vectors (NA when a piece is missing)
# rather than list columns.
file.parts <- strsplit(as.character(all$corpus.file), "_")
all$corpus <- vapply(file.parts, `[`, character(1), 1)
all$category <- gsub("\\.txt", "", vapply(file.parts, `[`, character(1), 2))
all$corpus.file <- NULL

# replace the text category identifier with what it stands for;
# identifiers that are not in the table are left untouched
category.labels <- c(
  a = "press_reportage",
  b = "press_editorial",
  c = "press_reviews",
  d = "religion",
  e = "skills_trades_hobbies",
  f = "popular_lore",
  g = "belles_lettres",
  h = "miscellaneous",
  j = "learned_scientific",
  k = "general_fiction",
  l = "mystery_detective_fiction",
  m = "science_fiction",
  n = "adventure_western_fiction",
  p = "romance_love_story",
  r = "humour"
)
is.known <- all$category %in% names(category.labels)
all$category[is.known] <- unname(category.labels[all$category[is.known]])

# make sure the modalities of the corpus and category variables are factors
all$corpus <- as.factor(all$corpus)
all$category <- as.factor(all$category)

# Tables of counts ----
# load the dataframe
# NOTE(review): this script never writes "~/all.preps.txt"; presumably it is
# an export of the merged data frame built above -- confirm.
df <- read.table("~/all.preps.txt", header = TRUE, row.names = 1, sep = "\t")

# convert the table of categorical data into a table of counts
# step 1: preposition x corpus
dfc <- table(df$preposition, df$corpus)
dfcm1 <- as.data.frame.matrix(dfc)

# step 2: preposition x text category
dfc2 <- table(df$preposition, df$category)
dfcm2 <- as.data.frame.matrix(dfc2)
head(dfcm2)

# combine the two tables of counts
dfcm <- cbind(dfcm1, dfcm2)

# count the number of words per preposition with the stringi package:
# the prepositions are the row names, and the number of words is the number
# of spaces plus 1. The result is stored as a factor (qualitative variable).
dfcm$prep.length <- as.factor(stri_count_fixed(rownames(dfcm), " ") + 1)

# Correspondence analysis ----
# The corpus columns are the active columns. The text-category columns are
# supplementary, and prep.length is a supplementary qualitative variable.
# Positions are derived from the table widths; with 3 corpora and 15 text
# categories this yields columns 4 to 18 and column 19.
n.corpus <- ncol(dfcm1)
n.category <- ncol(dfcm2)
category.cols <- n.corpus + seq_len(n.category)
length.col <- n.corpus + n.category + 1

# run CA on dfcm; do not plot the graph yet
ca.object <- CA(
  dfcm,
  col.sup = category.cols,
  quali.sup = length.col,
  graph = FALSE
)

# plot the CA output
plot.CA(
  ca.object,
  invisible = "row",
  autoLab = "yes",
  shadow = TRUE,
  cex = .8,
  col.col = "magenta",
  col.col.sup = "dodgerblue",
  title = "Distribution of prepositions based on lexical complexity in three corpora:\n LOB (British English), Brown (US English), and Kolhapur (Indian English)",
  cex.main = .8
)