# Companion file to "The Indian exception: complex prepositions in the Kolhapur Corpus" 
# Author: Guillaume Desagulier
# Last updated: December 20th, 2017
# If you use this script for a talk or a publication, please cite this article as: Guillaume Desagulier, "The Indian exception: complex prepositions in the Kolhapur Corpus," in Around the word, 20/12/2017, https://corpling.hypotheses.org/284.

# Regular expression matching the simple and complex prepositions under study,
# written as a single parenthesised alternation.
# The pattern is assembled from two string pieces with paste0() so that no
# line break ends up inside the alternative "of the order of" (a string
# literal that continues on the next line keeps the newline as part of
# the pattern).
expr.match <- paste0(
  "(a cut above|a la|abaft|aboard|about|above|absent|according to|across|afore|after|after the fashion of|against|agin|ahead of|all for|all over|along|along with|alongside|amid|amidst|among|anent|anti|apart from|apropos|apropos of|around|as|as far as|as for|as from|as of|as regards|as to|as well as|aside from|aslant|astraddle|astride|at|at a range of|at the hand of|at the hands of|at the heels of|athwart|atop|back of|bar|bare of|barring|because of|before|behind|below|beneath|beside|besides|between|betwixt|beyond|but|but for|by|by courtesy of|by dint of|by force of|by means of|by reason of|by the hand of|by the hands of|by the name of|by virtue of|by way of|care of|chez|circa|come|complete with|concerning|considering|contrary to|counting|courtesy of|cum|dehors|depending on|despite|down|due to|during|ere|ex|except|except for|excepting|excluding|exclusive of|failing|following|for|for all|for the benefit of|forbye|fore|fornent|frae|from|give or take|given|gone|having regard to|in|in accord with|in addition to|in advance of|in aid of|in back of|in bed with|in behalf of|in case of|in common with|in company with|in connection with|in consideration of|in contravention of|in default of|in excess of|in face of|in favor of|in favour of|in front of|in honor of|in honour of|in keeping with|in lieu of|in light of|in line with|in memoriam|in need of|in peril of|in place of|in proportion to|in re|in reference to|in regard to|in relation to|in respect of|in sight of|in spite of|in terms of|in the course of|in the face of|in the fashion of|in the grip of|in the light of|in the matter of|in the midst of|in the name of|in the pay of|in the person of|in the shape of|in the teeth of|in the throes of|in token of|in view of|in virtue of|including|inclusive of|inside|inside of|instead of|into|irrespective of|less|like|little short of|mid|midst|minus|mod|modulo|more like|near|near to|neath|next door to|next to|nigh|nothing short of|notwithstanding|of|of the name of|of the order ",
  "of|off|on|on a level with|on a par with|on account of|on behalf of|on pain of|on the order of|on the part of|on the point of|on the score of|on the strength of|on the stroke of|on top of|onto|opposite|other than|out of|out of keeping with|out of line with|outboard of|outside|outside of|outta|outwith|over|over against|over and above|overtop|owing to|pace|past|pending|per|plus|preparatory to|previous to|prior to|pro|pursuant to|qua|re|regarding|regardless of|relative to|respecting|round|round about|sans|save|saving|short for|short of|since|subsequent to|than|thanks to|this side of|through|throughout|thru|thwart|till|to|to the accompaniment of|to the tune of|together with|touching|toward|towards|under|under cover of|under pain of|under sentence of|under the heel of|underneath|unlike|until|unto|up|up against|up and down|up before|up for|up to|upon|upside|upward of|upwards of|versus|via|vice|vis-a-vis|while|with|with reference to|with regard to|with respect to|with the exception of|within|within sight of|without)"
)

# the workspace is deliberately not cleared here: rm(list=ls(all=TRUE))
# would delete expr.match (defined above) and every other object of the
# current session

# load each data frame
# (one tab-separated file with a header row per corpus)
kolhapur <- read.table("~/kolhapur.preps.txt", header = TRUE, sep = "\t")
brown <- read.table("~/brown.preps.txt", header = TRUE, sep = "\t")
lob <- read.table("~/lob.preps.txt", header = TRUE, sep = "\t")

# combine the data frames with rbind()
# (rbind() on data frames requires the three files to share the same columns)
# NOTE(review): the name `all` masks base::all() as a variable; it is kept
# because the following steps of the script refer to it
all <- rbind(kolhapur, brown, lob)

# split the corpus name and the text category identifier
# corpus.file is cut at each "_": the first piece becomes the corpus name,
# the second piece (with ".txt" removed) becomes the category identifier
file.parts <- strsplit(as.character(all$corpus.file), "_", fixed = TRUE)

# vapply() returns plain character vectors, so the new columns are ordinary
# character columns rather than list columns; a missing piece yields NA
all$corpus <- vapply(file.parts, `[`, character(1), 1, USE.NAMES = FALSE)
all$category <- vapply(file.parts, `[`, character(1), 2, USE.NAMES = FALSE)
all$category <- gsub("\\.txt", "", all$category)

# the original column and the temporary split are no longer needed
all$corpus.file <- NULL
rm(file.parts)

# replace the text category identifier with what it stands for
# the one-letter identifiers are looked up in a named vector; identifiers
# that are not in the table (and NA) are left as they are
all$category <- local({
  full.names <- c(
    a = "press_reportage",
    b = "press_editorial",
    c = "press_reviews",
    d = "religion",
    e = "skills_trades_hobbies",
    f = "popular_lore",
    g = "belles_lettres",
    h = "miscellaneous",
    j = "learned_scientific",
    k = "general_fiction",
    l = "mystery_detective_fiction",
    m = "science_fiction",
    n = "adventure_western_fiction",
    p = "romance_love_story",
    r = "humour"
  )
  codes <- all$category
  known <- codes %in% names(full.names)
  codes[known] <- unname(full.names[codes[known]])
  codes
})

# make sure the modalities of the corpus and category variables are factors
all$corpus <- as.factor(unlist(all$corpus))
all$category <- as.factor(all$category)

# save the combined data frame instead of wiping the workspace:
# rm(list=ls(all=TRUE)) would discard `all` without keeping a copy, while
# the next step of the script reads "~/all.preps.txt".
# col.names=NA writes a blank header cell above the row names, which is the
# layout expected by read.table(..., header=TRUE, row.names=1, sep="\t").
# The file is only created when it does not exist yet, so an existing copy
# is never overwritten; delete it to regenerate it from the three corpora.
# NOTE(review): the next step uses the columns preposition, corpus and
# category -- confirm that the input files provide a `preposition` column.
if (!file.exists("~/all.preps.txt")) {
  write.table(all, file = "~/all.preps.txt", sep = "\t", col.names = NA,
              qmethod = "double")
}

# read the combined data frame from its tab-delimited file
# (header row present, first column used as row names)
df <- read.table(
  file = "~/all.preps.txt",
  sep = "\t",
  header = TRUE,
  row.names = 1
)

# cross-tabulate the categorical data to obtain counts
# step 1: preposition x corpus
# step 2: preposition x text category
dfc <- table(df$preposition, df$corpus)
dfc2 <- table(df$preposition, df$category)

# turn each contingency table into a data frame of counts
# (one row per preposition, one column per corpus / text category)
dfcm1 <- as.data.frame.matrix(dfc)
dfcm2 <- as.data.frame.matrix(dfc2)

# look at the first rows of the text category counts
head(dfcm2)

# put the corpus counts and the text category counts side by side
dfcm <- cbind(dfcm1, dfcm2)

# store the prepositions (i.e. the row names) in a factor column
dfcm[["preposition"]] <- factor(rownames(dfcm))

# count the number of words per preposition with the stringi package
# this is done by counting the spaces between each word and adding 1
# the package name has to be quoted in install.packages(); the package is
# only installed when it is not available yet
if (!requireNamespace("stringi", quietly = TRUE)) {
  install.packages("stringi")
}
library(stringi)

# a space is a literal character, so a fixed (non-regex) count is enough
dfcm$prep.length <- stri_count_fixed(as.character(dfcm$preposition), " ") + 1
dfcm$prep.length <- as.factor(dfcm$prep.length)

# once this is done, remove the dfcm$preposition column
dfcm$preposition <- NULL

# install FactoMineR when it is missing, then load it
# (an unconditional install.packages() would reinstall it on every run)
if (!requireNamespace("FactoMineR", quietly = TRUE)) {
  install.packages("FactoMineR")
}
library(FactoMineR)

# run CA on dfcm
# columns 4 to 18 are supplementary
# column 19 is qualitative and therefore supplementary
# do not plot the graph yet
# (graph=FALSE is spelled out because F is an ordinary, reassignable variable)
ca.object <- CA(dfcm, col.sup = 4:18, quali.sup = 19, graph = FALSE)

# draw the CA map from ca.object
# invisible="row" is passed so that only the column points are requested;
# col.col and col.col.sup set the two colours used for the columns
# the title literal continues over three source lines: those line breaks
# are part of the string, in addition to the explicit \n
plot.CA(
  ca.object,
  invisible="row",
  autoLab="yes",
  shadow=TRUE,
  cex=.8,
  col.col="magenta",
  col.col.sup="dodgerblue",
  title="Distribution of prepositions based on lexical complexity
 in three corpora:\n LOB (British English), Brown (US English),
 and Kolhapur (Indian English)",
  cex.main=.8
)