---
title: 'TRAC at 30: A bibliometric analysis of TRAC’s Identity'
author: "Nicky Garland"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document:
    toc: yes
    toc_float: yes
    self_contained: yes
abstract: "*Abstract* \n As TRAC approaches another milestone anniversary there is much to celebrate - a successful conference, a large body of publications and a thriving community. Part of this success stems from the unusually introspective and self-critical nature of the organisation. However, despite this tradition, there has been limited data-driven analysis of TRAC's successes and failures. This paper uses bibliometric data from the corpus of TRAC publications over the last 30 years to analyse whether the organisation has achieved its aims. Alongside data from comparable journals, this analysis will also determine whether TRAC is ahead or behind the wider academic world. This analysis provides insights into how diverse TRAC has become and how we might move forward in future."
keywords: TRAC, TRAJ, retrospective, bibliometrics, publication, future
---

```{r setup, echo = FALSE, message = FALSE, warning = FALSE}
# Global chunk options: hide code, messages and warnings in the rendered
# document and write knitr's own figure files to ../figures/.
knitr::opts_chunk$set(
  collapse = TRUE,
  warning = FALSE,
  message = FALSE,
  echo = FALSE,
  cache = FALSE,
  fig.path = "../figures/"
)

# The figure chunks below save JPEG copies into figs/ with ggsave(), which
# fails if the directory does not exist, so create it up front.
dir.create("figs", showWarnings = FALSE, recursive = TRUE)
```

```{r load_packages}
# plyr has to be attached BEFORE dplyr. Both packages export mutate(),
# arrange(), desc(), count(), rename() and summarise(); whichever is attached
# last wins, and the plyr versions ignore group_by() groupings.
library(plyr)
library(ggplot2)
library(dplyr)
library(bibliometrix)
library(gridExtra)
library(gt)
library(kableExtra)
library(knitr)
library(patchwork)
library(readr)
library(scales)
library(stringr)
library(quanteda)
library(RColorBrewer)
library(tidyr)
library(tidytext)
library(viridis)
```

# Supplementary Data 2 - R Code for producing plots and bibliometric analysis

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.
You can embed an R code chunk like this:

```{r traj-import-data}
# Read the TRAC publication database (its fields are described in Table 1).
trac <- read.csv("trac_database.csv")
```

## Tables

### Table 1: Heading and descriptions of fields in TRAC publication database

```{r traj-database-headings, echo=FALSE, results='asis'}
# field names used in the TRAC publication database
Headings <- c(
  "authors", "title", "journal", "year_pub", "pages", "journal_issue",
  "number_authors", "collaborative", "full_reference",
  "lead_author_institution", "institution_country", "country_topic",
  "institution_type", "theory_topic", "subject_topic", "method_topic", "type"
)

# human-readable description of each field (same order as Headings)
Description <- c(
  "Name of article author(s)", "Title of article", "Journal title",
  "Year of publication", "Page numbers of articles", "Volume number",
  "Number of authors", "Was the article multi-author?",
  "Full reference of article", "Institution name of first author of article",
  "Country of lead author institution", "Countries used as case studies",
  "Type of author institution", "Theory topics used in article",
  "Subject of article", "Methods used in articles", "Type of article"
)

trac_database <- data.frame(Headings, Description)

# plot table
kable(trac_database, caption = "TRAC Database headings") %>%
  kable_styling(latex_options = c("HOLD_position"))
```

### Table 2: Institution types of publication authors

```{r}
# count the number of articles for each institution type
trac_instit_type <- data.frame(table(trac$institution_type))

# order from most to least frequent institution type
trac_instit_final <- trac_instit_type %>%
  dplyr::arrange(dplyr::desc(Freq))

# plot table
trac_instit_final %>%
  gt() %>%
  cols_label(Var1 = "Institution type", Freq = "Article frequency")
```

### Table 3: Ten largest contributing author institutions for all TRAC publications

```{r traj-top-ten-author-institution}
# count articles per lead author institution (ddply is from plyr), order
# descending and keep the first ten rows
trac_institutions_final <- ddply(trac, .(trac$lead_author_institution), nrow) %>%
  dplyr::arrange(dplyr::desc(V1)) %>%
  slice_head(n = 10)

# rename columns
names(trac_institutions_final) <- c("author_institution", "count")

# plot table
trac_institutions_final %>%
  gt() %>%
  cols_label(author_institution = "Institution", count = "Article frequency")
```

### Table 4: Ten most frequent institutions that have hosted a TRAC Conference

```{r traj_trac_conference_locations}
# conference locations and the number of conferences hosted at each
Location <- c(
  "Newcastle-upon-Tyne, UK", "Bradford, UK", "Glasgow, UK", "Durham, UK",
  "Reading, UK", "Sheffield, UK", "Nottingham, UK", "Leicester, UK",
  "London, UK", "Glasgow, UK", "Canterbury, UK", "Birmingham, UK",
  "Cambridge, UK", "Amsterdam, Netherlands", "Ann Arbor, USA",
  "Southampton, UK", "Oxford, UK", "Frankfurt am Main, Germany",
  "Rome, Italy", "Edinburgh, UK"
)
No_conferences <- c(2, 1, 2, 4, 2, 1, 1, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1)
trac_conference <- data.frame(Location, No_conferences)

# "Glasgow, UK" is entered twice above with the same count; distinct() drops
# the repeated row so the location is not listed twice in the table.
# Then arrange by descending values and keep locations with two or more
# conferences.
trac_conference_final <- trac_conference %>%
  distinct() %>%
  dplyr::arrange(dplyr::desc(No_conferences)) %>%
  filter(No_conferences >= 2)

# plot table
trac_conference_final %>%
  gt() %>%
  cols_label(Location = "Location", No_conferences = "Frequency")
```

### Table 5: Ten most frequent case study locations utilised in TRAC publications

```{r traj_trac_casestudy_locations_top10}
# build a quanteda corpus and document-feature matrix from the case study
# country field
# NOTE(review): dfm() called directly on a corpus with `stem`/`remove`
# arguments looks like the pre-v3 quanteda interface - confirm the quanteda
# version this document is meant to be knitted with.
trac$country_topic <- as.character(trac$country_topic)
country_corpus <- corpus(trac, docid_field = "doc_id", text_field = "country_topic")
country_dfm <- dfm(
  country_corpus,
  tolower = TRUE,
  stem = FALSE,
  remove = stopwords("English"),
  remove_punct = TRUE
)

# frequency of words
features_dfm_country <- textstat_frequency(country_dfm)

# convert to dataframe
country_topic_df <- as.data.frame(features_dfm_country)

# select relevant features and top 10 entries
country_topic_final <- country_topic_df %>%
  select(feature, frequency) %>%
  slice_head(n = 10)

# plot table
country_topic_final %>%
  gt() %>%
  cols_label(feature = "Location", frequency = "Article frequency")
```

### Table 6: Ten most frequent countries represented by author institutions in TRAC publications

```{r traj-top-ten-author-institution-locations}
# count articles per author institution country, order descending and keep
# the first ten rows
trac_institution_country <- ddply(trac, .(trac$institution_country), nrow) %>%
  dplyr::arrange(dplyr::desc(V1)) %>%
  slice_head(n = 10)

# rename columns
names(trac_institution_country) <- c("author_location", "count")

# plot table
trac_institution_country %>%
  gt() %>%
  cols_label(author_location = "Location", count = "Article frequency")
```

## Figures

### Figure 1: Number and type of TRAC publications published per year

```{r traj-publications-per-year}
# calculate count for article types per year
# note ddply comes from the plyr package
trac_pubs_year <- ddply(trac, .(trac$year_pub, trac$journal), nrow)

# rename columns
names(trac_pubs_year) <- c("year", "type", "count")

# reorder the factors in the table to change display order of categories
trac_pubs_year$type <- factor(
  trac_pubs_year$type,
  levels = c(
    "Theoretical Roman Archaeology Conference Proceedings",
    "TRAC Themes in Roman Archaeology",
    "Theoretical Roman Archaeology Journal"
  )
)

# fill colour for each publication type
cols <- c(
  "Theoretical Roman Archaeology Conference Proceedings" = "#66C2A5",
  "TRAC Themes in Roman Archaeology" = "#3288BD",
  "Theoretical Roman Archaeology Journal" = "#F46D43"
)

# plot data using ggplot function
plot_fig1 <- ggplot(trac_pubs_year) +
  geom_col(aes(fill = type, x = factor(year), y = count), position = "stack") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  labs(x = "Publication Year", y = "Number of articles", fill = "Publication type") +
  scale_fill_manual(values = cols, labels = c("TRAC Proceedings", "TRAC Themes", "TRAJ"))
plot_fig1

# ggsave() is called on its own: adding it to the plot with `+` is an error
# in ggplot2 >= 3.3.4
ggsave("figs/fig1.jpeg", plot = plot_fig1, height = 5, width = 7, dpi = 300)
```

### Figure 2: Institution types of publication authors (per year)

```{r traj-institution-types-percentage-per-year}
# calculate count for institution types per year
trac_pubs_instit <- ddply(trac, .(trac$institution_type, trac$year_pub), nrow)

# rename columns
names(trac_pubs_instit) <- c("institution_type", "year", "count")

# add field representing percentage of each year's total
# (dplyr::mutate is spelled out because plyr::mutate ignores the grouping)
trac_pubs_instit2 <- group_by(trac_pubs_instit, year) %>%
  dplyr::mutate(percent = count / sum(count) * 100) %>%
  ungroup()

# expand colour scheme
nb.cols <- 14
mycolors <- colorRampPalette(brewer.pal(8, "Spectral"))(nb.cols)
# plot stacked bar graph
plot_fig2 <- ggplot(trac_pubs_instit2) +
  geom_col(
    aes(fill = institution_type, x = factor(year), y = percent),
    position = "fill"
  ) +
  scale_y_continuous(labels = scales::percent) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  labs(x = "year", y = "percentage of total", fill = "Institution type") +
  scale_fill_manual(values = mycolors)
plot_fig2

# ggsave() is called on its own: adding it to the plot with `+` is an error
# in ggplot2 >= 3.3.4
ggsave("figs/fig2.jpeg", plot = plot_fig2, height = 5, width = 7, dpi = 300)
```

### Figure 3: Percentage of single and multi-author articles within TRAC publications published per year

```{r traj-single-multiauthor-publications-per-year}
# calculate count of single/multi-author articles per year
# note ddply comes from the plyr package
trac_pubs_multi <- ddply(trac, .(trac$collaborative, trac$year_pub), nrow)

# rename columns
names(trac_pubs_multi) <- c("collaborative", "year", "count")

# change to percentage values per year
# (dplyr::mutate is spelled out because plyr::mutate ignores the grouping)
trac_multi_final <- group_by(trac_pubs_multi, year) %>%
  dplyr::mutate(percent = count / sum(count) * 100) %>%
  ungroup()

# plot data using ggplot function
plot_fig3 <- ggplot(trac_multi_final) +
  geom_col(
    aes(fill = collaborative, x = factor(year), y = percent),
    position = "fill"
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Publication Year", y = "percentage of articles", fill = "Collaborative?") +
  scale_fill_manual(values = c("#66C2A5", "#F46D43"), labels = c("No", "Yes"))
plot_fig3

ggsave("figs/fig3.jpeg", plot = plot_fig3, height = 5, width = 7, dpi = 300)
```

### Figure 4: Word frequency plot for ‘theory’ key words (inset table - five most frequent categories)

```{r traj_theory_topic_frequency}
# pass theory topic data to dfm
trac$theory_topic <- as.character(trac$theory_topic)
theory_corpus <- corpus(trac, docid_field = "doc_id", text_field = "theory_topic")
theory_dfm <- dfm(
  theory_corpus,
  tolower = TRUE,
  stem = FALSE,
  remove = stopwords("English"),
  remove_punct = TRUE
)

# frequency of words
features_dfm_theory <- textstat_frequency(theory_dfm)

# sort by reverse frequency order
features_dfm_theory$feature <- with(features_dfm_theory, reorder(feature, -frequency))

# create table - top 5 theory topics
theory_top_five <- textstat_frequency(theory_dfm, n = 5)

# extract frequency and feature
theory_final <- subset(theory_top_five, select = c("feature", "frequency"))

# plot using ggplot; the top-five table is drawn as an inset at fixed
# x/y positions on the plot
plot_fig4 <- ggplot(features_dfm_theory, aes(x = feature, y = frequency)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "") +
  annotation_custom(
    tableGrob(theory_final, theme = ttheme_default(base_size = 6), rows = NULL),
    xmin = 88, xmax = 100, ymin = 40, ymax = 45
  )
plot_fig4

ggsave("figs/fig4.jpeg", plot = plot_fig4, height = 5, width = 10, dpi = 300)
```

### Figure 5: Frequency of ‘theory’ key words used in TRAC publications per year (all terms used more than five times per year are represented)

```{r traj_theory_topic_per_year}
# get frequency grouped by year published
freq_grouped <- textstat_frequency(dfm(theory_corpus), groups = "year_pub")

# subset data - keep all entries with 5 or more instances in a year
# NOTE(review): the figure title says "more than five times" but the filter
# is >= 5 - confirm which is intended.
freq_grouped_final <- subset(freq_grouped, frequency >= 5)

# expand colour scheme
nb.cols1 <- 11
mycolors1 <- colorRampPalette(brewer.pal(8, "Spectral"))(nb.cols1)

# plot theory usages per year
plot_fig5 <- ggplot(freq_grouped_final, aes(x = group, y = frequency, fill = feature)) +
  geom_bar(
    stat = "identity",
    position = position_dodge2(width = 0.9, preserve = "single")
  ) +
  scale_y_continuous(breaks = 0:12) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  labs(x = "year", y = "count", fill = "Theory") +
  scale_fill_manual(values = mycolors1)
plot_fig5

ggsave("figs/fig5.jpeg", plot = plot_fig5, height = 5, width = 7, dpi = 300)
```

### Figure 6: Word frequency plot for ‘method’ key words (inset table - five most frequent categories)

```{r traj_method_topic_frequency}
# pass method topic data to dfm
trac$method_topic <- as.character(trac$method_topic)
method_corpus <- corpus(trac, docid_field = "doc_id", text_field = "method_topic")
method_dfm <- dfm(
  method_corpus,
  tolower = TRUE,
  stem = FALSE,
  remove = stopwords("English"),
  remove_punct = TRUE
)
# determine frequency of words
features_dfm_method <- textstat_frequency(method_dfm)

# sort by reverse frequency order
features_dfm_method$feature <- with(features_dfm_method, reorder(feature, -frequency))

# create table - top 5 method topics
method_top_five <- textstat_frequency(method_dfm, n = 5)

# extract frequency and feature
method_final <- subset(method_top_five, select = c("feature", "frequency"))

# plot frequency chart; the top-five table is drawn as an inset at fixed
# x/y positions on the plot
plot_fig6 <- ggplot(features_dfm_method, aes(x = feature, y = frequency)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "") +
  annotation_custom(
    tableGrob(method_final, theme = ttheme_default(base_size = 6), rows = NULL),
    xmin = 26, xmax = 28, ymin = 24, ymax = 26
  )
plot_fig6

# ggsave() is called on its own: adding it to the plot with `+` is an error
# in ggplot2 >= 3.3.4
ggsave("figs/fig6.jpeg", plot = plot_fig6, height = 5, width = 7, dpi = 300)
```

### Figure 7: Frequency of ‘method’ key words used in publications per year (all terms used more than twice per year are represented)

```{r traj_method_topic_frequency_per_year}
# get frequency grouped by year published
freq_grouped_method <- textstat_frequency(dfm(method_corpus), groups = "year_pub")

# subset data - keep all entries with 2 or more instances in a year
# NOTE(review): the figure title says "more than twice" but the filter is
# >= 2 - confirm which is intended.
freq_method_final <- subset(freq_grouped_method, frequency >= 2)

# expand colour scheme
nb.cols2 <- 9
mycolors2 <- colorRampPalette(brewer.pal(8, "Spectral"))(nb.cols2)

# plot method usages per year
plot_fig7 <- ggplot(freq_method_final, aes(x = group, y = frequency, fill = feature)) +
  geom_bar(
    stat = "identity",
    position = position_dodge2(width = 0.9, preserve = "single")
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  labs(x = "year", y = "count", fill = "Method") +
  scale_fill_manual(values = mycolors2)
plot_fig7

ggsave("figs/fig7.jpeg", plot = plot_fig7, height = 5, width = 7, dpi = 300)
```

### Figure 8: Word frequency plot for ‘subject’ key words (inset table - five most frequent categories)

```{r traj-subject-topics-top-ten, results='asis'}
# pass subject topic data to dfm
trac$subject_topic <- as.character(trac$subject_topic)
subject_corpus <- corpus(trac, docid_field = "doc_id", text_field = "subject_topic")
subject_dfm <- dfm(
  subject_corpus,
  tolower = TRUE,
  stem = FALSE,
  remove = stopwords("English"),
  remove_punct = TRUE
)

# frequency of words
features_dfm_subject <- textstat_frequency(subject_dfm)

# sort by reverse frequency order
features_dfm_subject$feature <- with(features_dfm_subject, reorder(feature, -frequency))

# create table - top 5 subject topics
subject_top_five <- textstat_frequency(subject_dfm, n = 5)

# extract frequency and feature
subject_final <- subset(subject_top_five, select = c("feature", "frequency"))

# plot using ggplot; the top-five table is drawn as an inset at fixed
# x/y positions on the plot
plot_fig8 <- ggplot(features_dfm_subject, aes(x = feature, y = frequency)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "") +
  annotation_custom(
    tableGrob(subject_final, theme = ttheme_default(base_size = 6), rows = NULL),
    xmin = 76, xmax = 80, ymin = 42, ymax = 44
  )
plot_fig8

# ggsave() is called on its own: adding it to the plot with `+` is an error
# in ggplot2 >= 3.3.4
ggsave("figs/fig8.jpeg", plot = plot_fig8, height = 5, width = 10, dpi = 300)
```

### Figure 9: Frequency of ‘subject’ key words used in publications per year (all terms used more than three times per year are represented)

```{r traj_subject_topic_frequency_per_year}
# get frequency grouped by year published
freq_grouped_subject <- textstat_frequency(dfm(subject_corpus), groups = "year_pub")

# subset data - keep all entries with 3 or more instances in a year
# NOTE(review): the figure title says "more than three times" but the filter
# is >= 3 - confirm which is intended.
freq_subject_final <- subset(freq_grouped_subject, frequency >= 3)

# expand colour scheme
nb.cols3 <- 14
mycolors3 <- colorRampPalette(brewer.pal(8, "Spectral"))(nb.cols3)

# plot subject usages per year
plot_fig9 <- ggplot(freq_subject_final, aes(x = group, y = frequency, fill = feature)) +
  geom_bar(
    stat = "identity",
    position = position_dodge2(width = 0.9, preserve = "single")
  ) +
  scale_y_continuous(breaks = 0:9) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  labs(x = "year", y = "count", fill = "Subject") +
  scale_fill_manual(values = mycolors3)
plot_fig9

ggsave("figs/fig9.jpeg", plot = plot_fig9, height = 5, width = 7, dpi = 300)
```

### Figure 10: Density map article case study location

```{r traj-case-study-location-map}
# subset original database to include only country_topic and year published
country_year <- trac %>%
  select(year_pub, country_topic)

# convert to character field
country_year$country_topic <- as.character(country_year$country_topic)

# create corpus using quanteda
country_corpus_year <- corpus(
  country_year,
  docid_field = "doc_id",
  text_field = "country_topic"
)

# create dfm using quanteda - grouped by year published
country_dfm_year <- dfm(
  country_corpus_year,
  tolower = TRUE,
  stem = FALSE,
  remove = stopwords("English"),
  remove_punct = TRUE,
  groups = "year_pub"
)

# calculate frequency of words used - grouped by year published
country_year_final <- textstat_frequency(country_dfm_year, groups = "year_pub")

# import world map
map.world <- map_data("world")

# convert to dataframe
country_study_a <- as.data.frame(country_year_final)

# number of rows of the per-year frequency table in which each country occurs
country_study <- country_study_a %>%
  dplyr::count(feature)

# recode names to match map
country_study$feature <- recode(
  country_study$feature,
  "britain" = "UK", "algeria" = "Algeria", "austria" = "Austria",
  "belgium" = "Belgium", "croatia" = "Croatia", "cyprus" = "Cyprus",
  "denmark" = "Denmark", "egypt" = "Egypt", "ethiopia" = "Ethiopia",
  "france" = "France", "germany" = "Germany", "greece" = "Greece",
  "hungary" = "Hungary", "iraq" = "Iraq", "ireland" = "Ireland",
  "israel" = "Israel", "italy" = "Italy", "jordan" = "Jordan",
  "libya" = "Libya", "macedonia" = "Macedonia", "morocco" = "Morocco",
  "netherlands" = "Netherlands", "norway" = "Norway",
  "portugal" = "Portugal", "romania" = "Romania", "spain" = "Spain",
  "sweden" = "Sweden", "switzerland" = "Switzerland", "syria" = "Syria",
  "tunisia" = "Tunisia", "turkey" = "Turkey", "usa" = "USA"
)

# rename columns
colnames(country_study) <- c("country", "count")

# left join between world map and country counts
map.world_joined2 <- left_join(map.world, country_study, by = c("region" = "country"))

# flag map regions that have a case study count; the test has to be on the
# joined `count` column (a `rank` column does not exist, so is.na(rank) was
# evaluated on the base function rank() instead of on the data)
map.world_joined3 <- map.world_joined2 %>%
  dplyr::mutate(fill_flg = !is.na(count))

# create plain theme for mapping
plain <- theme(
  axis.text = element_blank(),
  axis.line = element_blank(),
  axis.ticks = element_blank(),
  panel.border = element_blank(),
  panel.grid = element_blank(),
  axis.title = element_blank(),
  panel.background = element_rect(fill = "white"),
  plot.title = element_text(hjust = 0.5)
)

# plot map
plot_fig10 <- ggplot(
  map.world_joined3,
  mapping = aes(long, lat, group = group, fill = count)
) +
  geom_polygon() +
  coord_sf(xlim = c(-20, 55), ylim = c(10, 73)) +
  scale_fill_distiller(palette = "Blues", direction = 1, na.value = "gray90") +
  theme(legend.position = "bottom") +
  guides(fill = guide_colorbar(
    direction = "horizontal",
    barheight = unit(2, units = "mm"), # height of bar
    barwidth = unit(7.5, units = "cm"), # width of bar
    draw.ulim = FALSE,
    title.position = "top", # position of title on bar
    label.position = "bottom",
    title.hjust = 0.5, # adjusts position of title to centre
    label.hjust = 0.5
  )) +
  plain
plot_fig10

ggsave("figs/fig10.jpeg", plot = plot_fig10, height = 5, width = 7, dpi = 300)
```

### Figure 11: Location of case study locations for TRAC articles per year

```{r traj_case_study_location_per_year}
# recode countries to regional groupings
# NOTE(review): germany, switzerland, austria, netherlands, belgium and
# ireland are all mapped to 'Southern Europe', while only france is mapped to
# 'Western Europe'. This looks like a copy-paste slip - confirm the intended
# region for these six countries before reusing the figure.
country_year_final$feature <- recode(
  country_year_final$feature,
  "britain" = "UK",
  "romania" = "Central Europe", "croatia" = "Central Europe",
  "hungary" = "Central Europe", "macedonia" = "Central Europe",
  "ethiopia" = "East Africa",
  "jordan" = "Middle East", "syria" = "Middle East",
  "israel" = "Middle East", "iraq" = "Middle East",
  "libya" = "North Africa", "algeria" = "North Africa",
  "morocco" = "North Africa", "egypt" = "North Africa",
  "tunisia" = "North Africa",
  "usa" = "North America",
  "denmark" = "Northern Europe", "norway" = "Northern Europe",
  "sweden" = "Northern Europe",
  "greece" = "Southern Europe", "italy" = "Southern Europe",
  "turkey" = "Southern Europe", "spain" = "Southern Europe",
  "portugal" = "Southern Europe", "cyprus" = "Southern Europe",
  "france" = "Western Europe",
  "germany" = "Southern Europe", "switzerland" = "Southern Europe",
  "austria" = "Southern Europe", "netherlands" = "Southern Europe",
  "belgium" = "Southern Europe", "ireland" = "Southern Europe"
)

# display order of the regions in the legend and stacked bars
region_levels <- c(
  "North Africa", "East Africa", "Middle East", "North America",
  "Central Europe", "Western Europe", "Northern Europe", "Southern Europe",
  "UK"
)

# plot stacked bar graph
plot_fig11 <- ggplot(country_year_final) +
  geom_col(
    aes(fill = factor(feature, levels = region_levels), x = group, y = frequency),
    position = "fill"
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "year", y = "percentage of total", fill = "Case Study location") +
  scale_fill_brewer(palette = "Spectral")
plot_fig11

ggsave("figs/fig11.jpeg", plot = plot_fig11, height = 5, width = 7, dpi = 300)
```

### Figure 12: Author institution location per year (UK and rest of World)

```{r traj_author_location_uk/nonuk_per_year}
fig15 <- subset(trac, select = c("institution_country", "year_pub"))

# create new column of UK and non-UK data: institutions whose country is
# England, Scotland or Wales are labelled "UK", everything else "Non-UK"
fig15a <- fig15 %>%
  dplyr::mutate(
    instit_country2 = c("Non-UK", "UK")[
      (institution_country %in% c("England", "Scotland", "Wales")) + 1
    ]
  )

# calculate counts of UK and non-UK per year
instit_country <- ddply(fig15a, .(fig15a$instit_country2, fig15a$year_pub), nrow)

# rename columns
names(instit_country) <- c("country", "year", "count")

# change to percentage values per year
# (dplyr::mutate is spelled out because plyr::mutate ignores the grouping)
instit_country_final <- group_by(instit_country, year) %>%
  dplyr::mutate(percent = count / sum(count) * 100) %>%
  ungroup()

# plot data using ggplot function
plot_fig12 <- ggplot(instit_country_final) +
  geom_col(aes(fill = country, x = factor(year), y = percent), position = "fill") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Publication Year", y = "percentage of articles", fill = "Author Institution?") +
  scale_fill_manual(values = c("#66C2A5", "#F46D43"))
plot_fig12

ggsave("figs/fig12.jpeg", plot = plot_fig12, height = 5, width = 7, dpi = 300)
```

## Bibliometric Analysis

### Figure 13: Britannia journal summary plot (a - word frequency from article titles, b - institution country top ten, c - author institution top ten, d - single vs multi-author over time)

```{r traj_britannia_summary_plots, echo=FALSE}
# import bibtex data from Britannia
file <- "britannia.bib"

# convert to df
brit_df <- convert2df(file, dbsource = "wos", format = "bibtex")

# BiblioAnalysis
brit_analysis <- biblioAnalysis(brit_df)

# KEYWORDS (the DE element of the biblioAnalysis result)
brit_analysis_v2 <- as.data.frame(brit_analysis$DE)

# pass keywords into corpus and dfm
brit_analysis_v2$Tab <- as.character(brit_analysis_v2$Tab)
brit_corpus <- corpus(brit_analysis_v2, docid_field = "doc_id", text_field = "Tab")
brit_dfm <- dfm(
  brit_corpus,
  tolower = TRUE,
  stem = FALSE,
  remove = stopwords("English"),
  remove_punct = TRUE
)

# calculate frequency of words
brit_keywords_freq <- textstat_frequency(brit_dfm, n = 50)

# sort by reverse frequency order
brit_keywords_freq$feature <- with(brit_keywords_freq, reorder(feature, -frequency))

# plot using ggplot
p1 <- ggplot(brit_keywords_freq, aes(x = feature, y = frequency)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "", y = "") +
  ggtitle("A")

# AUTHOR COUNTRIES (the CO element of the biblioAnalysis result)
brit_country <- as.data.frame(brit_analysis$CO)

# change field to character
brit_country$`brit_analysis$CO` <- as.character(brit_country$`brit_analysis$CO`)

# count per country, order descending and keep the ten most frequent
brit_country_final <- brit_country %>%
  dplyr::count(`brit_analysis$CO`) %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  top_n(10) %>%
  dplyr::rename(country = `brit_analysis$CO`, count = n)

# change order of factors to numeric order
brit_country_final$country <- factor(
  brit_country_final$country,
  levels = brit_country_final$country[order(brit_country_final$count)]
)

# remove na values
brit_country_final2 <- na.omit(brit_country_final)

# plot
p2 <- ggplot(brit_country_final2, aes(x = country, y = count)) +
  geom_bar(stat = "identity", fill = "#66C2A5") +
  theme(axis.text.x = element_text(size = 8), axis.text.y = element_text(size = 8)) +
  labs(x = "", y = "") +
  coord_flip() +
  ggtitle("B")

# AUTHOR AFFILIATIONS
# extract relevant data
brit_affil <- subset(brit_df, select = c("AU_UN", "AU"))

# split the ";"-separated affiliation field into one column per affiliation
library(splitstackshape)
brit_affil_all <- cSplit(brit_affil, "AU_UN", sep = ";", type.convert = FALSE)

# combine the first four affiliation columns together in one long list
brit1 <- subset(brit_affil_all, select = c("AU_UN_1", "AU"))
brit2 <- subset(brit_affil_all, select = c("AU_UN_2", "AU"))
brit3 <- subset(brit_affil_all, select = c("AU_UN_3", "AU"))
brit4 <- subset(brit_affil_all, select = c("AU_UN_4", "AU"))
brit5 <- rbind(brit1, brit2, brit3, brit4, use.names = FALSE)

# remove na values
brit6 <- na.omit(brit5)

# remove all refs to 'corresponding author'
# (a logical filter is used because x[-grep(...), ] returns zero rows when
# the pattern matches nothing)
brit7 <- brit6[!grepl("CORRESPONDING AUTHOR", brit6$AU_UN_1), ]

# top 10 affiliations and rename columns
brit_affil_final <- brit7 %>%
  dplyr::count(brit7$AU_UN_1) %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  top_n(10) %>%
  dplyr::rename(affiliation = "brit7$AU_UN_1", count = n)

# change order of factors to numeric order
brit_affil_final$affiliation <- factor(
  brit_affil_final$affiliation,
  levels = brit_affil_final$affiliation[order(brit_affil_final$count)]
)

# plot
p3 <- ggplot(brit_affil_final, aes(x = affiliation, y = count)) +
  geom_bar(stat = "identity", fill = "#F46D43") +
  theme(axis.text.x = element_text(size = 8), axis.text.y = element_text(size = 8)) +
  labs(x = "", y = "") +
  coord_flip() +
  ggtitle("C")

# SINGLE VS MULTI AUTHORS PER YEAR
# count numbers of authors in each field (authors are separated by ";")
brit_df$count <- str_count(brit_df$AU, ";") + 1

# number of articles per author count and year of publication
# plyr is detached again afterwards so that its verbs do not mask dplyr's in
# the code that follows
library(plyr)
brit_df_author <- ddply(brit_df, .(brit_df$count, brit_df$PY), nrow)
detach(package:plyr)

# flag each row as single author (FALSE) or more than one author (TRUE)
# (mutate replaces a summarise() call that returned several rows per group,
# which is deprecated from dplyr 1.1.0)
brit_author_final <- brit_df_author %>%
  drop_na() %>%
  dplyr::rename(author_no = 1, year = 2, no = 3) %>%
  dplyr::mutate(collaborative = author_no > 1)

# plot
p4 <- ggplot(brit_author_final) +
  geom_col(aes(fill = collaborative, x = factor(year), y = no), position = "fill") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5),
    axis.text.y = element_text(size = 8)
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "", fill = "Collaborative?") +
  scale_fill_manual(values = c("#66C2A5", "#F46D43"), labels = c("No", "Yes")) +
  ggtitle("D")

# combine the four panels with patchwork and save the combined plot
plot_fig13 <- p1 / (p2 | p3 | p4)
plot_fig13

ggsave("figs/fig13.jpeg", plot = plot_fig13, height = 7, width = 10, dpi = 300)
```

### Figure 14: Frequency comparison plot of words used in article title – TRAC and Britannia (Note: words located above the red line are found more frequently in Britannia, compared to words located below that are more frequently found in TRAC publications)

```{r traj_britannia_trac_comparison_plot}
# change trac database to correct format
trac_journal <- trac %>%
  dplyr::rename(author = authors) %>%
  select(author, title, year_pub, journal)

# change year published to character field
trac_journal$year_pub <- as.character(trac_journal$year_pub)

# import britannia journal data
# export from Web of Science as 'Tab-delimited (Win UTF-8)
# NOTE(review): the file used to be read twice, first with quote = "" and
# then again with the default quoting; the first result was overwritten
# immediately, so only the effective (second) read is kept. Confirm that the
# default quoting is the intended one.
britannia <- read_tsv("britannia.txt", col_types = cols(.default = col_character()))

# drop the first two rows of the import
britannia <- britannia[-(1:2), ]

# tidy tibble to select and rename relevant fields
britannia_journal <- britannia %>%
  dplyr::rename(author = AU, title = TI, year_pub = PY) %>%
  select(author, title, year_pub)

# combine TRAC with Britannia journal dataset
roman_journals <- bind_rows(
  trac_journal %>% dplyr::mutate(journal = "TRAC"),
  britannia_journal %>% dplyr::mutate(journal = "Britannia")
)

# split titles into single words and remove 'stop words'
tidy_journal1 <- roman_journals %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word")

# total number of title words per journal
journal_totals <- tidy_journal1 %>%
  group_by(journal) %>%
  dplyr::summarise(total = dplyr::n())

# create word frequency count and relative frequency within each journal
word_frequency <- tidy_journal1 %>%
  group_by(journal) %>%
  dplyr::count(word, sort = TRUE) %>%
  left_join(journal_totals, by = "journal") %>%
  dplyr::mutate(freq = n / total)

# one row per word with a frequency column for each journal
word_frequency1 <- word_frequency %>%
  select(journal, word, freq) %>%
  spread(journal, freq) %>%
  dplyr::arrange(TRAC, Britannia)

# plot comparison
plot_fig14 <- ggplot(word_frequency1, aes(TRAC, Britannia)) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.25, height = 0.25) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  geom_abline(color = "red")
plot_fig14

# ggsave() is called on its own: adding it to the plot with `+` is an error
# in ggplot2 >= 3.3.4
ggsave("figs/fig14.jpeg", plot = plot_fig14, height = 5, width = 7, dpi = 300)
```

### Figure 15: Journal of Social Archaeology summary plot (a - word frequency from article titles, b - institution country top ten, c - author institution top ten, d - single vs multi-author over time)

```{r traj_jsa_summary_plots, echo=FALSE}
# import bibtex data from Journal of Social Archaeology
file <- "jour_social.bib"

# convert to df
jsa_df <- convert2df(file, dbsource = "wos", format = "bibtex")

# BiblioAnalysis
jsa_analysis <- biblioAnalysis(jsa_df)

# KEYWORDS (the DE element of the biblioAnalysis result)
jsa_analysis_v2 <- as.data.frame(jsa_analysis$DE)

# pass keywords into corpus and dfm
jsa_analysis_v2$Tab <- as.character(jsa_analysis_v2$Tab)
jsa_corpus <- corpus(jsa_analysis_v2, docid_field = "doc_id", text_field = "Tab")
jsa_dfm <- dfm(
  jsa_corpus,
  tolower = TRUE,
  stem = FALSE,
  remove = stopwords("English"),
  remove_punct = TRUE
)

# calculate frequency of words
jsa_keywords_freq <- textstat_frequency(jsa_dfm, n = 50)

# sort by reverse frequency order
jsa_keywords_freq$feature <- with(jsa_keywords_freq, reorder(feature, -frequency))

# plot using ggplot
p5 <- ggplot(jsa_keywords_freq, aes(x = feature, y = frequency)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "subject keyword") +
  ggtitle("A")

# AUTHOR COUNTRIES (the CO element of the biblioAnalysis result)
jsa_country <- as.data.frame(jsa_analysis$CO)

# change field to character
jsa_country$`jsa_analysis$CO` <- as.character(jsa_country$`jsa_analysis$CO`)

# count per country, order descending and keep the ten most frequent
jsa_country_final <- jsa_country %>%
  dplyr::count(`jsa_analysis$CO`) %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  top_n(10) %>%
  dplyr::rename(country = `jsa_analysis$CO`, count = n)

# change order of factors to numeric order
jsa_country_final$country <- factor(
  jsa_country_final$country,
  levels = jsa_country_final$country[order(jsa_country_final$count)]
)

# remove na values
jsa_country_final2 <- na.omit(jsa_country_final)

# plot
p6 <- ggplot(jsa_country_final2, aes(x = country, y = count)) +
  geom_bar(stat = "identity", fill = "#66C2A5") +
  theme(axis.text.x = element_text(size = 8), axis.text.y = element_text(size = 8)) +
  labs(x = "", y = "count") +
  coord_flip() +
  ggtitle("B")

# AUTHOR AFFILIATIONS (the FirstAffiliation element of the biblioAnalysis result)
jsa_affil <- as.data.frame(jsa_analysis$FirstAffiliation)

# change field to character
jsa_affil$`jsa_analysis$FirstAffiliation` <-
  as.character(jsa_affil$`jsa_analysis$FirstAffiliation`)

# top 10 affiliations and rename columns
jsa_affil_final <- jsa_affil %>%
  dplyr::count(jsa_analysis$FirstAffiliation) %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  top_n(10) %>%
  dplyr::rename(affiliation = "jsa_analysis$FirstAffiliation", count = n)

# change order of factors to numeric order
jsa_affil_final$affiliation <- factor(
  jsa_affil_final$affiliation,
  levels = jsa_affil_final$affiliation[order(jsa_affil_final$count)]
)

# remove na values
jsa_affil_final <- na.omit(jsa_affil_final)

# plot
p7 <- ggplot(jsa_affil_final, aes(x = affiliation, y = count)) +
  geom_bar(stat = "identity", fill = "#F46D43") +
  theme(axis.text.x = element_text(size = 8), axis.text.y = element_text(size = 8)) +
  labs(x = "", y = "count") +
  coord_flip() +
  ggtitle("C")

# SINGLE VS MULTI AUTHORS PER YEAR
# count numbers of authors in each field (authors are separated by ";")
jsa_df$count <- str_count(jsa_df$AU, ";") + 1

# number of articles per author count and year of publication
# plyr is detached again afterwards so that its verbs do not mask dplyr's in
# the code that follows
library(plyr)
jsa_df_author <- ddply(jsa_df, .(jsa_df$count, jsa_df$PY), nrow)
detach(package:plyr)

# flag each row as single author (FALSE) or more than one author (TRUE)
# (mutate replaces a summarise() call that returned several rows per group,
# which is deprecated from dplyr 1.1.0)
jsa_author_final <- jsa_df_author %>%
  drop_na() %>%
  dplyr::rename(author_no = 1, year = 2, no = 3) %>%
  dplyr::mutate(collaborative = author_no > 1)

# plot
p8 <- ggplot(jsa_author_final) +
  geom_col(aes(fill = collaborative, x = factor(year), y = no), position = "fill") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, size = 8),
    axis.text.y = element_text(size = 8)
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "", y = "percentage of articles", fill = "Collaborative?") +
  scale_fill_manual(values = c("#66C2A5", "#F46D43"), labels = c("No", "Yes")) +
  ggtitle("D")

# combine the four panels with patchwork and save the combined plot
plot_fig15 <- p5 / (p6 | p7 | p8)
plot_fig15

ggsave("figs/fig15.jpeg", plot = plot_fig15, height = 7, width = 10, dpi = 300)
```

### Figure 16: Frequency comparison plot of words used in article title – TRAC and Journal of Social Archaeology

```{r traj_jsa_trac_comparison_plot}
# import JSA journal data
jsa <- read_tsv("jour_social.txt", col_types = cols(.default = col_character()))

# tidy tibble to select and rename relevant fields
jsa_journal <- jsa %>%
  dplyr::rename(author = AU, title = TI, year_pub = PY) %>%
  select(author, title, year_pub)

# combine TRAC (trac_journal, built in the Figure 14 chunk) with the JSA
# journal dataset
theory_journals <- bind_rows(
  trac_journal %>% dplyr::mutate(journal = "TRAC"),
  jsa_journal %>% dplyr::mutate(journal = "JSA")
)

# split titles into single words and remove 'stop words'
tidy_journal_theory <- theory_journals %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word")

# total number of title words per journal
theory_totals <- tidy_journal_theory %>%
  group_by(journal) %>%
  dplyr::summarise(total = dplyr::n())

# create word frequency count and relative frequency within each journal
# (dplyr:: is spelled out so the result does not depend on whether plyr is
# attached at this point)
word_frequency_theory1 <- tidy_journal_theory %>%
  group_by(journal) %>%
  dplyr::count(word, sort = TRUE) %>%
  left_join(theory_totals, by = "journal") %>%
  dplyr::mutate(freq = n / total)

# one row per word with a frequency column for each journal
word_frequency_theory2 <- word_frequency_theory1 %>%
  select(journal, word, freq) %>%
  spread(journal, freq) %>%
  dplyr::arrange(TRAC, JSA)

# plot comparison
plot_fig16 <- ggplot(word_frequency_theory2, aes(TRAC, JSA)) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.25, height = 0.25) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  geom_abline(color = "red")
plot_fig16

ggsave("figs/fig16.jpeg", plot = plot_fig16, height = 5, width = 7, dpi = 300)
```