diff --git a/README.txt b/README.txt index e69de29..75213d9 100644 --- a/README.txt +++ b/README.txt @@ -0,0 +1,34 @@ +Um das Programm auszuführen, müssen zunächst die darin enthaltenen Pakete der +Julia-Umgebung hinzugefügt werden. Diese Pakete sind: + + - https://github.com/JuliaData/YAML.jl + - https://github.com/quinnj/Dates.jl + - https://github.com/JuliaData/DataFrames.jl + - https://github.com/JuliaData/CSV.jl + - https://github.com/JuliaPlots/Plots.jl + - https://github.com/JuliaLang/Statistics.jl + - https://github.com/JuliaStats/StatsBase.jl + - https://github.com/JuliaPlots/StatsPlots.jl + +Mittels der Datei readFile.jl kann das Programm gestartet werden. +Der Pfad der beiliegenden Datei settings.yaml muss allerdings angepasst werden. +In der Datei settings.yaml können die Einstellungen getroffen und +der Speicherort aller Output-Dateien angepasst werden. +Auch müssen in der Datei settings.yaml die Speicherorte der folgenden Dateien +aus dem Datensatz der Deutschen Bahn angepasst werden: + + - Bahnhofsdaten.csv + - 20170901-20171019_Alle_Sollereignisse_S-Bahn_Stuttgart.csv + - 20170901-20171019_Alle_Istmeldungen_S-Bahn_Stuttgart.csv + +Einstellungen für das Plotten können in der Datei plotting.jl getroffen werden. + +Das "@time" vor einigen Funktionen kann ohne Auswirkungen entfernt werden. Es +dient allein der Zeitkontrolle. + +Bei folgender Fehlermeldung + +LoadError: MethodError: no method matching _show(::IOStream, + ::MIME{Symbol("application/pdf")} + +das Programm einfach erneut starten. 
diff --git a/betweenRegistrationPoints.jl b/betweenRegistrationPoints.jl new file mode 100644 index 0000000..699d9de --- /dev/null +++ b/betweenRegistrationPoints.jl @@ -0,0 +1,240 @@ +# approach 2: deviation is analyzed for time between the registration points +# time between two stations and time between arrival and departure + +module betweenRegistrationPoints + +include("./output.jl") + +using Statistics, CSV, Dates, DataFrames +using .output + +export getDifferences + +function getDifferences(modPerfData, settings) + df1, perfDataDirection1, df2, perfDataDirection2 = + prepareData(modPerfData, settings) + plotData1 = calculateDeviation(df1, perfDataDirection1, settings) + plotData2 = calculateDeviation(df2, perfDataDirection2, settings) + return plotData1, plotData2 +end + +""" +Function is preparing the data if necesary and calling another function +to prepare DataFrames to save the results of the following analyses. +""" + +function prepareData(modPerfData, settings) + + # determine current line and its station list + if settings.objectInFocus == "single line" + lineNr = 1 + else + lineNr = findall(x -> x == settings.analyzedLine, settings.allLines) + lineNr = lineNr[1] + end + + stationList1 = settings.stationLists[lineNr] + stationList2 = reverse(stationList1) + + df1 = createDataFrame(stationList1, settings) + df2 = createDataFrame(stationList2, settings) + + select!(modPerfData, Not(:rownumber)) + modPerfData = + filter(row -> row[:ZUGEREIGNIS_DS100] in stationList1, modPerfData) + modPerfData[!, :rownumber] = axes(modPerfData, 1) + df = modPerfData[modPerfData[:, :rownumber].!=0, :] + + # the station data of TSS has to be changed (somtimes wrong order) + if stationList1[1] == "TSS" || stationList2[1] == "TSS" + for row in eachrow(modPerfData) + if row.rownumber > 1 && row.ZUGEREIGNIS_DS100 == "TSS" + if modPerfData.ZUGEREIGNIS_DS100[row.rownumber-1] == "TSS" && ( + modPerfData.ZUGEREIGNIS_TYP[row.rownumber-1] == 10 || + 
modPerfData.ZUGEREIGNIS_TYP[row.rownumber-1] == 50 + ) + row1 = modPerfData[row.rownumber-1, :] + row2 = modPerfData[row.rownumber, :] + df[row.rownumber, :] = row1 + df[row.rownumber-1, :] = row2 + end + end + end + end + + perfDataDirection1 = df[df[:, :ZUGEREIGNIS_RICHTUNG].==stationList2[1], :] + perfDataDirection2 = df[df[:, :ZUGEREIGNIS_RICHTUNG].==stationList1[1], :] + + return df1, perfDataDirection1, df2, perfDataDirection2 +end + +function createDataFrame(stationList, settings) + + df = DataFrame() + point1 = Any[] + point2 = Any[] + counter = 0 + + if settings.objectInFocus != "Stuttgarter Stammstrecke" + if stationList[1] == "TSS" + pushfirst!(stationList, stationList[1]) + counter = convert(Int, (length(stationList) - 1) / 2) + sequence = repeat(["station", "section"], counter) + elseif stationList[size(stationList, 1)] == "TSS" + push!(stationList, stationList[size(stationList, 1)]) + counter = convert(Int, (length(stationList) - 1) / 2) + sequence = repeat(["section", "station"], counter) + else + counter = convert(Int, (length(stationList) - 1) / 2 + 0.5) + sequence = repeat(["section", "station"], counter) + pop!(sequence) + end + else + push!(stationList, stationList[length(stationList)]) + pushfirst!(stationList, stationList[1]) + counter = convert(Int, (length(stationList) - 1) / 2 + 0.5) + sequence = repeat(["station", "section"], counter) + pop!(sequence) + end + + #create station list for differences + for i = 2:length(stationList) + push!(point1, stationList[i-1]) + push!(point2, stationList[i]) + end + + + + df[!, :point1] = point1 + df[!, :point2] = point2 + df[!, :sequence] = sequence + + + return df +end + +""" +Function is calculating the deviation for each section and stop. Selected +quantiles are being created. 
+""" + +function calculateDeviation(df1, perfData, settings) + + deviationArray = Any[] + perfData[!, :row] = axes(perfData, 1) + for row in eachrow(df1) + deviationSequence = Any[] + for rowData in eachrow(perfData) + if rowData.row != 1 && + perfData.ZUGEREIGNIS_DS100[rowData.row-1] == row.point1 && + rowData.ZUGEREIGNIS_DS100 == row.point2 + if ( + rowData.ZUGEREIGNIS_TYP == 10 || + rowData.ZUGEREIGNIS_TYP == 40 + ) && ( + perfData.ZUGEREIGNIS_TYP[rowData.row-1] == 20 || + perfData.ZUGEREIGNIS_TYP[rowData.row-1] == 50 + ) + actual = Second( + convert( + Dates.Second, + Dates.DateTime( + perfData.ZUGEREIGNIS_ISTZEIT[rowData.row], + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + perfData.ZUGEREIGNIS_ISTZEIT[rowData.row-1], + "dd.mm.yyyy HH:MM", + ), + ), + ) + estimated = Second( + convert( + Dates.Second, + Dates.DateTime( + perfData.ZUGEREIGNIS_SOLLZEIT[rowData.row], + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + perfData.ZUGEREIGNIS_SOLLZEIT[rowData.row-1], + "dd.mm.yyyy HH:MM", + ), + ), + ) + deviation = Dates.value(actual - estimated) + push!(deviationSequence, deviation) + elseif ( + rowData.ZUGEREIGNIS_TYP == 20 || + rowData.ZUGEREIGNIS_TYP == 50 + ) && ( + perfData.ZUGEREIGNIS_TYP[rowData.row-1] == 10 || + perfData.ZUGEREIGNIS_TYP[rowData.row-1] == 40 + ) + actual = Second( + convert( + Dates.Second, + Dates.DateTime( + perfData.ZUGEREIGNIS_ISTZEIT[rowData.row], + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + perfData.ZUGEREIGNIS_ISTZEIT[rowData.row-1], + "dd.mm.yyyy HH:MM", + ), + ), + ) + estimated = Second( + convert( + Dates.Second, + Dates.DateTime( + perfData.ZUGEREIGNIS_SOLLZEIT[rowData.row], + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + perfData.ZUGEREIGNIS_SOLLZEIT[rowData.row-1], + "dd.mm.yyyy HH:MM", + ), + ), + ) + deviation = Dates.value(actual - estimated) + push!(deviationSequence, deviation) + end + end + end + if length(deviationSequence) == 0 + deviationSequence = [10] + row.sequence = "noData" + end + push!(deviationArray, deviationSequence) 
+ end + quantile = "" + quantile = settings.quantile[1] + quantArray = Any[] + average = Any[] + meridian = Float64[] + for row in deviationArray + x = quantile!(row, parse(Float64, quantile) / 100) + y = mean(row) + z = median(row) + push!(quantArray, x) + push!(average, y) + push!(meridian, z) + end + nameColumn = "deviation_" * quantile + df1[!, nameColumn] = quantArray + df1[!, :averageDeviation] = average + df1[!, :median] = meridian + + points = String[] + for row in eachrow(df1) + if row.point1 != row.point2 + push!(points, row.point1 * "-" * row.point2) + else + push!(points, row.point1) + end + end + select!(df1, Not(:point1)) + df1[!, :points] = points + df1 = df1[df1[:, :sequence].!="noData", :] + + return df1 + +end + +end diff --git a/checkPerformance.jl b/checkPerformance.jl new file mode 100644 index 0000000..5d79945 --- /dev/null +++ b/checkPerformance.jl @@ -0,0 +1,498 @@ +module performance +# main module +# functions are being called +# data is being sorted and organized + +include("./types.jl") +include("./output.jl") +include("./plotting.jl") +include("./betweenRegistrationPoints.jl") +include("./registrationPoints.jl") + +import YAML + +using CSV, Dates, DataFrames, Statistics, StatsBase +using .types +using .output +using .plotting +using .betweenRegistrationPoints +using .registrationPoints + +export startAnalyzing + +function startAnalyzing(filePathSettings) + settings = readSettings(filePathSettings) + memory = Memory() + + # if needed the match days are collected + if settings.timePeriod[1] == "match day" + df = DataFrame(CSV.File(settings.estimatedTimesPath)) + df_station = df[df[:, :ZUGEREIGNIS_LINIE].==11, :] + settings.gamedays = df_station.SERVICE_START_ZEIT + unique!(settings.gamedays) + end + # performance data is being sorted + perfData = @time readPerfData(settings) + + """ + The following conditional evaluation calls the different functions. + Possible modes: "statistical variation", "black list", "train number". 
+ For "statistical variation" two different approaches are being used. + For "black list" or a single "train number" all selected line numbers are + analyzed to get detailed information about the deviation for each + registration point in one DataFrame. The function "top100" creates "black + lists" for each week, the function "top1" analyzes a single train number + for the average deviation and the median. + """ + + if settings.mode != "statistical variation" + #for "black list or single train number + allPerfData = DataFrame() + currentLine = 1 + for line in settings.allLines + settings.analyzedLine = line + perfDataLine = @time createFiles(perfData, settings) + settings.commonStations = intersect( + settings.commonStations, + settings.stationLists[currentLine], + ) + if currentLine == 1 + allPerfData = perfDataLine + else + append!(allPerfData, perfDataLine) + end + currentLine += 1 + println("") + end + if settings.mode == "black list" + @time top100(allPerfData, settings) + else + @time top1(allPerfData, settings) + end + else # settings.mode == "statistical variation" + if settings.approach == "registration points" + # deviation at each registration point + settings.commonStations = collect(keys(settings.stationDict)) + if settings.objectInFocus == "single line" + allPerfData = @time createFiles(perfData, settings) + quantileD1, quantileD2 = + @time analyzeStatisticalVariation(allPerfData, settings) + @time plotEverything(quantileD1, quantileD2, settings, memory) + elseif settings.objectInFocus == "all lines" + for line in settings.allLines + settings.analyzedLine = line + linePerfData = @time createFiles(perfData, settings) + q1, q2 = @time analyzeStatisticalVariation( + linePerfData, + settings, + ) + @time plotEverything(q1, q2, settings, memory) + println("") + end + end + elseif settings.approach == "between registration points" + settings.commonStations = collect(keys(settings.stationDict)) + if settings.objectInFocus == "single line" + allPerfData = 
@time createFiles(perfData, settings) + plotData1, plotData2 = + @time getDifferences(allPerfData, settings) + @time plotEverything(plotData1, plotData2, settings, memory) + elseif settings.objectInFocus == "all lines" + for line in settings.allLines + settings.analyzedLine = line + allPerfData = createFiles(perfData, settings) + plotData1, plotData2 = + @time getDifferences(allPerfData, settings) + @time plotEverything(plotData1, plotData2, settings, memory) + println("") + end + end + else + error("ERROR: No approach has been selected. Please do so.") + end + end + + +end + +""" +The function is sorting the performance data and deleting duplicates. If only +specific days are needed, other days will be deleted or marked. +""" + +function readPerfData(settings) + perfData = + DataFrame(CSV.File(settings.realTimeDataPath; header = 1, delim = ";")) + + sizePerfData = size(perfData, 1) + println( + "The file ", + settings.realTimeDataPath, + " has ", + sizePerfData, + " rows.", + ) + + if settings.timePeriod[1] != "no" + perfData = selectSpecificDays(perfData, settings) + end + + # duplicates are being deleted + select!(perfData, Not(:QUELLE_SENDER)) + select!(perfData, Not(:EINGANGSZEIT)) + perfData[!, :single] = ((nonunique(perfData))) + perfData = perfData[perfData[:, :single].==false, :] + select!(perfData, Not(:single)) + + for row in eachrow(perfData) + if row.ZUGEREIGNIS_DS100 == "TS" + row.ZUGEREIGNIS_DS100 == "TS T" + end + end + + println( + "Performance data has been sorted and saved. 
", + sizePerfData - size(perfData, 1), + " row(s) has/have been deleted.", + ) + + return perfData + +end + +function selectSpecificDays(df1, settings) + if settings.timePeriod[1] == "match day" # days with match are being marked + gamedays = Any[] + day = Any[] + game = Any[] + for day in settings.gamedays + push!(gamedays, Dates.Date(Dates.DateTime(day, "dd.mm.yyyy HH:MM"))) + unique!(gamedays) + end + settings.gamedays = copy(gamedays) + for row in eachrow(df1) + currentDay = Dates.Date( + Dates.DateTime(row.ZUGEREIGNIS_SOLLZEIT, "dd.mm.yyyy HH:MM"), + ) + push!(day, currentDay) + if currentDay in settings.gamedays + push!(game, "yes") + else + push!(game, "no") + end + end + df1[!, :day] = day + df1[!, :game] = game + df_new = copy(df1) + #df_day = filter(row -> row[:day] in settings.gamedays, df1) + + elseif settings.timePeriod[1] == "rush hour" # rush hour or not + rushHour = Any[] + startM = parse(Float64, settings.timePeriod[2]) + endM = parse(Float64, settings.timePeriod[3]) + startE = parse(Float64, settings.timePeriod[4]) + endE = parse(Float64, settings.timePeriod[5]) + for row in eachrow(df1) + currentH = Dates.Hour( + Dates.DateTime(row.ZUGEREIGNIS_SOLLZEIT, "dd.mm.yyyy HH:MM"), + ) + currentM = Dates.Minute( + Dates.DateTime(row.ZUGEREIGNIS_SOLLZEIT, "dd.mm.yyyy HH:MM"), + ) + current = real(currentH.value) + real(currentM.value) / 100 + if (current >= startM && current <= endM) || + (current >= startE && current <= endE) + push!(rushHour, "yes") + else + push!(rushHour, "no") + end + end + df1[!, :rushHour] = rushHour + df_new = copy(df1) + saveOutput(df_new, settings) + else # comparison of two weekdays + df1[!, :dayname] = fill("day undefined", size(df1, 1)) + for row in eachrow(df1) + if Dates.dayname( + Dates.DateTime(row.ZUGEREIGNIS_SOLLZEIT, "dd.mm.yyyy HH:MM"), + ) == settings.timePeriod[1] + row.dayname = settings.timePeriod[1] # day 1 + elseif Dates.dayname( + Dates.DateTime(row.ZUGEREIGNIS_SOLLZEIT, "dd.mm.yyyy HH:MM"), + ) == 
settings.timePeriod[2] + row.dayname = settings.timePeriod[2] # day 2 + end + end + df_new = df1[df1[:, :dayname].!="day undefined", :] + end + + return df_new + +end + +""" +Settings are being saved and a dictionary for the station names is being created. +""" + +function readSettings(filePathSettings) + data = YAML.load(open(filePathSettings)) + setting = Settings() + + if haskey(data["settings"], "outputFilePath") + setting.outputFilePath = data["settings"]["outputFilePath"] + delete!(data["settings"], "outputFilePath") + else + error( + "ERROR at reading the settings yaml file: The keyword outputFilePath + is missing. It has to be added.", + ) + end + + if haskey(data["settings"], "objectInFocus") + setting.objectInFocus = data["settings"]["objectInFocus"] + delete!(data["settings"], "objectInFocus") + else + error( + "ERROR at reading the settings yaml file: The keyword objectInFocus + is missing. It has to be added.", + ) + end + + if haskey(data["settings"], "timePeriod") + setting.timePeriod = data["settings"]["timePeriod"] + delete!(data["settings"], "timePeriod") + else + error( + "ERROR at reading the settings yaml file: The keyword timePeriod is + missing. It has to be added.", + ) + end + + if haskey(data["settings"], "analyzedLine") + setting.analyzedLine = data["settings"]["analyzedLine"] + delete!(data["settings"], "analyzedLine") + else + error( + "ERROR at reading the settings yaml file: The keyword analyzedLine + is missing. It has to be added.", + ) + end + + if haskey(data["settings"], "estimatedTimesPath") + setting.estimatedTimesPath = data["settings"]["estimatedTimesPath"] + delete!(data["settings"], "estimatedTimesPath") + else + error( + "ERROR at reading the settings yaml file: The keyword + estimatedTimesPath is missing. 
It has to be added.", + ) + end + + if haskey(data["settings"], "realTimeDataPath") + setting.realTimeDataPath = data["settings"]["realTimeDataPath"] + delete!(data["settings"], "realTimeDataPath") + else + error( + "ERROR at reading the settings yaml file: The keyword realTimeData + is missing. It has to be added.", + ) + end + + if haskey(data["settings"], "stationsListPath") + setting.stationsListPath = data["settings"]["stationsListPath"] + delete!(data["settings"], "stationsListPath") + else + error( + "ERROR at reading the settings yaml file: The keyword + stationsListPath is missing. It has to be added.", + ) + end + + if haskey(data["settings"], "mode") + setting.mode = data["settings"]["mode"] + delete!(data["settings"], "mode") + else + error( + "ERROR at reading the settings yaml file: The keyword mode is + missing. It has to be added.", + ) + end + + if haskey(data["settings"], "allLines") + setting.allLines = data["settings"]["allLines"] + delete!(data["settings"], "allLines") + else + error( + "ERROR at reading the settings yaml file: The keyword allLines is + missing. It has to be added.", + ) + end + + if haskey(data["settings"], "quantile") + setting.quantile = data["settings"]["quantile"] + delete!(data["settings"], "quantile") + else + error( + "ERROR at reading the settings yaml file: The keyword quantile is + missing. It has to be added.", + ) + end + + if haskey(data["settings"], "approach") + setting.approach = data["settings"]["approach"] + delete!(data["settings"], "approach") + else + error( + "ERROR at reading the settings yaml file: The keyword singleQuantile + is missing. 
It has to be added.", + ) + end + + + # station dict for DS100 => name of station + stationDict = createStationDict(readlines(open(setting.stationsListPath))) + stationDict["TFL"] = "Stuttgart Flughafen Messe" + stationDict["TBO"] = "Boeblingen" + setting.stationDict = stationDict + + return setting +end + +function createStationDict(stationDict) + dic = Dict() + for x in stationDict + substring = (split(x, ";")) + push!(dic, substring[2] => substring[3]) + end + return dic +end + +""" +For the selected line number the estimated times are being checked. The station +sequence is being read and a direction is assigned to each train number. +""" + +function createFiles(perfData, settings) + trainNumber = readLineData(settings) + perfData = editFile(settings, perfData, trainNumber) + return perfData +end + +function readLineData(settings) + df = DataFrame(CSV.File(settings.estimatedTimesPath)) + df1 = df[df[:, :ZUGEREIGNIS_LINIE].==parse(Int, settings.analyzedLine), :] + trainNumber = unique(df1.ZUGEREIGNIS_ZUGNUMMER) + + # sort the data in train sets + df1 = sort!(df1, [:SERVICE_ID, :SERVICE_START_ZEIT], rev = (false, true)) + + #row count for a better organisation + df1[!, :rownumber] = axes(df1, 1) + + maxHALT_NR = maximum(df1.SERVICE_HALTNR) + newTrains = findall(x -> x == 1, df1.SERVICE_HALTNR) + + endOfMaxStopsTrains = findall(x -> x == maxHALT_NR, df1.SERVICE_HALTNR) + endOfMaxStopsTrains = filter!(x -> x >= newTrains[1], endOfMaxStopsTrains) + + i = 0 + for x in newTrains + if x == endOfMaxStopsTrains[1] - maxHALT_NR + 1 + i += 1 + break + else + i += 1 + end + end + + + # station lists for both directions are being created + i = newTrains[i] + + stationsList = Any[] + while df1.SERVICE_HALTNR[i] != maxHALT_NR + push!(stationsList, df1.ZUGEREIGNIS_DS100[i]) + i += 1 + end + push!(stationsList, df1.ZUGEREIGNIS_DS100[i]) + + #saving the stationList in settings + push!(settings.stationLists, stationsList) + + stationsListOneWay = unique(stationsList) + 
stationsListOtherWay = reverse(stationsList) + + println( + "Line ", + settings.analyzedLine, + " is connecting ", + settings.stationDict[stationsListOneWay[1]], + " and ", + settings.stationDict[stationsListOneWay[size(stationsListOneWay, 1)]], + ) + + return trainNumber +end + +function editFile(settings, perfData, trainNumber) + perfData = + filter(row -> row[:ZUGEREIGNIS_ZUGNUMMER] in trainNumber, perfData) + + if settings.objectInFocus == "single line" + lineNr = 1 + else + lineNr = findall(x -> x == settings.analyzedLine, settings.allLines) + lineNr = lineNr[1] + end + + stationList = settings.stationLists[lineNr] + directionE = "" # direction of trains with even train numbers + directionU = "" # direction of trains with uneven train numbers + direction = Any[] + + perfData[!, :rownumber] = axes(perfData, 1) + for row in eachrow(perfData) + if row.ZUGEREIGNIS_TYP == 10 && row.ZUGEREIGNIS_DS100 == stationList[1] + if iseven(row.ZUGEREIGNIS_ZUGNUMMER) + directionE = stationList[length(stationList)] + directionU = stationList[1] + else + directionU = stationList[length(stationList)] + directionE = stationList[1] + end + break + end + end + + for row in eachrow(perfData) + if iseven(row.ZUGEREIGNIS_ZUGNUMMER) + push!(direction, directionE) + else + push!(direction, directionU) + end + end + + perfData[!, :ZUGEREIGNIS_RICHTUNG] = direction + + perfData = sort!( + perfData, + [:SERVICE_ID, :ZUGEREIGNIS_SOLLZEIT], + rev = (true, false), + ) + + + perfData[!, :ZUGEREIGNIS_LINIE] = + fill(settings.analyzedLine, size(perfData, 1)) + + println( + "Performance Data for line " * + settings.analyzedLine * + " has been modified.", + ) + + return perfData +end + + + +end diff --git a/output.jl b/output.jl new file mode 100644 index 0000000..557e7a5 --- /dev/null +++ b/output.jl @@ -0,0 +1,45 @@ + +module output + +using DelimitedFiles, CSV, Dates, DataFrames + +export saveOutput, saveDataFrame + +function saveOutput(perfData, settings) + dateString = 
Dates.format(Dates.now(), "yyyy-mm-dd_HH.MM.SS") + if settings.mode == "black list" + path = settings.outputFilePath * dateString * "_Top 100.csv" + CSV.write(path, perfData, header = true) + elseif settings.mode == "statistical variation" + # for settings.objectInFocus == "single line" || + # settings.objectInFocus == "all lines" + path = + settings.outputFilePath * + dateString * + "_" * + settings.mode * + "_" * + settings.objectInFocus * + ".csv" + CSV.write(path, perfData, header = true) + else + println("ATTENTION: No output has been created.") + end + +end + +""" +Function can be called from every module to save DataFrame. +""" + +function saveDataFrame(perfData, settings, x) + dateString = Dates.format(Dates.now(), "yyyy-mm-dd_HH.MM.SS") + path = + settings.outputFilePath * + "_Linie_" * + settings.analyzedLine * + dateString *x*".csv" + CSV.write(path, perfData, header = true) +end + +end diff --git a/plotting.jl b/plotting.jl new file mode 100644 index 0000000..57bc0c1 --- /dev/null +++ b/plotting.jl @@ -0,0 +1,592 @@ +module plotting + +using StatsPlots, Plots, DataFrames, Statistics, Dates + +export plotEverything, plotBars, plotAllDistributions + +function plotEverything(df1, df2, settings, memory) + println("Data is being plotted.") + dateString = Dates.format(Dates.now(), "yyyy-mm-dd_HH.MM.SS") + memory.focus = "" + memory.yTicks = ((0-240):60:2000) #change for case + memory.xRotation = 90.0 + memory.xLabel = "stations" + memory.yLabel = "" + memory.color = [:chartreuse :yellow2 :lightskyblue :purple] + memory.linewidth = 0 + memory.barwidth = 0.7 + memory.tickSize = 13 + memory.guidefontsize = 20 + memory.legendSize = 13 + memory.titleSize = 23 + memory.legendPosition = :outerright + if settings.approach == "registration points" + memory.yLabel = "deviation in seconds" + memory.focus = "Line " * settings.analyzedLine + memory.size = (2000, 1300) + memory.direction1 = + settings.stationDict[df1.station[length(df1.station)]] + memory.direction2 = + 
settings.stationDict[df2.station[length(df2.station)]] + if settings.timePeriod[1] != "no" && + settings.timePeriod[1] != "match day" && + settings.timePeriod[1] != "rush hour" && + settings.analyzedLine != "11" + memory.title = + memory.focus * + " - Direction " * + memory.direction1 * + " - " * + settings.timePeriod[1] * + "/ " * + settings.timePeriod[2] * + " - " * + settings.quantile[1] * + " %-Quantile" + p1 = plotBarsDays(df1, settings, memory) + memory.title = + memory.focus * + " - Direction " * + memory.direction2 * + " - " * + settings.timePeriod[1] * + "/ " * + settings.timePeriod[2] * + " - " * + settings.quantile[1] * + " %-Quantile" + p2 = plotBarsDays(df2, settings, memory) + elseif settings.timePeriod[1] == "match day" && + settings.analyzedLine != "11" + memory.size = (900, 750) + memory.title = + memory.focus * + " - Direction " * + memory.direction1 * + " - Match Day - " * + settings.quantile[1] * + " %-Quantile" + p1 = plotBarsGameOrRushHour(df1, settings, memory) + memory.title = + memory.focus * + " - Direction " * + memory.direction2 * + " - Match Day - " * + settings.quantile[1] * + " %-Quantile" + p2 = plotBarsGameOrRushHour(df2, settings, memory) + elseif settings.timePeriod[1] == "rush hour" && + settings.analyzedLine != "11" + memory.title = + memory.focus * + " - Direction " * + memory.direction1 * + " - Rush Hour - " * + settings.quantile[1] * + " %-Quantile" + p1 = plotBarsGameOrRushHour(df1, settings, memory) + memory.title = + memory.focus * + " - Direction " * + memory.direction2 * + " - Rush Hour - " * + settings.quantile[1] * + " % Quantile" + p2 = plotBarsGameOrRushHour(df2, settings, memory) + elseif settings.timePeriod[1] == "no" #no extra settings + memory.title = memory.focus * " - Direction " * memory.direction1 + p1 = plotBarsMultQuant(df1, settings, memory) + memory.title = memory.focus * " - Direction " * memory.direction2 + p2 = plotBarsMultQuant(df2, settings, memory) + memory.title = memory.focus * " - Distribution" + 
#plotDistributionInSec(df1, df2, settings, memory) + plotDistributionInMin(df1, df2, settings, memory) + if length(settings.allLines) == length(memory.distributionMin) + #plotAllDistributions(settings, memory) + #memory.title = "Distribution S-Bahn Stuttgart - 'new' quantiles" + memory.title = "Distribution S-Bahn Stuttgart - 'danish' quantiles" + p3 = plotAllDistributions(settings, memory) + settings.allLines = ["1", "2", "3", "4", "5", "6", "60"] + pop!(memory.distributionMin) + memory.title = "" + p4 = plotAllDistributions(settings, memory) + all = plot(p3, p4, layout = (2, 1), legend = :bottomright) + savefig( + all, + settings.outputFilePath * + "\\Plots\\all_Lines" * + "_" * + dateString * + ".pdf", + ) + end + end + dateString = Dates.format(Dates.now(), "yyyy-mm-dd_HH.MM.SS") + all = plot(p1, p2, layout = (2, 1), legend = memory.legendPosition) + savefig( + all, + settings.outputFilePath * + "\\Plots\\" * + memory.focus * + " " * + settings.approach * + "_" * + dateString * + ".pdf", + ) + + + elseif settings.approach == "between registration points" + myTitle1 = "" + myTitle2 = "" + memory.yLabel = "deviation - median (seconds)" + memory.size = (900, 750) + memory.focus = "Line " * settings.analyzedLine + memory.direction1 = settings.stationDict[df1.point2[length(df1.point2)]] + memory.direction2 = settings.stationDict[df2.point2[length(df2.point2)]] + memory.title = memory.focus * " - Direction " * memory.direction1 + p1 = plotBarsLineSection(df1, settings, memory) + memory.title = memory.focus * " - Direction " * memory.direction2 + p2 = plotBarsLineSection(df2, settings, memory) + all = plot(p1, p2, layout = (2, 1), legend = false, size = (800, 600)) + dateString = Dates.format(Dates.now(), "yyyy-mm-dd_HH.MM.SS") + savefig( + all, + settings.outputFilePath * + "\\Plots\\" * + memory.focus * + " " * + settings.approach * + "_" * + dateString * + ".pdf", + ) + + + end +end + +function plotAllDistributions(settings, memory) + Plots.pyplot() + + y = Any[] + 
x = Any[] + + for quantile in settings.quantile + push!(y, parse(Int, quantile)) + end + x = memory.distributionMin[1] + + #average september and october; source: DB + a = [3, 6] + b = [84.9, 95.8] + + tickX = (0:1:20) + + z = plot( + a, + b, + label = "average deviation sept/ oct 2017", + xlabel = "Deviation (min)", + ylabel = "Quantile (%)", + marker = true, + legend = :bottomright, + color = :red, + xticks = tickX, + size = (840, 600), + ) + colors = [ + :gray71, + :slateblue, + :goldenrod, + :darkcyan, + :magenta4, + :aqua, + :deeppink, + :tan4, + ] + z = plot!( + x, + y, + title = memory.title, + label = "line " * settings.allLines[1], + marker = true, + color = colors[1], + ) + + for i = 2:length(memory.distributionMin) + x = memory.distributionMin[i] + colorN = colors[i] + z = plot!( + x, + y, + marker = true, + color = colorN, + label = "line " * settings.allLines[i], + ) + end + + dateString = Dates.format(Dates.now(), "yyyy-mm-dd_HH.MM.SS") + + savefig( + z, + settings.outputFilePath * + "\\Plots\\all_lines_deviation(min)_" * + dateString * + ".pdf", + ) + + return z + +end + +function plotBarsMultQuant(df, settings, memory) + + pyplot() + + stations = Array{Int,1}() + depOrArr = Array{String,1}() + quantile = Array{Float64,1}() + type = Array{String,1}() + + stationList = Any[] + + for index = 1:length(df.station) + push!(stationList, df.station[index] * "_" * df.DepOrArr[index]) + for numberQuant = 1:length(settings.quantile) + push!(stations, index) + currentColumn = df[!, "quantile"*string(numberQuant)] + push!(quantile, currentColumn[index]) + push!(type, settings.quantile[numberQuant] * "% quantile") + end + end + + df_new = + DataFrame(station = (stations), quantile = (quantile), type = (type)) + + memory.stationList = copy(stationList) + + x = plotBars(df_new, memory, settings) + + x = plot!( + df.AverageDelay, + linewidth = 3, + linecolor = :blue, + marker = true, + label = "Average", + ) + + return x +end + +function plotBarsDays(df, settings, 
memory) + + pyplot() + + stations = Array{Int,1}() + depOrArr = Array{String,1}() + quantile = Array{Float64,1}() + type = Array{String,1}() + + stationList = Any[] + + for index = 1:length(df.station) + push!(stationList, df.station[index] * "_" * df.DepOrArr[index]) + for day in settings.timePeriod + push!(stations, index) + currentColumn = df[!, "quantile"*settings.quantile[1]*"_"*day] + push!(quantile, currentColumn[index]) + push!(type, day) + end + end + + df_new = + DataFrame(station = (stations), quantile = (quantile), type = (type)) + memory.stationList = copy(stationList) + x = plotBars(df_new, memory, settings) + + x = plot!( + df[!, "average_"*settings.timePeriod[1]], + linewidth = 3, + linecolor = :orange, + label = "Average Delay " * settings.timePeriod[1], + ) + x = plot!( + df[!, "average_"*settings.timePeriod[2]], + linewidth = 3, + linecolor = :blue, + label = "Average Delay " * settings.timePeriod[2], + ) + + + return x + +end + +function plotBarsLineSection(df, settings, memory) + pyplot() + + x = bar( + df.median, + xticks = ([1:1:length(df.points);], df.points), + yticks = ((0-270):30:210), + legend = false, + title = memory.title, + ylabel = memory.yLabel, + xlabel = memory.xLabel, + size = (800, 300), + bar_width = 1.0, + xtickfontrotation = memory.xRotation, + ) + + return x + +end + +function plotDistributionInSec(df1, df2, settings, memory) + + mean1 = (mean(df1.quantile1) + mean(df2.quantile1)) / 2 + mean2 = (mean(df1.quantile2) + mean(df2.quantile2)) / 2 + mean3 = (mean(df1.quantile3) + mean(df2.quantile3)) / 2 + mean4 = (mean(df1.quantile4) + mean(df2.quantile4)) / 2 + + pyplot() + y = Any[] + for quantile in settings.quantile + push!(y, parse(Int, quantile)) + end + + x = [mean1, mean2, mean3, mean4] + + #Jahrendurschnitt Quelle: DB + a = [3 * 60, 6 * 60] + b = [84.9, 95.8] + + z = plot( + a, + b, + title = memory.title, + label = "average deviation september/october 2017", + xlabel = "Deviation (sec)", + ylabel = "Quantile", + marker 
= true, + legend = :bottomright, + color = :red, + ) + + z = plot!( + x, + y, + marker = true, + label = "Deviation Line" * settings.analyzedLine, + ) + + dateString = Dates.format(Dates.now(), "yyyy-mm-dd_HH.MM.SS") + + savefig( + z, + settings.outputFilePath * + "\\Plots\\Line_" * + settings.analyzedLine * + "_Deviation(sec)_" * + dateString * + ".pdf", + ) + +end + +function plotDistributionInMin(df1, df2, settings, memory) + + mean1 = (mean(df1.quantile1) + mean(df2.quantile1)) / 2 / 60 + mean2 = (mean(df1.quantile2) + mean(df2.quantile2)) / 2 / 60 + mean3 = (mean(df1.quantile3) + mean(df2.quantile3)) / 2 / 60 + mean4 = (mean(df1.quantile4) + mean(df2.quantile4)) / 2 / 60 + + pyplot() + y = Any[] + for quantile in settings.quantile + push!(y, parse(Int, quantile)) + end + x = [mean1, mean2, mean3, mean4] + push!(memory.distributionMin, x) + + #Jahrendurschnitt Quelle: DB + a = [3, 6] + b = [84.9, 95.8] + + z = plot( + a, + b, + title = memory.title, + label = "average deviation september/october 2017", + xlabel = "Deviation (min)", + ylabel = "Quantile", + marker = true, + legend = :bottomright, + color = :red, + ) + + z = plot!( + x, + y, + marker = true, + label = "Deviation " * memory.focus, + color = :blue, + ) + + + + dateString = Dates.format(Dates.now(), "yyyy-mm-dd_HH.MM.SS") + + savefig( + z, + settings.outputFilePath * + "\\Plots\\" * + memory.title * + dateString * + ".pdf", + ) + +end + +function plotRailwayNetworkDistr(df1, df2, settings) + mean1 = (mean(df1.quantile1) + mean(df2.quantile1)) / 2 / 60 + mean2 = (mean(df1.quantile2) + mean(df2.quantile2)) / 2 / 60 + mean3 = (mean(df1.quantile3) + mean(df2.quantile3)) / 2 / 60 + mean4 = (mean(df1.quantile4) + mean(df2.quantile4)) / 2 / 60 + + pyplot() + y = Any[] + for quantile in settings.quantile + push!(y, parse(Int, quantile)) + end + x = [mean1, mean2, mean3, mean4] + + z = plot( + x, + y, + title = settings.objectInFocus * " - Distribution", + label = "Deviation " * settings.objectInFocus, + 
xlabel = "Deviation (min)", + ylabel = "Quantile", + marker = true, + legend = :bottomright, + ) + + #Durchschnitt September und Oktober Quelle: DB + a = [3, 6] + b = [84.9, 95.8] + + + z = plot!(a, b, marker = true, label = "Average Deviation 2017") + + + dateString = Dates.format(Dates.now(), "yyyy-mm-dd_HH.MM.SS") + + savefig( + z, + settings.outputFilePath * + "\\Plots\\" * + settings.objectInFocus * + "_Deviation(min)_" * + dateString * + ".pdf", + ) + + + +end + +function plotBarsGameOrRushHour(df, settings, memory) + + + pyplot() + + stations = Array{Int,1}() + depOrArr = Array{String,1}() + quantile = Array{Float64,1}() + type = Array{String,1}() + + stationList = Any[] + + for index = 1:length(df.station) + push!(stationList, df.station[index] * "_" * df.DepOrArr[index]) + for indicator in ["yes", "no"] + push!(stations, index) + currentColumn = df[!, "quantile"*settings.quantile[1]*"_"*indicator] + push!(quantile, currentColumn[index]) + push!(type, indicator) + end + end + newType = Any[] + if settings.timePeriod[1] == "rush hour" + label1 = "Rush Hour" + label2 = "'Normal' Time" + elseif settings.timePeriod[1] == "match day" + label1 = "Match Day" + label2 = "'Normal' Day" + end + for x in type + if x == "yes" + push!(newType, label1) + else + push!(newType, label2) + end + end + + df_new = + DataFrame(station = (stations), quantile = (quantile), type = (newType)) + + memory.size = (2000, 1000) + memory.stationList = copy(stationList) + + x = plotBars(df_new, memory, settings) + + x = plot!( + df[!, "average_yes"], + linewidth = 3, + linecolor = :orange, + label = "Average Delay " * label1, + ) + x = plot!( + df[!, "average_no"], + linewidth = 3, + linecolor = :blue, + label = "Average Delay " * label2, + ) + + return x + +end + +function plotBars(df, memory, settings) + + if settings.analyzedLine == "11" && + memory.title == "Line 11 - Direction Herrenberg" + memory.yTicks = ((0-240):120:3000) + end + + x = groupedbar( + df.quantile, + xticks = 
([1:1:size(memory.stationList, 1);], memory.stationList),
+        yticks = memory.yTicks,
+        group = df.type,  # one bar per quantile/type label within a station
+        ylabel = memory.yLabel,
+        xlabel = memory.xLabel,
+        title = memory.title,
+        size = memory.size,
+        bar_width = memory.barwidth,
+        linewidth = memory.linewidth,
+        tickfontsize = memory.tickSize,
+        legendfontsize = memory.legendSize,
+        guidefontsize = memory.guidefontsize,
+        titlefontsize = memory.titleSize,
+        xtickfontrotation = memory.xRotation,
+        legend = memory.legendPosition,
+    )
+
+    return x
+
+end
+
+end
diff --git a/readFile.jl b/readFile.jl
new file mode 100644
index 0000000..badfec7
--- /dev/null
+++ b/readFile.jl
@@ -0,0 +1,14 @@
+# Entry point script: loads the analysis pipeline and starts it with the
+# settings file.
+module readFile
+
+include("./checkPerformance.jl")
+
+using .performance
+
+print("\n")
+
+# NOTE(review): hard-coded absolute path — must be adapted per machine (see
+# README); consider reading the path from ARGS instead.
+settings =
+"C:\\Users\\its\\Documents\\Ana\\UNI\\Bachelor\\DataAnalyzing\\settings.yaml"
+
+startAnalyzing(settings)
+
+end
diff --git a/registrationPoints.jl b/registrationPoints.jl
new file mode 100644
index 0000000..f3eca26
--- /dev/null
+++ b/registrationPoints.jl
@@ -0,0 +1,875 @@
+# approach 1: deviation is analyzed for each registration point
+
+module registrationPoints
+
+include("./output.jl")
+
+using Statistics, CSV, Dates, DataFrames, StatsBase
+using .output
+
+# NOTE(review): `allDataQuantile` is exported but no definition for it is
+# visible among this file's functions — confirm it exists elsewhere in the
+# module before relying on the export.
+export analyzeStatisticalVariation, allDataQuantile, top1, top100
+
+##
+
+# Computes the configured quantiles of the delay data for both travel
+# directions of the analyzed line; returns one result DataFrame per direction.
+function analyzeStatisticalVariation(allPerfData, settings)
+    quantileD1, quantileD2 = calculateQuantiles(allPerfData, settings)
+    return quantileD1, quantileD2
+end
+
+"""
+Function is preparing the new dataframes with the stations which are supposed to
+be analyzed.
+""" + +function calculateQuantiles(perfData, settings) + if settings.objectInFocus == "single line" + lineNr = 1 + else + lineNr = findall(x -> x == settings.analyzedLine, settings.allLines) + lineNr = lineNr[1] + end + + stationList = settings.stationLists[lineNr] + + dataDirection1 = DataFrame() + dataDirection1[!, :station] = stationList + dataDirection1[!, :DepOrArr] = fill("D", size(dataDirection1, 1)) + + dataDirection2 = DataFrame() + dataDirection2[!, :station] = reverse(stationList) + dataDirection2[!, :DepOrArr] = fill("D", size(dataDirection2, 1)) + + # sorting the performance data by directions + perfDataDirection1 = perfData[ + perfData[:, :ZUGEREIGNIS_RICHTUNG].==stationList[size(stationList, 1)], + :, + ] + perfDataDirection2 = + perfData[perfData[:, :ZUGEREIGNIS_RICHTUNG].==stationList[1], :] + + if settings.timePeriod[1] == "no" + dataDirection1 = calculateQuantileForDirection( + dataDirection1, + perfDataDirection1, + settings, + ) + dataDirection2 = calculateQuantileForDirection( + dataDirection2, + perfDataDirection2, + settings, + ) + elseif settings.timePeriod[1] == "match day" && + settings.analyzedLine != "11" + dataDirection1 = calculateQuantileForGame( + dataDirection1, + perfDataDirection1, + settings, + ) + dataDirection2 = calculateQuantileForGame( + dataDirection2, + perfDataDirection2, + settings, + ) + elseif settings.timePeriod[1] == "rush hour" && + settings.analyzedLine != "11" + dataDirection1 = calculateQuantileForTimePeriod( + dataDirection1, + perfDataDirection1, + settings, + ) + dataDirection2 = calculateQuantileForTimePeriod( + dataDirection2, + perfDataDirection2, + settings, + ) + elseif settings.analyzedLine != "11" + dataDirection1 = calculateQuantileForDay( + dataDirection1, + perfDataDirection1, + settings, + ) + dataDirection2 = calculateQuantileForDay( + dataDirection2, + perfDataDirection2, + settings, + ) + end + + return dataDirection1, dataDirection2 + +end + +""" +Function is calculating the selected quantiles 
for each registration point for +both directions. +""" + +function calculateQuantileForDirection( + dataDirection1, + perfDataDirection1, + settings, +) + deviationArray = Any[] # for deviation shown with quantile + marker = 1 # to make clear if railway object is arriving or departing + + # registration points with no data are being deleted + dataDirection1 = deleteEmptyStations(dataDirection1, perfDataDirection1) + + + for station in eachrow(dataDirection1) + stationPerfData = perfDataDirection1[ + perfDataDirection1[:, :ZUGEREIGNIS_DS100].==station.station, + :, + ] + + if length(stationPerfData.ZUGEREIGNIS_DS100) == 0 + dataDirection1 = + dataDirection1[dataDirection1[:, :station].!=station.station, :] + marker = 0 + else + + if iseven(marker) + station.DepOrArr = "A" + end + + + deviationStation = Any[] + totalDeviation = 0 + for row in eachrow(stationPerfData) + deviation = 0 + if iseven(marker) + if row.ZUGEREIGNIS_TYP == 20 || row.ZUGEREIGNIS_TYP == 50 + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(deviationStation, deviation) + end + elseif !iseven(marker) + if row.ZUGEREIGNIS_TYP == 10 || row.ZUGEREIGNIS_TYP == 40 + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(deviationStation, deviation) + end + end + end + push!(deviationArray, deviationStation) + + end + marker += 1 + end + + totalDeviationArray = Any[] + trainRuns = Any[] + quantileNr = 1 + for quantile in settings.quantile + quantileLine = Any[] + for row in deviationArray + x = quantile!(row, parse(Float64, quantile) / 100) + push!(quantileLine, x) + if quantileNr == 1 + y = mean(row) + push!(totalDeviationArray, y) + z = size(row, 1) + 
push!(trainRuns, z) + end + end + nameColumn = "quantile" * string(quantileNr) + dataDirection1[!, nameColumn] = quantileLine + quantileNr += 1 + end + + dataDirection1[!, :AverageDelay] = totalDeviationArray + dataDirection1[!, :TrainRuns] = trainRuns + + #saveDataFrame(dataDirection1, settings, "dataframe") + + return dataDirection1 +end + +""" +Function is calculating the selected quantiles for each registration point for +both directions. Only the two selected days are being checked. +""" + +function calculateQuantileForDay(dataDirection1, perfDataDirection1, settings) + + if size(perfDataDirection1, 1) != 0 + + deviationArray1 = Any[] #for deviation shown with quantile + totalDeviationArray1 = Any[] #for average deviation + deviationArray2 = Any[] #for deviation shown with quantile + totalDeviationArray2 = Any[] #for average deviation + marker = 1 #to make clear if railway object is arriving or departing + + for station in eachrow(dataDirection1) + stationPerfData = perfDataDirection1[ + perfDataDirection1[:, :ZUGEREIGNIS_DS100].==station.station, + :, + ] + + + if iseven(marker) + station.DepOrArr = "A" + end + + + deviationStation1 = Any[] + deviationStation2 = Any[] + for row in eachrow(stationPerfData) + if row.dayname == settings.timePeriod[1] + deviation = 0 + if iseven(marker) + if row.ZUGEREIGNIS_TYP == 20 || + row.ZUGEREIGNIS_TYP == 50 + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(deviationStation1, deviation) + end + elseif !iseven(marker) + if row.ZUGEREIGNIS_TYP == 10 || + row.ZUGEREIGNIS_TYP == 40 + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(deviationStation1, deviation) + end + end + 
elseif row.dayname == settings.timePeriod[2] + deviation = 0 + if iseven(marker) + if row.ZUGEREIGNIS_TYP == 20 || + row.ZUGEREIGNIS_TYP == 50 + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(deviationStation2, deviation) + end + elseif !iseven(marker) + if row.ZUGEREIGNIS_TYP == 10 || + row.ZUGEREIGNIS_TYP == 40 + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(deviationStation2, deviation) + end + end + #println(station.station) + #println(deviation) + end + end + + if length(deviationStation1) == 0 + deviationStation1 = [0] + elseif length(deviationStation2) == 0 + deviationStation2 = [0] + end + + push!(deviationArray1, deviationStation1) + push!(deviationArray2, deviationStation2) + + + marker += 1 + end + + ###################################################### + d = 1 + for day in settings.timePeriod + if d == 1 + deviationArray = deviationArray1 + else + deviationArray = deviationArray2 + end + dayTime = Any[] + averageDay = Any[] + trainRuns = Any[] + for row in deviationArray + x = quantile!(row, parse(Float64, settings.quantile[1]) / 100) + push!(dayTime, x) + y = mean(row) + push!(averageDay, y) + z = size(row, 1) + if z == 1 + z = 0 + end #stations with no data + push!(trainRuns, z) + end + nameColumn1 = "quantile" * settings.quantile[1] * "_" * day + dataDirection1[!, nameColumn1] = dayTime + nameColumn1 = "average_" * day + dataDirection1[!, nameColumn1] = averageDay + dataDirection1[!, "train runs "*day] = trainRuns + d += 1 + end + + #saveDataFrame(dataDirection1, settings, "dataframe") + + return dataDirection1 + + + else + println("There is no data for this direction and this line.") + data = 
DataFrame() + return data + end +end + +""" +Function is calculating the selected quantiles for each registration point for +both directions. Match days are compared with "normal" days. +""" + +function calculateQuantileForGame(dataDirection1, perfDataDirection1, settings) + + if size(perfDataDirection1, 1) != 0 + + deviationArray1 = Any[] #for deviation shown with quantile + totalDeviationArray1 = Any[] #for average deviation + deviationArray2 = Any[] #for deviation shown with quantile + totalDeviationArray2 = Any[] #for average deviation + marker = 1 #to make clear if railway object is arriving or departing + + for station in eachrow(dataDirection1) + stationPerfData = perfDataDirection1[ + perfDataDirection1[:, :ZUGEREIGNIS_DS100].==station.station, + :, + ] + + + if iseven(marker) + station.DepOrArr = "A" + end + + + deviationStation1 = Any[] + deviationStation2 = Any[] + for row in eachrow(stationPerfData) + if row.day in settings.gamedays + deviation = 0 + if iseven(marker) + if row.ZUGEREIGNIS_TYP == 20 || + row.ZUGEREIGNIS_TYP == 50 + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(deviationStation1, deviation) + end + elseif !iseven(marker) + if row.ZUGEREIGNIS_TYP == 10 || + row.ZUGEREIGNIS_TYP == 40 + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(deviationStation1, deviation) + end + end + else + deviation = 0 + if iseven(marker) + if row.ZUGEREIGNIS_TYP == 20 || + row.ZUGEREIGNIS_TYP == 50 + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), 
+ ), + ) + push!(deviationStation2, deviation) + end + elseif !iseven(marker) + if row.ZUGEREIGNIS_TYP == 10 || + row.ZUGEREIGNIS_TYP == 40 + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(deviationStation2, deviation) + end + end + #println(station.station) + #println(deviation) + end + end + + if length(deviationStation1) == 0 + deviationStation1 = [0] + elseif length(deviationStation2) == 0 + deviationStation2 = [0] + end + + push!(deviationArray1, deviationStation1) + push!(deviationArray2, deviationStation2) + + + marker += 1 + end + + ###################################################### + d = 1 + for game in ["yes", "no"] + if d == 1 + deviationArray = deviationArray1 + else + deviationArray = deviationArray2 + end + dayTime = Any[] + averageDay = Any[] + for row in deviationArray + x = quantile!(row, parse(Float64, settings.quantile[1]) / 100) + push!(dayTime, x) + y = mean(row) + push!(averageDay, y) + end + nameColumn1 = "quantile" * settings.quantile[1] * "_" * game + dataDirection1[!, nameColumn1] = dayTime + nameColumn1 = "average_" * game + dataDirection1[!, nameColumn1] = averageDay + d += 1 + end + + return dataDirection1 + + else + println("There is no data for this direction and this line.") + data = DataFrame() + return data + end +end + +""" +Function is calculating the selected quantiles for each registration point for +both directions. Only registration points within the selected time period are +being checked. 
+""" + +function calculateQuantileForTimePeriod( + dataDirection1, + perfDataDirection1, + settings, +) + + if size(perfDataDirection1, 1) != 0 + + deviationArray1 = Any[] #for deviation shown with quantile + totalDeviationArray1 = Any[] #for average deviation + deviationArray2 = Any[] #for deviation shown with quantile + totalDeviationArray2 = Any[] #for average deviation + marker = 1 #to make clear if railway object is arriving or departing + + for station in eachrow(dataDirection1) + stationPerfData = perfDataDirection1[ + perfDataDirection1[:, :ZUGEREIGNIS_DS100].==station.station, + :, + ] + + + if iseven(marker) + station.DepOrArr = "A" + end + + + deviationStation1 = Any[] + deviationStation2 = Any[] + for row in eachrow(stationPerfData) + if row.rushHour == "yes" + deviation = 0 + if iseven(marker) + if row.ZUGEREIGNIS_TYP == 20 || + row.ZUGEREIGNIS_TYP == 50 + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(deviationStation1, deviation) + end + elseif !iseven(marker) + if row.ZUGEREIGNIS_TYP == 10 || + row.ZUGEREIGNIS_TYP == 40 + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(deviationStation1, deviation) + end + end + else + deviation = 0 + if iseven(marker) + if row.ZUGEREIGNIS_TYP == 20 || + row.ZUGEREIGNIS_TYP == 50 + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(deviationStation2, deviation) + end + elseif !iseven(marker) + if row.ZUGEREIGNIS_TYP == 10 || + row.ZUGEREIGNIS_TYP == 40 + deviation = Dates.value( + Second( 
+ convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(deviationStation2, deviation) + end + end + #println(station.station) + #println(deviation) + end + end + + if length(deviationStation1) == 0 + deviationStation1 = [0] + elseif length(deviationStation2) == 0 + deviationStation2 = [0] + end + + push!(deviationArray1, deviationStation1) + push!(deviationArray2, deviationStation2) + + + marker += 1 + end + + ###################################################### + d = 1 + for rushHour in ["yes", "no"] + if d == 1 + deviationArray = deviationArray1 + else + deviationArray = deviationArray2 + end + time = Any[] + average = Any[] + for row in deviationArray + x = quantile!(row, parse(Float64, settings.quantile[1]) / 100) + push!(time, x) + y = mean(row) + push!(average, y) + end + nameColumn1 = "quantile" * settings.quantile[1] * "_" * rushHour + dataDirection1[!, nameColumn1] = time + nameColumn1 = "average_" * rushHour + dataDirection1[!, nameColumn1] = average + d += 1 + end + return dataDirection1 + + else + println("There is no data for this direction and this line.") + data = DataFrame() + return data + end + +end + +""" +Stations with no data at all are deleted from the station list and the list +is being modfied. 
+"""
+
+# Drops stations without any performance data from the direction's station
+# table. If an odd number of stations was removed, the first remaining station
+# is dropped as well so that the departure/arrival (D/A) alternation that the
+# callers derive from the row position stays intact.
+function deleteEmptyStations(dataDirection1, perfDataDirection1)
+    lengthData = size(dataDirection1.station, 1)
+
+    for station in eachrow(dataDirection1)
+
+        stationPerfData = perfDataDirection1[
+            perfDataDirection1[:, :ZUGEREIGNIS_DS100].==station.station,
+            :,
+        ]
+
+        # no event rows for this DS100 code -> remove the station entirely
+        if length(stationPerfData.ZUGEREIGNIS_DS100) == 0
+            dataDirection1 =
+                dataDirection1[dataDirection1[:, :station].!=station.station, :]
+        end
+
+    end
+
+    # number of stations removed above
+    marker = lengthData - size(dataDirection1.station, 1)
+
+    if !iseven(marker) && marker != 0
+        stationList = (dataDirection1.station)
+        popfirst!(stationList)
+        dataDirection = DataFrame()
+        dataDirection[!, :station] = stationList
+        dataDirection[!, :DepOrArr] = fill("D", size(dataDirection, 1))
+        return dataDirection
+
+    else
+        return dataDirection1
+    end
+end
+
+"""
+Function creates a "black list" for each week and saves how often each train
+number and train number-station combination is represented in the weekly list.
+"""
+
+function top100(df1, settings)
+    # only departures are being analyzed (event types 40 and 10)
+    perfData = df1[df1[:, :ZUGEREIGNIS_TYP].==40, :]
+    x = df1[df1[:, :ZUGEREIGNIS_TYP].==10, :]
+    append!(perfData, x)
+    # first and final day of analysis; each week is being checked
+    finalDay = Dates.Date("11.10.2017", "dd.mm.yyyy")
+    # fixed: the format string carried a spurious " HH:MM" although the
+    # literal contains no time component; use the same format as `finalDay`
+    firstDay = Dates.Date("01.09.2017", "dd.mm.yyyy")
+    # `firstDay` is already a Date, so no extra Dates.Date(...) conversion
+    lastDay = firstDay + Dates.Week(1) - Dates.Day(1)
+    d = firstDay:Dates.Day(1):lastDay
+    fDreached = false # is the final day already reached?
+ topAll = Any[] + topNum = Any[] + + while fDreached == false + d = firstDay:Dates.Day(1):lastDay + #println(d) + #println(size(perfData, 1)) + week = filter( + row -> + Dates.Date(row.ZUGEREIGNIS_SOLLZEIT, "dd.mm.yyyy HH:MM") in d, + perfData, + ) + devA = Any[] + deviation = 0 + + for row in eachrow(week) + deviation = Dates.value( + Second( + convert( + Dates.Second, + Dates.DateTime( + row.ZUGEREIGNIS_ISTZEIT, + "dd.mm.yyyy HH:MM", + ) - Dates.DateTime( + row.ZUGEREIGNIS_SOLLZEIT, + "dd.mm.yyyy HH:MM", + ), + ), + ), + ) + push!(devA, deviation / 60) + end + + + week[!, :deviation] = devA + # for each week the data is being sorted by deviation + sort!(week, :deviation, rev = true) + + i = 1 + numbers = unique(week.ZUGEREIGNIS_ZUGNUMMER) + # for lowest deviation: + #reverse!(numbers) + #sort!(week, :deviation, rev = false) + for row in eachrow(week) + if i <= 25 + push!(topNum, numbers[i]) + i += 1 + push!( + topAll, + row.ZUGEREIGNIS_DS100 * + "," * + string(row.ZUGEREIGNIS_ZUGNUMMER), + ) + end + end + saveOutput(week, settings) + fDreached = (finalDay in d) + firstDay = lastDay + Dates.Day(1) + lastDay = firstDay + Dates.Week(1) - Dates.Day(1) + end + df = DataFrame(countmap(topAll)) + #println(countmap(topAll)) + #println(countmap(topNum)) + df3 = countmap(topNum) + saveOutput(df, settings) + sleep(1) + saveOutput(df3, settings) +end + +""" +A single train number is being analyzed. 
+"""
+
+# Filters the performance data down to the single train number given in
+# `settings.mode`, keeps only departure events (types 10 and 40) and prints
+# the median and mean deviation in seconds plus the number of train runs.
+function top1(df, settings)
+    # single train number is being analyzed
+    perfData = df[df[:, :ZUGEREIGNIS_ZUGNUMMER].==parse(Int, settings.mode), :]
+    y = perfData[perfData[:, :ZUGEREIGNIS_TYP].==40, :]
+    x = perfData[perfData[:, :ZUGEREIGNIS_TYP].==10, :]
+    perfData = append!(y, x)
+    # deviation per event in whole seconds (actual minus scheduled time);
+    # typed Int vector instead of Any[] — Dates.value always yields an integer
+    devA = Int[]
+    for row in eachrow(perfData)
+        deviation = Dates.value(
+            Second(
+                convert(
+                    Dates.Second,
+                    Dates.DateTime(
+                        row.ZUGEREIGNIS_ISTZEIT,
+                        "dd.mm.yyyy HH:MM",
+                    ) - Dates.DateTime(
+                        row.ZUGEREIGNIS_SOLLZEIT,
+                        "dd.mm.yyyy HH:MM",
+                    ),
+                ),
+            ),
+        )
+        push!(devA, deviation)
+    end
+    med = median(devA)
+    av = mean(devA)
+    println("Median: " * string(med))
+    # fixed: label previously read "Average :" (misplaced space)
+    println("Average: " * string(av))
+    println("Train Runs: " * string(length(devA)))
+end
+
+
+
+
+end
diff --git a/settings.yaml b/settings.yaml
new file mode 100644
index 0000000..1bbbd84
--- /dev/null
+++ b/settings.yaml
@@ -0,0 +1,33 @@
+---
+settings:
+  outputFilePath: "C:\\Users\\its\\Documents\\Ana\\UNI\\Bachelor\\DataAnalyzing\\output\\"
+  stationsListPath: "C:\\Users\\its\\Documents\\Ana\\UNI\\Bachelor\\DataAnalyzing\\S-BahnStuttgartDaten\\Bahnhofsdaten.csv"
+  estimatedTimesPath: "C:\\Users\\its\\Documents\\Ana\\UNI\\Bachelor\\DataAnalyzing\\S-BahnStuttgartDaten\\20170901-20171019_Alle_Sollereignisse_S-Bahn_Stuttgart.csv"
+  realTimeDataPath: "C:\\Users\\its\\Documents\\Ana\\UNI\\Bachelor\\DataAnalyzing\\S-BahnStuttgartDaten\\20170901-20171019_Alle_Istmeldungen_S-Bahn_Stuttgart.csv"
+
+  #objectInFocus: "single line"
+  objectInFocus: "all lines" #single line is being repeated
+
+  timePeriod: ["no"]
+  #timePeriod: ["rush hour","16.00","19.00","06.00","09.00"]
+  #timePeriod: ["match day"] #(Linie 11)
+  #timePeriod: ["Thursday","Sunday"] #always 2 to compare
+  #timePeriod: ["Thursday","Friday"]
+
+  mode: "statistical variation"
+  #mode: "black list"
+  #mode: "7048"
+
+  #approach: "between registration points"
+  approach:
"registration points"
+
+  quantile: ["84","90","96","99"]
+  #quantile: ["50","70","80","90"]
+  #quantile: ["88"]
+
+  analyzedLine: "4"
+  #allLines: ["1","2","3","4","5","6","60","11"]
+  allLines: ["1","2","3","4","5","6","60"]
+  #allLines: ["1","4","6","60","11"]
+  #allLines: ["1","4","6","60"]
+...
diff --git a/types.jl b/types.jl
new file mode 100644
index 0000000..8e76b51
--- /dev/null
+++ b/types.jl
@@ -0,0 +1,75 @@
+# Shared mutable containers: Settings (run configuration) and Memory
+# (plotting scratch state), passed through the analysis/plotting functions.
+module types
+
+using Dates
+
+export Settings, Memory
+
+"""
+Configuration read from settings.yaml plus derived data (station lists,
+station dictionary, ...) that drives a whole analysis run.
+"""
+mutable struct Settings
+    outputFilePath::String      # base directory for all output files
+    objectInFocus::String       # "single line" or "all lines"
+    analyzedLine::String        # line number currently being analyzed
+    estimatedTimesPath::String  # CSV with scheduled events (Sollereignisse)
+    realTimeDataPath::String    # CSV with actual events (Istmeldungen)
+    stationsListPath::String    # CSV with station master data
+    mode::String                # "statistical variation", "black list", or a train number
+    allLines::Vector{String}
+    # presumably maps DS100 station codes to display names — TODO confirm
+    # against the code that fills it
+    stationDict::Dict{Any, Any}
+    totallines::Int
+    # NOTE(review): Vector{Any} fields prevent specialization; consider
+    # concrete element types once the stored shapes are settled.
+    stationLists::Vector{Any}
+    commonStations::Vector{Any}
+    quantile::Vector{Any}       # quantiles as strings, e.g. ["84","90",...]
+    timePeriod::Vector{String}
+    approach::String            # "registration points" or "between registration points"
+    gamedays::Vector{Any}
+end
+
+# Empty default constructor matching the field order above.
+Settings()=Settings("","","","","","","",[],Dict(),0,[],[],[],[],"",[])
+
+"""
+Mutable scratch state shared by the plotting routines: titles, axis labels,
+font sizes, tick configuration and collected per-line distributions.
+"""
+mutable struct Memory
+    distributionMin::Vector{Any}  # collected per-line quantile curves (minutes)
+    title::String
+    titleSize::Int
+    stationList::Vector{Any}      # x-axis tick labels
+    focus::String
+    direction1::String
+    direction2::String
+    size::Tuple{Any,Any}          # plot size (width, height)
+    legendSize::Int
+    legendPosition::Symbol
+    linewidth::Int
+    barwidth::Float64
+    tickSize::Int
+    guidefontsize:: Int
+    xRotation::Float64
+    xLabel::String
+    yTicks::StepRange{Any,Any}
+    yLabel::String
+    color::Array                  # NOTE(review): untyped Array — consider Vector{Symbol}
+
+end
+
+# Default Memory with neutral/empty values; the plotting code overwrites the
+# relevant fields before each plot.
+Memory() = Memory(
+    [],
+    "",
+    0,
+    ["x","x"],
+    "",
+    "",
+    "",
+    (200, 100),
+    0,
+    :outerbottom,
+    0,
+    0.0,
+    0,
+    0,
+    0.0,
+    "",
+    0:1:2,
+    "",
+    [:blue],
+)
+
+
+
+end