# Auswertung_Archiv-Daten_S-B.../checkPerformance.jl
module performance
# Main module: reads the settings, calls the analysis functions,
# and sorts and organizes the performance data.
include("./types.jl")
include("./output.jl")
include("./plotting.jl")
include("./betweenRegistrationPoints.jl")
include("./registrationPoints.jl")
import YAML
using CSV, Dates, DataFrames, Statistics, StatsBase
using .types
using .output
using .plotting
using .betweenRegistrationPoints
using .registrationPoints
export startAnalyzing
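# Example usage (illustrative sketch; the settings file name is an assumption):
#   include("checkPerformance.jl")
#   using .performance
#   startAnalyzing("settings.yaml")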
function startAnalyzing(filePathSettings)
settings = readSettings(filePathSettings)
memory = Memory()
# if needed the match days are collected
if settings.timePeriod[1] == "match day"
df = DataFrame(CSV.File(settings.estimatedTimesPath))
df_station = df[df[:, :ZUGEREIGNIS_LINIE].==11, :]
settings.gamedays = df_station.SERVICE_START_ZEIT
unique!(settings.gamedays)
end
# the performance data is read and sorted
perfData = @time readPerfData(settings)
"""
The following conditional evaluation calls the different functions.
Possible modes: "statistical variation", "black list", "train number".
For "statistical variation" two different approaches are being used.
For "black list" or a single "train number" all selected line numbers are
analyzed to get detailed information about the deviation for each
registration point in one DataFrame. The function "top100" creates "black
lists" for each week, the function "top1" analyzes a single train number
for the average deviation and the median.
"""
if settings.mode != "statistical variation"
#for "black list or single train number
allPerfData = DataFrame()
currentLine = 1
for line in settings.allLines
settings.analyzedLine = line
perfDataLine = @time createFiles(perfData, settings)
settings.commonStations = intersect(
settings.commonStations,
settings.stationLists[currentLine],
)
if currentLine == 1
allPerfData = perfDataLine
else
append!(allPerfData, perfDataLine)
end
currentLine += 1
println("")
end
if settings.mode == "black list"
@time top100(allPerfData, settings)
else
@time top1(allPerfData, settings)
end
else # settings.mode == "statistical variation"
if settings.approach == "registration points"
# deviation at each registration point
settings.commonStations = collect(keys(settings.stationDict))
if settings.objectInFocus == "single line"
allPerfData = @time createFiles(perfData, settings)
quantileD1, quantileD2 =
@time analyzeStatisticalVariation(allPerfData, settings)
@time plotEverything(quantileD1, quantileD2, settings, memory)
elseif settings.objectInFocus == "all lines"
for line in settings.allLines
settings.analyzedLine = line
linePerfData = @time createFiles(perfData, settings)
q1, q2 = @time analyzeStatisticalVariation(
linePerfData,
settings,
)
@time plotEverything(q1, q2, settings, memory)
println("")
end
end
elseif settings.approach == "between registration points"
settings.commonStations = collect(keys(settings.stationDict))
if settings.objectInFocus == "single line"
allPerfData = @time createFiles(perfData, settings)
plotData1, plotData2 =
@time getDifferences(allPerfData, settings)
@time plotEverything(plotData1, plotData2, settings, memory)
elseif settings.objectInFocus == "all lines"
for line in settings.allLines
settings.analyzedLine = line
allPerfData = createFiles(perfData, settings)
plotData1, plotData2 =
@time getDifferences(allPerfData, settings)
@time plotEverything(plotData1, plotData2, settings, memory)
println("")
end
end
else
error("ERROR: No approach has been selected. Please do so.")
end
end
end
"""
The function is sorting the performance data and deleting duplicates. If only
specific days are needed, other days will be deleted or marked.
"""
function readPerfData(settings)
perfData =
DataFrame(CSV.File(settings.realTimeDataPath; header = 1, delim = ";"))
sizePerfData = size(perfData, 1)
println(
"The file ",
settings.realTimeDataPath,
" has ",
sizePerfData,
" rows.",
)
if settings.timePeriod[1] != "no"
perfData = selectSpecificDays(perfData, settings)
end
# duplicates are being deleted
select!(perfData, Not(:QUELLE_SENDER))
select!(perfData, Not(:EINGANGSZEIT))
perfData[!, :single] = nonunique(perfData)
perfData = perfData[perfData[:, :single].==false, :]
select!(perfData, Not(:single))
for row in eachrow(perfData)
if row.ZUGEREIGNIS_DS100 == "TS"
row.ZUGEREIGNIS_DS100 == "TS T"
end
end
println(
"Performance data has been sorted and saved. ",
sizePerfData - size(perfData, 1),
" row(s) has/have been deleted.",
)
return perfData
end
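# Minimal sketch of the duplicate-removal step in readPerfData (illustrative
# helper with an assumed name, not called by the pipeline): nonunique marks
# every repeated row with `true`, so keeping the `false` rows keeps one copy
# of each row.
function dropDuplicateRows(df)
    return df[.!nonunique(df), :]
end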
function selectSpecificDays(df1, settings)
if settings.timePeriod[1] == "match day" # days with match are being marked
gamedays = Any[]
day = Any[]
game = Any[]
for day in settings.gamedays
push!(gamedays, Dates.Date(Dates.DateTime(day, "dd.mm.yyyy HH:MM")))
unique!(gamedays)
end
settings.gamedays = copy(gamedays)
for row in eachrow(df1)
currentDay = Dates.Date(
Dates.DateTime(row.ZUGEREIGNIS_SOLLZEIT, "dd.mm.yyyy HH:MM"),
)
push!(day, currentDay)
if currentDay in settings.gamedays
push!(game, "yes")
else
push!(game, "no")
end
end
df1[!, :day] = day
df1[!, :game] = game
df_new = copy(df1)
#df_day = filter(row -> row[:day] in settings.gamedays, df1)
elseif settings.timePeriod[1] == "rush hour" # rush hour or not
rushHour = Any[]
startM = parse(Float64, settings.timePeriod[2])
endM = parse(Float64, settings.timePeriod[3])
startE = parse(Float64, settings.timePeriod[4])
endE = parse(Float64, settings.timePeriod[5])
for row in eachrow(df1)
currentH = Dates.Hour(
Dates.DateTime(row.ZUGEREIGNIS_SOLLZEIT, "dd.mm.yyyy HH:MM"),
)
currentM = Dates.Minute(
Dates.DateTime(row.ZUGEREIGNIS_SOLLZEIT, "dd.mm.yyyy HH:MM"),
)
# time of day encoded as hour + minutes/100, e.g. 07:45 -> 7.45
current = currentH.value + currentM.value / 100
if (current >= startM && current <= endM) ||
(current >= startE && current <= endE)
push!(rushHour, "yes")
else
push!(rushHour, "no")
end
end
df1[!, :rushHour] = rushHour
df_new = copy(df1)
saveOutput(df_new, settings)
else # comparison of two weekdays
df1[!, :dayname] = fill("day undefined", size(df1, 1))
for row in eachrow(df1)
if Dates.dayname(
Dates.DateTime(row.ZUGEREIGNIS_SOLLZEIT, "dd.mm.yyyy HH:MM"),
) == settings.timePeriod[1]
row.dayname = settings.timePeriod[1] # day 1
elseif Dates.dayname(
Dates.DateTime(row.ZUGEREIGNIS_SOLLZEIT, "dd.mm.yyyy HH:MM"),
) == settings.timePeriod[2]
row.dayname = settings.timePeriod[2] # day 2
end
end
df_new = df1[df1[:, :dayname].!="day undefined", :]
end
return df_new
end
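# Sketch of the rush-hour test used in selectSpecificDays (illustrative helper
# with an assumed name, not called by the pipeline). Assumption: times are
# encoded as hour + minutes/100, so e.g. "7.45" in the settings means 07:45.
function inRushHour(t::Dates.DateTime, startM, endM, startE, endE)
    current = Dates.hour(t) + Dates.minute(t) / 100
    return (startM <= current <= endM) || (startE <= current <= endE)
end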
"""
Settings are being saved and a dictionary for the station names is being created.
"""
function readSettings(filePathSettings)
data = YAML.load(open(filePathSettings))
setting = Settings()
if haskey(data["settings"], "outputFilePath")
setting.outputFilePath = data["settings"]["outputFilePath"]
delete!(data["settings"], "outputFilePath")
else
error(
"ERROR at reading the settings yaml file: The keyword outputFilePath
is missing. It has to be added.",
)
end
if haskey(data["settings"], "objectInFocus")
setting.objectInFocus = data["settings"]["objectInFocus"]
delete!(data["settings"], "objectInFocus")
else
error(
"ERROR at reading the settings yaml file: The keyword objectInFocus
is missing. It has to be added.",
)
end
if haskey(data["settings"], "timePeriod")
setting.timePeriod = data["settings"]["timePeriod"]
delete!(data["settings"], "timePeriod")
else
error(
"ERROR at reading the settings yaml file: The keyword timePeriod is
missing. It has to be added.",
)
end
if haskey(data["settings"], "analyzedLine")
setting.analyzedLine = data["settings"]["analyzedLine"]
delete!(data["settings"], "analyzedLine")
else
error(
"ERROR at reading the settings yaml file: The keyword analyzedLine
is missing. It has to be added.",
)
end
if haskey(data["settings"], "estimatedTimesPath")
setting.estimatedTimesPath = data["settings"]["estimatedTimesPath"]
delete!(data["settings"], "estimatedTimesPath")
else
error(
"ERROR at reading the settings yaml file: The keyword
estimatedTimesPath is missing. It has to be added.",
)
end
if haskey(data["settings"], "realTimeDataPath")
setting.realTimeDataPath = data["settings"]["realTimeDataPath"]
delete!(data["settings"], "realTimeDataPath")
else
error(
"ERROR at reading the settings yaml file: The keyword realTimeData
is missing. It has to be added.",
)
end
if haskey(data["settings"], "stationsListPath")
setting.stationsListPath = data["settings"]["stationsListPath"]
delete!(data["settings"], "stationsListPath")
else
error(
"ERROR at reading the settings yaml file: The keyword
stationsListPath is missing. It has to be added.",
)
end
if haskey(data["settings"], "mode")
setting.mode = data["settings"]["mode"]
delete!(data["settings"], "mode")
else
error(
"ERROR at reading the settings yaml file: The keyword mode is
missing. It has to be added.",
)
end
if haskey(data["settings"], "allLines")
setting.allLines = data["settings"]["allLines"]
delete!(data["settings"], "allLines")
else
error(
"ERROR at reading the settings yaml file: The keyword allLines is
missing. It has to be added.",
)
end
if haskey(data["settings"], "quantile")
setting.quantile = data["settings"]["quantile"]
delete!(data["settings"], "quantile")
else
error(
"ERROR at reading the settings yaml file: The keyword quantile is
missing. It has to be added.",
)
end
if haskey(data["settings"], "approach")
setting.approach = data["settings"]["approach"]
delete!(data["settings"], "approach")
else
error(
"ERROR at reading the settings yaml file: The keyword singleQuantile
is missing. It has to be added.",
)
end
# station dict for DS100 => name of station
stationDict = createStationDict(readlines(open(setting.stationsListPath)))
stationDict["TFL"] = "Stuttgart Flughafen Messe"
stationDict["TBO"] = "Boeblingen"
setting.stationDict = stationDict
return setting
end
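# Example layout of the settings yaml file read above (the keys are the ones
# checked in readSettings; the values are illustrative assumptions only):
#   settings:
#     outputFilePath: "./output/"
#     objectInFocus: "single line"       # or "all lines"
#     timePeriod: ["no"]                 # or ["match day"], ["rush hour", "7.00", "9.00", "16.00", "19.00"], or two weekday names
#     analyzedLine: "11"
#     estimatedTimesPath: "./data/estimatedTimes.csv"
#     realTimeDataPath: "./data/realTimeData.csv"
#     stationsListPath: "./data/stationsList.csv"
#     mode: "statistical variation"      # or "black list", "train number"
#     allLines: ["11", "12"]
#     quantile: 0.95
#     approach: "registration points"    # or "between registration points"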
function createStationDict(stationLines)
dic = Dict()
for x in stationLines
substring = split(x, ";")
push!(dic, substring[2] => substring[3])
end
return dic
end
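# Example of the expected station list line format (field values are
# assumptions; only fields 2 and 3, DS100 code and station name, are used):
#   createStationDict(["1;TBO;Boeblingen"]) returns Dict("TBO" => "Boeblingen")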
"""
For the selected line number the estimated times are being checked. The station
sequence is being read and a direction is assigned to each train number.
"""
function createFiles(perfData, settings)
trainNumber = readLineData(settings)
perfData = editFile(settings, perfData, trainNumber)
return perfData
end
function readLineData(settings)
df = DataFrame(CSV.File(settings.estimatedTimesPath))
df1 = df[df[:, :ZUGEREIGNIS_LINIE].==parse(Int, settings.analyzedLine), :]
trainNumber = unique(df1.ZUGEREIGNIS_ZUGNUMMER)
# sort the data in train sets
df1 = sort!(df1, [:SERVICE_ID, :SERVICE_START_ZEIT], rev = (false, true))
# row numbers for easier referencing
df1[!, :rownumber] = axes(df1, 1)
maxHALT_NR = maximum(df1.SERVICE_HALTNR)
newTrains = findall(x -> x == 1, df1.SERVICE_HALTNR)
endOfMaxStopsTrains = findall(x -> x == maxHALT_NR, df1.SERVICE_HALTNR)
endOfMaxStopsTrains = filter!(x -> x >= newTrains[1], endOfMaxStopsTrains)
i = 0
for x in newTrains
if x == endOfMaxStopsTrains[1] - maxHALT_NR + 1
i += 1
break
else
i += 1
end
end
# station lists for both directions are being created
i = newTrains[i]
stationsList = Any[]
while df1.SERVICE_HALTNR[i] != maxHALT_NR
push!(stationsList, df1.ZUGEREIGNIS_DS100[i])
i += 1
end
push!(stationsList, df1.ZUGEREIGNIS_DS100[i])
#saving the stationList in settings
push!(settings.stationLists, stationsList)
stationsListOneWay = unique(stationsList)
stationsListOtherWay = reverse(stationsList)
println(
"Line ",
settings.analyzedLine,
" is connecting ",
settings.stationDict[stationsListOneWay[1]],
" and ",
settings.stationDict[stationsListOneWay[size(stationsListOneWay, 1)]],
)
return trainNumber
end
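# Sketch of the station-sequence extraction in readLineData (illustrative
# helper with an assumed name, not called by the pipeline). Assumption: the
# rows of one complete run are consecutive and SERVICE_HALTNR runs 1..maxStop.
function extractStationSequence(haltNumbers, ds100, maxStop)
    start = findfirst(==(1), haltNumbers)
    stop = findnext(==(maxStop), haltNumbers, start)
    return ds100[start:stop]
end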
function editFile(settings, perfData, trainNumber)
perfData =
filter(row -> row[:ZUGEREIGNIS_ZUGNUMMER] in trainNumber, perfData)
if settings.objectInFocus == "single line"
lineNr = 1
else
lineNr = findall(x -> x == settings.analyzedLine, settings.allLines)
lineNr = lineNr[1]
end
stationList = settings.stationLists[lineNr]
directionE = "" # direction of trains with even train numbers
directionU = "" # direction of trains with uneven train numbers
direction = Any[]
perfData[!, :rownumber] = axes(perfData, 1)
for row in eachrow(perfData)
if row.ZUGEREIGNIS_TYP == 10 && row.ZUGEREIGNIS_DS100 == stationList[1]
if iseven(row.ZUGEREIGNIS_ZUGNUMMER)
directionE = stationList[length(stationList)]
directionU = stationList[1]
else
directionU = stationList[length(stationList)]
directionE = stationList[1]
end
break
end
end
for row in eachrow(perfData)
if iseven(row.ZUGEREIGNIS_ZUGNUMMER)
push!(direction, directionE)
else
push!(direction, directionU)
end
end
perfData[!, :ZUGEREIGNIS_RICHTUNG] = direction
perfData = sort!(
perfData,
[:SERVICE_ID, :ZUGEREIGNIS_SOLLZEIT],
rev = (true, false),
)
perfData[!, :ZUGEREIGNIS_LINIE] =
fill(settings.analyzedLine, size(perfData, 1))
println(
"Performance Data for line " *
settings.analyzedLine *
" has been modified.",
)
return perfData
end
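# Sketch of the even/uneven direction rule applied in editFile (illustrative
# helper with an assumed name, not called by the pipeline): the parity of the
# reference train departing from the first station of the list fixes the
# direction endpoints (directionE, directionU) for all trains of each parity.
function directionEndpoints(stationList, referenceTrainIsEven::Bool)
    firstStop, lastStop = stationList[1], stationList[end]
    return referenceTrainIsEven ? (lastStop, firstStop) : (firstStop, lastStop)
end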
end