Auswertung_Archiv-Daten_S-B.../checkPerformance.jl

module performance
# Main module: reads the settings, loads and cleans the performance data,
# and calls the analysis, plotting and output functions of the submodules.

include("./types.jl")
include("./output.jl")
include("./plotting.jl")
include("./betweenRegistrationPoints.jl")
include("./registrationPoints.jl")

import YAML

using CSV, Dates, DataFrames, Statistics, StatsBase
using .types
using .output
using .plotting
using .betweenRegistrationPoints
using .registrationPoints

export startAnalyzing

"""
    startAnalyzing(filePathSettings)

Run the analysis configured in the settings file at `filePathSettings`.

The settings are read, the performance data is loaded via `readPerfData`, and
the work is dispatched on `settings.mode`:

- `"statistical variation"`: depending on `settings.approach`
  (`"registration points"` or `"between registration points"`) either the
  currently selected line (`objectInFocus == "single line"`) or every line in
  `settings.allLines` (`objectInFocus == "all lines"`) is evaluated and plotted.
  Any other approach raises an error.
- any other mode: the data of every line in `settings.allLines` is collected in
  one `DataFrame` and passed to `top100` (mode `"black list"`) or to `top1`.
"""
function startAnalyzing(filePathSettings)
    settings = readSettings(filePathSettings)
    memory = Memory()

    # Match days are only gathered when the time period asks for them; they are
    # the distinct start times of the rows whose line number is 11.
    if settings.timePeriod[1] == "match day"
        estimatedTimes = DataFrame(CSV.File(settings.estimatedTimesPath))
        isLine11 = estimatedTimes[:, :ZUGEREIGNIS_LINIE] .== 11
        matchRows = estimatedTimes[isLine11, :]
        settings.gamedays = matchRows.SERVICE_START_ZEIT
        unique!(settings.gamedays)
    end

    # Load and clean the performance data.
    perfData = @time readPerfData(settings)

    # Possible modes: "statistical variation", "black list", "train number".
    # "statistical variation" offers two approaches. For the other modes the
    # data of all selected lines ends up in a single DataFrame, which is then
    # evaluated by top100 ("black list") or top1 (single train number).
    if settings.mode == "statistical variation"
        if settings.approach == "registration points"
            # deviation at each registration point
            settings.commonStations = collect(keys(settings.stationDict))
            if settings.objectInFocus == "single line"
                linePerfData = @time createFiles(perfData, settings)
                firstQuantiles, secondQuantiles =
                    @time analyzeStatisticalVariation(linePerfData, settings)
                @time plotEverything(firstQuantiles, secondQuantiles, settings, memory)
            elseif settings.objectInFocus == "all lines"
                for lineName in settings.allLines
                    settings.analyzedLine = lineName
                    linePerfData = @time createFiles(perfData, settings)
                    firstQuantiles, secondQuantiles =
                        @time analyzeStatisticalVariation(linePerfData, settings)
                    @time plotEverything(
                        firstQuantiles,
                        secondQuantiles,
                        settings,
                        memory,
                    )
                    println("")
                end
            end
        elseif settings.approach == "between registration points"
            settings.commonStations = collect(keys(settings.stationDict))
            if settings.objectInFocus == "single line"
                linePerfData = @time createFiles(perfData, settings)
                firstPlotData, secondPlotData =
                    @time getDifferences(linePerfData, settings)
                @time plotEverything(firstPlotData, secondPlotData, settings, memory)
            elseif settings.objectInFocus == "all lines"
                for lineName in settings.allLines
                    settings.analyzedLine = lineName
                    linePerfData = createFiles(perfData, settings)
                    firstPlotData, secondPlotData =
                        @time getDifferences(linePerfData, settings)
                    @time plotEverything(
                        firstPlotData,
                        secondPlotData,
                        settings,
                        memory,
                    )
                    println("")
                end
            end
        else
            error("ERROR: No approach has been selected. Please do so.")
        end
    else
        # "black list" or a single train number
        allPerfData = DataFrame()
        for (lineIndex, lineName) in enumerate(settings.allLines)
            settings.analyzedLine = lineName
            perfDataLine = @time createFiles(perfData, settings)
            # keep only the stations shared with the list stored at this index
            settings.commonStations =
                intersect(settings.commonStations, settings.stationLists[lineIndex])
            if lineIndex == 1
                allPerfData = perfDataLine
            else
                append!(allPerfData, perfDataLine)
            end
            println("")
        end
        if settings.mode == "black list"
            @time top100(allPerfData, settings)
        else
            @time top1(allPerfData, settings)
        end
    end
end

"""
    readPerfData(settings)

Load the performance data from `settings.realTimeDataPath` (semicolon
separated, header in the first row), clean it and return it as a `DataFrame`.

Steps:
1. If `settings.timePeriod[1]` is not `"no"`, the rows are marked or filtered
   by `selectSpecificDays`.
2. The columns `QUELLE_SENDER` and `EINGANGSZEIT` are dropped and duplicated
   rows are removed (the first occurrence is kept).
3. Every `ZUGEREIGNIS_DS100` value equal to `"TS"` is replaced by `"TS  T"`.

The number of rows of the raw file and the number of rows removed in total
(steps 1 and 2 together) are printed.
"""
function readPerfData(settings)
    perfData =
        DataFrame(CSV.File(settings.realTimeDataPath; header = 1, delim = ";"))

    sizePerfData = size(perfData, 1)
    println(
        "The file ",
        settings.realTimeDataPath,
        " has ",
        sizePerfData,
        " rows.",
    )

    if settings.timePeriod[1] != "no"
        perfData = selectSpecificDays(perfData, settings)
    end

    # Without these two columns, rows that differ only in them become
    # duplicates and are removed by `unique`.
    select!(perfData, Not([:QUELLE_SENDER, :EINGANGSZEIT]))
    perfData = unique(perfData)

    # Normalise the station code "TS" to "TS  T". `replace` returns a new
    # vector whose element type is widened if necessary, so the longer code
    # also fits when the column was read with a short fixed-width string type.
    perfData[!, :ZUGEREIGNIS_DS100] =
        replace(perfData[!, :ZUGEREIGNIS_DS100], "TS" => "TS  T")

    println(
        "Performance data has been sorted and saved. ",
        sizePerfData - size(perfData, 1),
        " row(s) has/have been deleted.",
    )

    return perfData
end

"""
    selectSpecificDays(df1, settings)

Mark or filter the rows of `df1` according to `settings.timePeriod`. The
scheduled time of a row is read from `ZUGEREIGNIS_SOLLZEIT` in the format
`dd.mm.yyyy HH:MM`. `df1` itself receives the additional column(s).

- `"match day"`: `settings.gamedays` is replaced by the distinct dates of its
  entries; the columns `day` (date of the row) and `game` (`"yes"`/`"no"`) are
  added and a copy of the data is returned.
- `"rush hour"`: entries 2-5 of `settings.timePeriod` are parsed as numbers and
  compared with `hour + minute / 100` of each row; the column `rushHour`
  (`"yes"`/`"no"`) is added, the copy is passed to `saveOutput` and returned.
- anything else: entries 1 and 2 of `settings.timePeriod` are treated as
  weekday names; only rows falling on one of them are returned, with the
  weekday stored in the column `dayname`.
"""
function selectSpecificDays(df1, settings)
    stampFormat = "dd.mm.yyyy HH:MM"
    periodKind = settings.timePeriod[1]

    if periodKind == "match day"
        # reduce the collected start times to distinct calendar dates
        matchDates = Any[]
        for stamp in settings.gamedays
            push!(matchDates, Dates.Date(Dates.DateTime(stamp, stampFormat)))
        end
        unique!(matchDates)
        settings.gamedays = copy(matchDates)

        dayColumn = Any[]
        gameColumn = Any[]
        for row in eachrow(df1)
            scheduledDay = Dates.Date(
                Dates.DateTime(row.ZUGEREIGNIS_SOLLZEIT, stampFormat),
            )
            push!(dayColumn, scheduledDay)
            push!(gameColumn, scheduledDay in settings.gamedays ? "yes" : "no")
        end
        df1[!, :day] = dayColumn
        df1[!, :game] = gameColumn
        return copy(df1)
    end

    if periodKind == "rush hour"
        morningStart = parse(Float64, settings.timePeriod[2])
        morningEnd = parse(Float64, settings.timePeriod[3])
        eveningStart = parse(Float64, settings.timePeriod[4])
        eveningEnd = parse(Float64, settings.timePeriod[5])

        rushColumn = Any[]
        for row in eachrow(df1)
            scheduled = Dates.DateTime(row.ZUGEREIGNIS_SOLLZEIT, stampFormat)
            # time of day written as hour + minute / 100, e.g. 07:30 -> 7.3
            clock = Dates.hour(scheduled) + Dates.minute(scheduled) / 100
            inMorning = morningStart <= clock <= morningEnd
            inEvening = eveningStart <= clock <= eveningEnd
            push!(rushColumn, (inMorning || inEvening) ? "yes" : "no")
        end
        df1[!, :rushHour] = rushColumn
        marked = copy(df1)
        saveOutput(marked, settings)
        return marked
    end

    # comparison of two weekdays
    df1[!, :dayname] = fill("day undefined", size(df1, 1))
    for row in eachrow(df1)
        weekday = Dates.dayname(
            Dates.DateTime(row.ZUGEREIGNIS_SOLLZEIT, stampFormat),
        )
        if weekday == settings.timePeriod[1]
            row.dayname = settings.timePeriod[1] # day 1
        elseif weekday == settings.timePeriod[2]
            row.dayname = settings.timePeriod[2] # day 2
        end
    end
    return df1[df1[:, :dayname] .!= "day undefined", :]
end

"""
    readSettings(filePathSettings)

Read the YAML file at `filePathSettings` and return a populated `Settings`
object.

Every required keyword below the top-level key `settings` is copied to the
field of the same name. In addition the station list at
`setting.stationsListPath` is read and stored as the dictionary
`setting.stationDict`; the entries for `"TFL"` and `"TBO"` are always set to
fixed names.

Throws an `ErrorException` naming the first required keyword that is missing.
"""
function readSettings(filePathSettings)
    # The do-block makes sure the file handle is closed again.
    data = open(filePathSettings) do io
        YAML.load(io)
    end
    setting = Settings()

    # Required keywords, checked in this order.
    requiredKeys = (
        "outputFilePath",
        "objectInFocus",
        "timePeriod",
        "analyzedLine",
        "estimatedTimesPath",
        "realTimeDataPath",
        "stationsListPath",
        "mode",
        "allLines",
        "quantile",
        "approach",
    )

    entries = data["settings"]
    for key in requiredKeys
        if !haskey(entries, key)
            error(
                "ERROR at reading the settings yaml file: The keyword " *
                key *
                " is missing. It has to be added.",
            )
        end
        # same as `setting.<key> = entries[key]`
        setproperty!(setting, Symbol(key), entries[key])
    end

    # station dict for DS100 => name of station;
    # `readlines(path)` opens and closes the file itself
    stationDict = createStationDict(readlines(setting.stationsListPath))
    stationDict["TFL"] = "Stuttgart Flughafen Messe"
    stationDict["TBO"] = "Boeblingen"
    setting.stationDict = stationDict

    return setting
end

"""
    createStationDict(stationDict)

Build a lookup table from the lines in `stationDict`. Each line is split at
`;`; the second field becomes the key and the third field the value. When a key
occurs more than once, the last line wins. Returns an untyped `Dict`.
"""
function createStationDict(stationDict)
    lookup = Dict()
    foreach(stationDict) do entry
        fields = split(entry, ";")
        lookup[fields[2]] = fields[3]
    end
    return lookup
end

"""
    createFiles(perfData, settings)

Prepare the performance data of the line selected in `settings.analyzedLine`:
`readLineData` reads the estimated times of that line (storing its station
sequence in `settings`) and returns its train numbers, which `editFile` uses to
filter `perfData` and to assign a direction to each train number. The edited
`DataFrame` is returned.
"""
function createFiles(perfData, settings)
    return editFile(settings, perfData, readLineData(settings))
end

"""
    readLineData(settings)

Read the estimated times from `settings.estimatedTimesPath` for the line given
in `settings.analyzedLine` and return the distinct train numbers of that line.

As a side effect the station sequence (`ZUGEREIGNIS_DS100` codes) of one run
that ends at the highest stop number of the line is appended to
`settings.stationLists`, and the names of the first and last station of that
sequence are printed.
"""
function readLineData(settings)
    estimatedTimes = DataFrame(CSV.File(settings.estimatedTimesPath))
    onLine =
        estimatedTimes[:, :ZUGEREIGNIS_LINIE] .== parse(Int, settings.analyzedLine)
    lineRows = estimatedTimes[onLine, :]
    trainNumber = unique(lineRows.ZUGEREIGNIS_ZUGNUMMER)

    # sort the data in train sets
    sort!(lineRows, [:SERVICE_ID, :SERVICE_START_ZEIT], rev = (false, true))

    haltNumbers = lineRows.SERVICE_HALTNR
    stationCodes = lineRows.ZUGEREIGNIS_DS100

    maxHALT_NR = maximum(haltNumbers)
    runStarts = findall(x -> x == 1, haltNumbers)
    # rows carrying the highest stop number, not before the first run start
    fullRunEnds = filter!(
        x -> x >= runStarts[1],
        findall(x -> x == maxHALT_NR, haltNumbers),
    )

    # Take the run start lying exactly maxHALT_NR - 1 rows before the first
    # full-length run end; if no start matches, fall back to the last one.
    wantedStart = fullRunEnds[1] - maxHALT_NR + 1
    hit = findfirst(x -> x == wantedStart, runStarts)
    cursor = runStarts[hit === nothing ? length(runStarts) : hit]

    # collect the station codes from that start up to the highest stop number
    stationsList = Any[]
    while haltNumbers[cursor] != maxHALT_NR
        push!(stationsList, stationCodes[cursor])
        cursor += 1
    end
    push!(stationsList, stationCodes[cursor])

    # saving the stationList in settings
    push!(settings.stationLists, stationsList)

    stationsListOneWay = unique(stationsList)
    firstName = settings.stationDict[stationsListOneWay[1]]
    lastName = settings.stationDict[stationsListOneWay[end]]
    println(
        "Line ",
        settings.analyzedLine,
        " is connecting ",
        firstName,
        " and ",
        lastName,
    )

    return trainNumber
end

"""
    editFile(settings, perfData, trainNumber)

Restrict `perfData` to the train numbers in `trainNumber` and annotate it for
the line in `settings.analyzedLine`. The returned `DataFrame` has three extra
columns:

- `rownumber`: row index before sorting,
- `ZUGEREIGNIS_RICHTUNG`: a station code taken from either end of the line's
  station list, chosen by the parity of the train number,
- `ZUGEREIGNIS_LINIE`: the analyzed line.

The parity mapping is taken from the first row with `ZUGEREIGNIS_TYP == 10`
at the first station of the list: that train's parity is assigned the last
station and the other parity the first station. Without such a row both
directions stay empty strings. The result is sorted by `SERVICE_ID`
and `ZUGEREIGNIS_SOLLZEIT`.
"""
function editFile(settings, perfData, trainNumber)
    perfData =
        filter(row -> row[:ZUGEREIGNIS_ZUGNUMMER] in trainNumber, perfData)

    # index of the station list that belongs to the analyzed line
    lineNr = if settings.objectInFocus == "single line"
        1
    else
        findall(x -> x == settings.analyzedLine, settings.allLines)[1]
    end
    stationList = settings.stationLists[lineNr]

    directionE = ""  # direction of trains with even train numbers
    directionU = ""  # direction of trains with uneven train numbers

    perfData[!, :rownumber] = axes(perfData, 1)

    # the first matching row fixes which parity runs towards which end
    for row in eachrow(perfData)
        if row.ZUGEREIGNIS_TYP == 10 && row.ZUGEREIGNIS_DS100 == stationList[1]
            if iseven(row.ZUGEREIGNIS_ZUGNUMMER)
                directionE, directionU = stationList[end], stationList[1]
            else
                directionU, directionE = stationList[end], stationList[1]
            end
            break
        end
    end

    perfData[!, :ZUGEREIGNIS_RICHTUNG] = Any[
        iseven(row.ZUGEREIGNIS_ZUGNUMMER) ? directionE : directionU for
        row in eachrow(perfData)
    ]

    sort!(perfData, [:SERVICE_ID, :ZUGEREIGNIS_SOLLZEIT], rev = (true, false))

    perfData[!, :ZUGEREIGNIS_LINIE] =
        fill(settings.analyzedLine, size(perfData, 1))

    println(
        "Performance Data for line " *
        settings.analyzedLine *
        " has been modified.",
    )

    return perfData
end


end