Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GSOC'24 Amharic chapter] Implement Date time parser for Ethiopian Calendar #763

Merged
merged 12 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ object DateTimeParserConfig
val monthsMap = Map(
// For "ar" configuration, right-to-left rendering may seem like a bug, but it's not.
// Don't change this unless you know how it is done.
"am" -> Map("january"->1,"february"->2,"march"->3,"april"->4,"may"->5,"june"->6,"july"->7,"august"->8,"september"->9,"october"->10,"november"->11,"december"->12,
jimkont marked this conversation as resolved.
Show resolved Hide resolved
"ጃንዩወሪ" -> 1, "ፌብሩወሪ" -> 2,"ማርች" -> 3,"ኤፕሪል" -> 4,"ሜይ" -> 5,"ጁን" -> 6,"ጁላይ" -> 7,"ኦገስት" -> 8,"ሴፕተምበር" -> 9,"ኦክቶበር" -> 10,"ኖቬምበር" -> 11,"ዲሴምበር" -> 12),
"ar" -> Map("جانفي"->1,"فيفري"->2,"مارس"->3,"أفريل"->4,"ماي"->5,"جوان"->6,"جويلية"->7,"أوت"->8,"سبتمبر"->9,"أكتوبر"->10,"نوفمبر"->11,"ديسمبر"->12,
"يناير"->1,"فبراير"->2,"أبريل"->4,"مايو"->5,"يونيو"->6,"يوليو"->7,"يوليوز"->7,"أغسطس"->8,"غشت"->8,"شتنبر"->9,"نونبر"->11,"دجنبر"->12),
"bg" -> Map("януари"->1,"февруари"->2,"март"->3,"април"->4,"май"->5,"юни"->6,"юли"->7,"август"->8,"септември"->9,"октомври"->10,"ноември"->11,"декември"->12),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package org.dbpedia.extraction.config.dataparser

/**
 * Lookup tables used when parsing dates written in the Ethiopian calendar.
 *
 * Both tables are built from ordered lists so that the position of an entry
 * determines the number it is associated with.
 */
object EthiopianDateParserConfig {

  // Geez glyphs for 1 to 9, indexed by (value - 1).
  private val geezUnits = Vector("፩", "፪", "፫", "፬", "፭", "፮", "፯", "፰", "፱")

  // Geez glyphs for 10, 20 and 30, indexed by (tens digit - 1).
  private val geezTens = Vector("፲", "፳", "፴")

  // Ethiopian month names in calendar order; the 13th entry is the short month.
  private val orderedMonthNames = Vector(
    "መስከረም",
    "ጥቅምት",
    "ኅዳር",
    "ታኅሳስ",
    "ጥር",
    "የካቲት",
    "መጋቢት",
    "ሚያዝያ",
    "ግንቦት",
    "ሰኔ",
    "ሐምሌ",
    "ነሐሴ",
    "ጳጉሜ"
  )

  /** Day-of-month number (1 to 30) mapped to its spelling in Geez numerals. */
  val geezNumberDateMap: Map[Int, String] =
    (1 to 30).map { number =>
      // A Geez day number is the tens glyph (if any) followed by the unit glyph (if any).
      val tensGlyph = if (number >= 10) geezTens(number / 10 - 1) else ""
      val unitGlyph = if (number % 10 != 0) geezUnits(number % 10 - 1) else ""
      number -> (tensGlyph + unitGlyph)
    }.toMap

  /** Ethiopian month name mapped to its month number (1 to 13). */
  val monthsMap: Map[String, Int] =
    orderedMonthNames.zipWithIndex.map { case (name, index) => name -> (index + 1) }.toMap

}
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ class DateTimeParser ( context :
// Logger named after the runtime class; marked @transient so it is not serialized with the parser.
@transient private val logger = Logger.getLogger(getClass.getName)

// language-specific configurations

// Wiki code of the context language when DateTimeParserConfig lists it as supported, otherwise "en".
private val language = if(DateTimeParserConfig.supportedLanguages.contains(context.language.wikiCode)) context.language.wikiCode else "en"

// Each lookup below falls back to the "en" entry when the selected language has no entry of its own.
private val months = DateTimeParserConfig.monthsMap.getOrElse(language, DateTimeParserConfig.monthsMap("en"))
private val eraStr = DateTimeParserConfig.eraStrMap.getOrElse(language, DateTimeParserConfig.eraStrMap("en"))
private val cardinalityRegex = DateTimeParserConfig.cardinalityRegexMap.getOrElse(language, DateTimeParserConfig.cardinalityRegexMap("en"))
// Date templates have no English fallback: a language without an entry gets an empty map.
private val templates = DateTimeParserConfig.templateDateMap.getOrElse(language, Map())

// Parser for Ethiopian-calendar dates; findDate consults it only when language == "am".
// NOTE(review): `datatype:Datatype` and `strict:Boolean` are type ascriptions on values that are
// presumably members of the enclosing class (not visible in this hunk) - confirm.
private val ethiopianDateParser = new EthiopianDateParser(datatype:Datatype, strict:Boolean);
// parse logic configurations

override val splitPropertyNodeRegex: String = if (DataParserConfig.splitPropertyNodeRegexDateTime.contains(language))
Expand Down Expand Up @@ -189,7 +189,17 @@ class DateTimeParser ( context :
}

private def findDate(input: String) : Option[Date] =

{

// scan for Ethiopian (geez) calendar dates
if(language == "am"){
for(date <- ethiopianDateParser.findGeezDate(input))
{
return Some(date)
}
}

for(date <- catchDate(input))
{
return Some(date)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
package org.dbpedia.extraction.dataparser
import java.util.logging.{Logger, Level}
import scala.util.matching.Regex
import org.dbpedia.extraction.config.dataparser.{
EthiopianDateParserConfig,
DateTimeParserConfig
}
import org.dbpedia.extraction.util.{Language, Date}
import org.dbpedia.extraction.util.{GeezNumberUtils}
import org.dbpedia.extraction.ontology.datatypes.Datatype

/**
 * Finds dates written in the Ethiopian (Geez) calendar inside a string and converts them to
 * Gregorian [[Date]] values.
 *
 * Supported notations are purely numeric dates (day-month-year), dates with an Amharic month name
 * in either month-day-year or day-month-year order, and dates whose day and/or year are written
 * with Geez numerals. Components may be separated by whitespace, `/` or `-`.
 *
 * @param datatype datatype handed to every [[Date]] this parser creates; must not be null
 * @param strict   when true, only whitespace may surround the date; otherwise arbitrary text may
 *                 precede and follow it
 */
class EthiopianDateParser(datatype: Datatype, val strict: Boolean = false) {
  require(datatype != null, "datatype != null")
  @transient private val logger = Logger.getLogger(getClass.getName)

  val geezNumberParser = new GeezNumberUtils()
  private val monthsMap = EthiopianDateParserConfig.monthsMap
  // Alternation of all known month names, for embedding in the regexes below.
  private val monthsName = monthsMap.keys.mkString("|")
  // Alternation of the Geez spellings of the day numbers, for embedding in the regexes below.
  private val geezNumberDate =
    EthiopianDateParserConfig.geezNumberDateMap.values.mkString("|")

  // Marker that tags a date as Gregorian; inputs containing it are not treated as Ethiopian dates.
  // NOTE(review): the dots are unescaped, so each one matches any character, not only a literal '.'.
  private val gregorianDateIndicator = s""".*(እ.ኤ.አ).*""".r
  // The Regex extractors used in catchGeezDate must match the whole input, so the surrounding
  // text is absorbed by a prefix/postfix pattern chosen according to `strict`.
  private val prefix = if (strict) """\s*""" else """.*?"""
  private val postfix = if (strict) """\s*""" else ".*"

  // catches dd-mm-yyyy including a 13th month: 21 13 2013, 21-13-2013, 21/13/2013
  private val dateRegex1: Regex =
    s"""$prefix\\b(0?[1-9]|[12][0-9]|3[01])\\b[-/\\s]\\b(0?[1-9]|1[0-2]|13)\\b[-/\\s](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // Regex for dates containing geez characters
  // catches dates like ጥቅምት-21-2013 or ጥቅምት/21/2013 or ጥቅምት 21 2013
  // Capture groups: 1 = month, 2 = day (with word boundaries), 3 = day (nested), 4 = year.
  private val dateRegex2: Regex =
    s"""$prefix($monthsName)[\\s/-](\\b(0?[1-9]|[12][0-9]|3[01])\\b)[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // catches dates dd-month-yyyy like 21-ጥቅምት-2013 or 21/ጥቅምት/2013 or 21 ጥቅምት 2013
  // Capture groups: 1 = day (with word boundaries), 2 = day (nested), 3 = month, 4 = year.
  private val dateRegex3: Regex =
    s"""$prefix(\\b(0?[1-9]|[12][0-9]|3[01])\\b)[\\s/-]($monthsName)[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // catches dates month-dd-yyyy like ጥቅምት ፳፩ ፳፻፲፫ or ጥቅምት/፳፩/፳፻፲፫
  // NOTE(review): the leading \b belongs to the first month alternative only.
  private val dateRegex4: Regex =
    s"""$prefix(\\b$monthsName)[\\s/-]($geezNumberDate|0?[1-9]|[12][0-9]|3[01])[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  // catches dates dd-month-yyyy like ፳፩ ጥቅምት ፳፻፲፫ or ፳፩/ጥቅምት/፳፻፲፫ or 21/ጥቅምት/2013
  // NOTE(review): the leading \b belongs to the first day alternative only.
  private val dateRegex5: Regex =
    s"""$prefix(\\b$geezNumberDate|0?[1-9]|[12][0-9]|3[01])[\\s/-]($monthsName)[\\s/-](\\d{4}|[\\u1369-\\u137C]+)$postfix""".r

  /**
   * Tries each supported date pattern in turn and returns the raw components of the first match.
   *
   * @param dateString text that may contain an Ethiopian date
   * @return `Some((year, month, day))` with the matched substrings exactly as written
   *         (Arabic digits, Geez numerals or a month name), or `None` if no pattern matches
   */
  def catchGeezDate(dateString: String): Option[(String, String, String)] =
    dateString match {
      // purely numeric (day-month-year)
      case dateRegex1(day, month, year) => Some((year, month, day))
      // Amharic month names (month-day-year); this pattern has four capture groups because the
      // day group is nested, so the duplicate inner day group is skipped with `_`
      case dateRegex2(month, day, _, year) => Some((year, month, day))
      // Amharic month names (day-month-year); same nested day group as above
      case dateRegex3(day, _, month, year) => Some((year, month, day))
      // dates that contain geez/Amharic numbers (month-day-year)
      case dateRegex4(month, day, year) => Some((year, month, day))
      // dates that contain geez/Amharic numbers (day-month-year)
      case dateRegex5(day, month, year) => Some((year, month, day))
      case _ => None
    }

  /**
   * Leap-year rule used by this parser for the Ethiopian calendar.
   *
   * @return true when `year % 4 == 3`
   */
  def isLeapYear(year: Int): Boolean = year % 4 == 3

  /**
   * Checks that the given components form a date this parser accepts.
   *
   * The year must be positive, the month within 1 to 13 and the day within 1 to 30. Month 13
   * (Pagume) is limited to 6 days in leap years and 5 days otherwise. Every rejection is logged
   * at FINE level.
   *
   * @return true if the date is valid, false otherwise
   */
  def isValidEthiopianCalendarDate(year: Int, month: Int, day: Int): Boolean =
    if (year <= 0) {
      logger.log(Level.FINE, "Year must be greater than 0.")
      false
    } else if (month < 1 || month > 13) {
      logger.log(
        Level.FINE,
        s"Month must be between 1 and 13. Provided month: $month."
      )
      false
    } else if (day < 1 || day > 30) {
      logger.log(
        Level.FINE,
        s"Day must be between 1 and 30. Provided day: $day."
      )
      false
    } else if (month == 13 && day > 6) {
      // Pagume (month 13 in Ethiopian Calendar) never has more than 6 days
      logger.log(
        Level.FINE,
        s"Day in Pagume cannot exceed 6. Provided day: $day."
      )
      false
    } else if (month == 13 && !isLeapYear(year) && day > 5) {
      logger.log(
        Level.FINE,
        s"Pagume only has 5 days in non-leap years. Provided day: $day."
      )
      false
    } else {
      true
    }

  /**
   * Converts an Ethiopian calendar date to a Julian day number.
   *
   * NOTE(review): 1723856 is presumably the Julian day number of the Ethiopian calendar epoch -
   * confirm against the reference the formula was taken from.
   */
  private def ethiopianDateToJDN(year: Int, month: Int, day: Int): Double = {
    val epoch: Long = 1723856
    // Integer division deliberately floors year / 4 to count the leap days elapsed so far.
    (epoch + 365) + 365 * (year - 1) + (year / 4) + 30 * month + day - 31
  }

  /**
   * Converts an Ethiopian calendar date to the corresponding Gregorian date.
   *
   * The date is first turned into a Julian day number, which is then expanded into Gregorian
   * year, month and day. The input is not validated here; callers are expected to check it with
   * [[isValidEthiopianCalendarDate]] first.
   *
   * @param year     Ethiopian year
   * @param month    Ethiopian month (1 to 13)
   * @param day      Ethiopian day of month
   * @param datatype datatype of the resulting [[Date]]
   * @return the Gregorian date wrapped in `Some`
   */
  def geezToGregorianDateConverter(
      year: Int,
      month: Int,
      day: Int,
      datatype: Datatype
  ): Option[Date] = {
    val julianDay: Double = ethiopianDateToJDN(year, month, day)
    val q: Double = julianDay + 0.5
    val z: Long = q.toLong
    val w: Long = ((z - 1867216.25) / 36524.25).toLong
    val x: Long = w / 4
    val a: Long = z + 1 + w - x
    val b: Long = a + 1524
    val c: Long = ((b - 122.1) / 365.25).toLong
    val d: Long = (365.25 * c).toLong
    val e: Long = ((b - d) / 30.6001).toLong
    val f: Long = (30.6001 * e).toLong
    val gregorianDay: Int = (b - d - f + (q - z)).toInt
    val gregorianMonth: Long = if (e - 1 <= 12) e - 1 else e - 13
    // The year offset depends on the *Gregorian* month just computed (January and February
    // belong to the following year), not on the Ethiopian month passed in.
    val gregorianYear: Long = if (gregorianMonth <= 2) c - 4715 else c - 4716

    Some(
      new Date(
        Some(gregorianYear.toInt),
        Some(gregorianMonth.toInt),
        Some(gregorianDay),
        datatype
      )
    )
  }

  /** Returns true when every character of `str` is a digit (also true for the empty string). */
  def isArabicNumeral(str: String): Boolean = str.forall(_.isDigit)

  /**
   * Converts the raw date components returned by [[catchGeezDate]] into numbers.
   *
   * Components made only of digits are parsed directly. Other year and day components are
   * converted with the Geez number parser. Other month components are first looked up as a
   * month name and then, if unknown, converted with the Geez number parser. A component that
   * cannot be converted becomes 0.
   *
   * @param dateString optional `(year, month, day)` strings
   * @return the same components as `(year, month, day)` integers, or `None` if the input is `None`
   */
  def formatDate(
      dateString: Option[(String, String, String)]
  ): Option[(Int, Int, Int)] =
    dateString.map { case (year, month, day) =>
      // Shared conversion for components that may be written in Geez numerals.
      def geezOrZero(text: String): Int =
        geezNumberParser.convertGeezToArabicNumeral(text).getOrElse(0)

      val yearNum = if (isArabicNumeral(year)) year.toInt else geezOrZero(year)
      val monthNum =
        if (isArabicNumeral(month)) month.toInt
        else monthsMap.getOrElse(month, geezOrZero(month))
      val dayNum = if (isArabicNumeral(day)) day.toInt else geezOrZero(day)

      (yearNum, monthNum, dayNum)
    }

  /**
   * Looks for an Ethiopian calendar date in `input` and converts it to a Gregorian date.
   *
   * @param input text that may contain an Ethiopian date
   * @return the Gregorian date, or `None` when the input is marked as a Gregorian date, no date
   *         pattern matches, or the matched date is not a valid Ethiopian calendar date
   */
  def findGeezDate(input: String): Option[Date] = {
    val isGregorianDate = gregorianDateIndicator.findFirstIn(input).isDefined

    if (isGregorianDate) None
    else
      formatDate(catchGeezDate(input))
        .filter { case (yearNum, monthNum, dayNum) =>
          isValidEthiopianCalendarDate(yearNum, monthNum, dayNum)
        }
        .flatMap { case (yearNum, monthNum, dayNum) =>
          geezToGregorianDateConverter(yearNum, monthNum, dayNum, datatype)
        }
  }
}
Loading
Loading