Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update InfoboxReferencesExtractor.scala #656

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,20 @@ import org.dbpedia.extraction.util.RichString.wrapString
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.util._
import org.dbpedia.extraction.config.mappings.InfoboxExtractorConfig
import org.dbpedia.extraction.wikiparser.WikiPage

import scala.collection.mutable.ArrayBuffer
import org.dbpedia.extraction.config.dataparser.DataParserConfig
import org.dbpedia.extraction.mappings.MappingsLoader.loadOntologyProperty
import org.dbpedia.extraction.wikiparser.impl.simple.SimpleWikiParser
import org.dbpedia.iri.UriUtils

import scala.collection.mutable
import scala.language.reflectiveCalls

class InfoboxReferencesExtractor(
context : {
def mappings : Mappings
def ontology : Ontology
def language : Language
def redirects : Redirects
Expand All @@ -36,6 +41,8 @@ extends PageNodeExtractor

private val language = context.language

private val redirects = context.redirects

private val wikiCode = language.wikiCode

private val minRatioOfExplicitPropertyKeys = InfoboxExtractorConfig.minRatioOfExplicitPropertyKeys
Expand All @@ -52,10 +59,16 @@ extends PageNodeExtractor

private val TrailingNumberRegex = InfoboxExtractorConfig.TrailingNumberRegex

override val datasets = Set(DBpediaDatasets.InfoboxTest)
private val templateMappings = context.mappings.templateMappings

private val resolvedMappings: Map[String, Extractor[TemplateNode]] = context.redirects.resolveMap(templateMappings)

override val datasets = Set(DBpediaDatasets.InfoboxReferences,DBpediaDatasets.MappedInfoboxReferences)

override def extract(node : PageNode, subjectUri : String) : Seq[Quad] =
{
var refCount = 0
val citExt = new CitationExtractor(context)

if(node.title.namespace != Namespace.Main && !ExtractorUtils.titleContainsCommonsMetadata(node.title)) return Seq.empty

Expand Down Expand Up @@ -169,38 +182,92 @@ extends PageNodeExtractor
if !ignoreTemplatesRegex.exists(regex => regex.unapplySeq(getTitle).isDefined)
}
{
val grapH = extractMappings(template, subjectUri)

var propertiesFound = false

val propertyList = template.children.filterNot(property => ignoreProperties.get(wikiCode).getOrElse(ignoreProperties("en")).contains(property.key.toLowerCase))



val countPropertyKeys = propertyList.count(property => !property.key.forall(_.isDigit))
if ((countPropertyKeys >= minPropertyCount) && (countPropertyKeys.toDouble / propertyList.size) > minRatioOfExplicitPropertyKeys)
{
for(property <- propertyList) {
val propertyUri = getPropertyUri(property.key)

try {
if (infoboxesRaw contains template.title.decoded) {
if (infoboxesRaw(template.title.decoded) contains property.key) {

var tT = "template="+template.title.decoded.replace(" ","_")+"&property="+property.key.replace(" ","_")
var isMapped = ""
for(qd<-grapH) {
if (qd.context.contains("template="+template.title.decoded.replace(" ","_")+"&property="+property.key.replace(" ","_"))) {
quads += new Quad(language, DBpediaDatasets.MappedInfoboxReferences, qd.subject, qd.predicate, qd.value, qd.context, rdfLangStrDt)
isMapped = qd.predicate
}
}

val rawParameter = infoboxesRaw(template.title.decoded)(property.key)

val refWithContent = reg.findAllMatchIn(rawParameter).toArray
for (ref <- refWithContent) {
//val refB = ref.group("refn")
refCount+=1
val refValue = ref.group("refv")
quads += new Quad(language, DBpediaDatasets.InfoboxTest, subjectUri, propertyUri, refValue, property.sourceIri, rdfLangStrDt)
quads += new Quad(language, DBpediaDatasets.InfoboxReferences, subjectUri+"#ref:"+refCount, propertyUri, refValue, property.sourceIri, rdfLangStrDt)
if (isMapped != "") {
quads += new Quad(language, DBpediaDatasets.MappedInfoboxReferences, subjectUri+"#ref:"+refCount, isMapped, refValue, property.sourceIri, rdfLangStrDt)
}
// CitationExtractor needs to be fixed: template names without whitespace are ignored (e.g. "{{citeweb|...}}")
val templateToExt = new WikiPage(node.title,refValue.replace("|", " |"))
val citExtRes = citExt.extract(templateToExt,"")
for (qwa <- citExtRes) {
var qWa2 = new Quad(language, DBpediaDatasets.InfoboxReferences, qwa.subject, qwa.predicate, qwa.value, qwa.context, rdfLangStrDt)

if (qWa2.predicate.contains("isCitedBy")){
qWa2 = new Quad(language, DBpediaDatasets.InfoboxReferences, qwa.subject, qwa.predicate, subjectUri+"#ref:"+refCount, qwa.context, rdfLangStrDt)
}
quads += qWa2
if (isMapped != "") {
var qWa3 = new Quad(language, DBpediaDatasets.MappedInfoboxReferences, qwa.subject, isMapped, qwa.value, qwa.context, rdfLangStrDt)
if (qWa3.predicate.contains("isCitedBy")){
qWa3 = new Quad(language, DBpediaDatasets.MappedInfoboxReferences, qwa.subject, isMapped, subjectUri+"#ref:"+refCount, qwa.context, rdfLangStrDt)
}
quads += qWa3
}
}
}


val refWithOnlyName = regz.findAllMatchIn(rawParameter).toArray
if (refWithOnlyName.length > 0) {

for (refi <- refWithOnlyName) {
val nameRe = """name[ '\\"]?=[ ]?[ '\\"]?([^>'\\"]+)""".r("nm").findFirstMatchIn(refi.group("refz")).get.group("nm")
if (refNames.contains(nameRe)) {
quads += new Quad(language, DBpediaDatasets.InfoboxTest, subjectUri, propertyUri, refNames(nameRe), property.sourceIri, rdfLangStrDt)
refCount+=1
quads += new Quad(language, DBpediaDatasets.InfoboxReferences, subjectUri+"#ref:"+refCount, propertyUri, refNames(nameRe), property.sourceIri, rdfLangStrDt)
if (isMapped != "") {
quads += new Quad(language, DBpediaDatasets.MappedInfoboxReferences, subjectUri+"#ref:"+refCount, isMapped, refNames(nameRe), property.sourceIri, rdfLangStrDt)
}
val templateToExt = new WikiPage(node.title,refNames(nameRe).replace("|", " |"))
val citExtRes = citExt.extract(templateToExt,"")
for (qwa <- citExtRes) {
var qWa2 = new Quad(language, DBpediaDatasets.InfoboxReferences, qwa.subject, qwa.predicate, qwa.value, qwa.context, rdfLangStrDt)

if (qWa2.predicate.contains("isCitedBy")){
qWa2 = new Quad(language, DBpediaDatasets.InfoboxReferences, qwa.subject, qwa.predicate, subjectUri+"#ref:"+refCount, qwa.context, rdfLangStrDt)
}
quads += qWa2
if (isMapped != "") {
var qWa3 = new Quad(language, DBpediaDatasets.MappedInfoboxReferences, qwa.subject, isMapped, qwa.value, qwa.context, rdfLangStrDt)
if (qWa3.predicate.contains("isCitedBy")){
qWa3 = new Quad(language, DBpediaDatasets.MappedInfoboxReferences, qwa.subject, isMapped, subjectUri+"#ref:"+refCount, qwa.context, rdfLangStrDt)
}
quads += qWa3
}
}
}

}
Expand All @@ -209,11 +276,34 @@ extends PageNodeExtractor

val refWithOnlyName2 = regz2.findFirstMatchIn(rawParameter).toArray
if (refWithOnlyName2.length > 0) {

for (refi <- refWithOnlyName2) {
val refK = refi.group("refc").split("\\|")
for (refiz <- refK) { // why if name contains \?
if (refNames.contains(refiz)) {
quads += new Quad(language, DBpediaDatasets.InfoboxTest, subjectUri, propertyUri, refNames(refiz.replace("\\", "")), property.sourceIri, rdfLangStrDt)
refCount+=1
quads += new Quad(language, DBpediaDatasets.InfoboxReferences, subjectUri+"#ref:"+refCount, propertyUri, refNames(refiz.replace("\\", "")), property.sourceIri, rdfLangStrDt)
if (isMapped != "") {
quads += new Quad(language, DBpediaDatasets.MappedInfoboxReferences, subjectUri+"#ref:"+refCount, isMapped, refNames(refiz.replace("\\", "")), property.sourceIri, rdfLangStrDt)
}
val templateToExt = new WikiPage(node.title,refNames(refiz.replace("\\", "").replace("|", " |")))
val citExtRes = citExt.extract(templateToExt,"")
for (qwa <- citExtRes) {
var qWa2 = new Quad(language, DBpediaDatasets.InfoboxReferences, qwa.subject, qwa.predicate, qwa.value, qwa.context, rdfLangStrDt)
if (qWa2.predicate.contains("isCitedBy")){
qWa2 = new Quad(language, DBpediaDatasets.InfoboxReferences, qwa.subject, qwa.predicate, subjectUri+"#ref:"+refCount, qwa.context, rdfLangStrDt)
}
quads += qWa2
if (isMapped != "") {
var qWa3 = new Quad(language, DBpediaDatasets.MappedInfoboxReferences, qwa.subject, isMapped, qwa.value, qwa.context, rdfLangStrDt)
if (qWa3.predicate.contains("isCitedBy")){
qWa3 = new Quad(language, DBpediaDatasets.MappedInfoboxReferences, qwa.subject, isMapped, subjectUri+"#ref:"+refCount, qwa.context, rdfLangStrDt)
}
quads += qWa3
}

}

}

}
Expand Down Expand Up @@ -257,6 +347,32 @@ extends PageNodeExtractor
language.propertyUri.append(result)
}

/**
 * Collects the quads produced by the template mappings for a node.
 *
 * If the node is a template node with an entry in `resolvedMappings`
 * (looked up by its decoded title), that mapping is run against it.
 * When this yields nothing (no mapping, not a template node, or an
 * empty result), the search continues recursively through the node's
 * children and their results are concatenated.
 *
 * @param node       the node to inspect
 * @param subjectUri subject URI passed through to the mapping's extract call
 * @return the quads from the first level of the tree that produced any,
 *         or an empty sequence if none did
 */
private def extractMappings(node : Node, subjectUri : String) : Seq[Quad] =
{
  // Quads produced directly by this node, if it is a mapped template.
  val ownQuads : Seq[Quad] = node match
  {
    case tpl : TemplateNode =>
      resolvedMappings
        .get(tpl.title.decoded)
        .map(mapping => mapping.extract(tpl, subjectUri))
        .getOrElse(Seq.empty)
    case _ => Seq.empty
  }

  // Only descend into the children when this node contributed nothing itself.
  if (ownQuads.nonEmpty) ownQuads
  else node.children.flatMap(child => extractMappings(child, subjectUri))
}

}

Expand Down