Correct html cleanup functions

Somehow a bunch of the cleanup functions were returning nil, probably due to some debugging code that got left in. This caused Wordpress responses to not have a body, which is now fixed. Resolves #31
PushOCCRP · May 27, 2020 · 0b58fb9 · 0b58fb9
1 parent 6ffbfda
commit 0b58fb9
Show file tree

Hide file tree

Showing 4 changed files with 14 additions and 9 deletions.
diff --git a/app/controllers/articles_controller.rb b/app/controllers/articles_controller.rb
@@ -201,7 +201,6 @@ def clean_up_response(articles)
         article["body"] = scrubScriptTagsFromHTMLString article["body"]
         article["body"] = scrubJSCommentsFromHTMLString article["body"]
         article["body"] = scrubSpecialCharactersFromSingleLinesInHTMLString article["body"]
-        article["body"] = scrubHTMLSpecialCharactersInHTMLString article["body"]
         article["headline"] = HTMLEntities.new.decode(article["headline"])
       end
 

diff --git a/app/models/cms.rb b/app/models/cms.rb
@@ -405,10 +405,12 @@ def self.scrubWordpressTagsFromHTMLString(html_string) # rubocop:disable Naming/
       html_string
     end
 
-    def self.scrubCDataTags(html_string) # rubocop:disable Naming/MethodName
-      # scrubbed = html_string.gsub("// <![CDATA[", "")
-      # scrubbed = scrubbed.gsub("// ]]", "")
-    end
+    # The body of this method was already commented out, so it did nothing and returned nil.
+    # Comment out the whole method so any remaining caller raises an error instead of getting nil.
+    # def self.scrubCDataTags(html_string) # rubocop:disable Naming/MethodName
+    #   # scrubbed = html_string.gsub("// <![CDATA[", "")
+    #   # scrubbed = scrubbed.gsub("// ]]", "")
+    # end
 
     # \/\/.+
     def self.scrubJSCommentsFromHTMLString(html_string) # rubocop:disable Naming/MethodName
@@ -421,8 +423,12 @@ def self.scrubSpecialCharactersFromSingleLinesInHTMLString(html_string) # ruboco
       scrubbed
     end
 
+    # The actual scrubbing below is commented out, so this method is currently a no-op:
+    # it returns html_string unchanged (rather than nil) so callers still get a body back.
     def self.scrubHTMLSpecialCharactersInHTMLString(html_string) # rubocop:disable Naming/MethodName
       # scrubbed = html_string.gsub(/^&[a-z0-9]+;/, "")
+      # scrubbed
+      html_string
     end
 
     def self.scrubScriptTagsFromHTMLString(html_string) # rubocop:disable Naming/MethodName

diff --git a/app/models/joomla_occrp.rb b/app/models/joomla_occrp.rb
@@ -172,7 +172,9 @@ def self.language_parameter(language)
 
     def self.clean_up_for_wordpress(articles)
       articles.each do |article|
-        article["body"] = scrubCDataTags article["body"]
+        # This call is kept commented out for reference rather than deleted. Please test
+        # against Joomla before removing it completely.
+        # article["body"] = scrubCDataTags article["body"]
         article["body"] = scrubScriptTagsFromHTMLString article["body"]
         article["body"] = scrubWordpressTagsFromHTMLString article["body"]
         # article['body'] = cleanUpNewLines article['body']

diff --git a/app/models/wordpress.rb b/app/models/wordpress.rb
@@ -138,7 +138,7 @@ def self.get_url(path, language, options = {})
 
     def self.make_request(url)
       logger.debug("Making request to #{url}")
-      response = HTTParty.get(CGI.encode(url))
+      response = HTTParty.get(url)
 
       begin
         body = JSON.parse response.body
@@ -202,13 +202,11 @@ def self.language_parameter(language)
 
     def self.clean_up_for_wordpress(articles)
       articles.each do |article|
-        article["body"] = scrubCDataTags article["body"]
         article["body"] = scrubScriptTagsFromHTMLString article["body"]
         article["body"] = scrubWordpressTagsFromHTMLString article["body"]
         # article['body'] = cleanUpNewLines article['body']
         article["body"] = scrubJSCommentsFromHTMLString article["body"]
         article["body"] = scrubSpecialCharactersFromSingleLinesInHTMLString article["body"]
-        article["body"] = scrubHTMLSpecialCharactersInHTMLString article["body"]
         article["body"] = normalizeSpacing article["body"]
         article["body"] = handle_paragraph_tags article["body"]