mirror of
https://git.youjo.love/youjo/youjo-be.git
synced 2024-11-20 13:59:55 +01:00
Make Floki use fast_html
This commit is contained in:
parent
58299fcfb4
commit
ea1631d7e6
7 changed files with 30 additions and 13 deletions
|
@ -612,6 +612,8 @@ config :pleroma, :modules, runtime_dir: "instance/modules"
|
|||
|
||||
config :pleroma, configurable_from_database: false
|
||||
|
||||
config :floki, :html_parser, Floki.HTMLParser.FastHtml
|
||||
|
||||
# Import environment specific config. This must remain at the bottom
|
||||
# of this file so it overrides the configuration defined above.
|
||||
import_config "#{Mix.env()}.exs"
|
||||
|
|
|
@ -108,6 +108,7 @@ defmodule Pleroma.HTML do
|
|||
Cachex.fetch!(:scrubber_cache, key, fn _key ->
|
||||
result =
|
||||
content
|
||||
|> Floki.parse_fragment!()
|
||||
|> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"]")
|
||||
|> Floki.attribute("a", "href")
|
||||
|> Enum.at(0)
|
||||
|
|
|
@ -17,6 +17,7 @@ defmodule Pleroma.Web.ActivityPub.MRF.AntiLinkSpamPolicy do
|
|||
# does the post contain links?
|
||||
defp contains_links?(%{"content" => content} = _object) do
|
||||
content
|
||||
|> Floki.parse_fragment!()
|
||||
|> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"],a.zrl")
|
||||
|> Floki.attribute("a", "href")
|
||||
|> length() > 0
|
||||
|
|
|
@ -8,8 +8,10 @@ defmodule Pleroma.Web.Metadata.Providers.RelMe do
|
|||
|
||||
@impl Provider
|
||||
def build_tags(%{user: user}) do
|
||||
(Floki.attribute(user.bio, "link[rel~=me]", "href") ++
|
||||
Floki.attribute(user.bio, "a[rel~=me]", "href"))
|
||||
bio_tree = Floki.parse_fragment!(user.bio)
|
||||
|
||||
(Floki.attribute(bio_tree, "link[rel~=me]", "href") ++
|
||||
Floki.attribute(bio_tree, "a[rel~=me]", "href"))
|
||||
|> Enum.map(fn link ->
|
||||
{:link, [rel: "me", href: link], []}
|
||||
end)
|
||||
|
|
|
@ -27,9 +27,10 @@ defmodule Pleroma.Web.RelMe do
|
|||
defp parse_url(url) do
|
||||
with {:ok, %Tesla.Env{body: html, status: status}} when status in 200..299 <-
|
||||
Pleroma.HTTP.get(url, [], adapter: @hackney_options),
|
||||
{:ok, html_tree} <- Floki.parse_document(html),
|
||||
data <-
|
||||
Floki.attribute(html, "link[rel~=me]", "href") ++
|
||||
Floki.attribute(html, "a[rel~=me]", "href") do
|
||||
Floki.attribute(html_tree, "link[rel~=me]", "href") ++
|
||||
Floki.attribute(html_tree, "a[rel~=me]", "href") do
|
||||
{:ok, data}
|
||||
end
|
||||
rescue
|
||||
|
|
|
@ -81,18 +81,18 @@ defmodule Pleroma.Web.RichMedia.Parser do
|
|||
{:ok, %Tesla.Env{body: html}} = Pleroma.HTTP.get(url, [], adapter: @hackney_options)
|
||||
|
||||
html
|
||||
|> parse_html
|
||||
|> parse_html()
|
||||
|> maybe_parse()
|
||||
|> Map.put(:url, url)
|
||||
|> clean_parsed_data()
|
||||
|> check_parsed_data()
|
||||
rescue
|
||||
e ->
|
||||
{:error, "Parsing error: #{inspect(e)}"}
|
||||
{:error, "Parsing error: #{inspect(e)} #{inspect(__STACKTRACE__)}"}
|
||||
end
|
||||
end
|
||||
|
||||
defp parse_html(html), do: Floki.parse(html)
|
||||
defp parse_html(html), do: Floki.parse_document!(html)
|
||||
|
||||
defp maybe_parse(html) do
|
||||
Enum.reduce_while(parsers(), %{}, fn parser, acc ->
|
||||
|
|
|
@ -7,11 +7,14 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
|
|||
alias Pleroma.Web.RichMedia.Parsers.TwitterCard
|
||||
|
||||
test "returns error when html not contains twitter card" do
|
||||
assert TwitterCard.parse("", %{}) == {:error, "No twitter card metadata found"}
|
||||
assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) ==
|
||||
{:error, "No twitter card metadata found"}
|
||||
end
|
||||
|
||||
test "parses twitter card with only name attributes" do
|
||||
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html")
|
||||
html =
|
||||
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html")
|
||||
|> Floki.parse_document!()
|
||||
|
||||
assert TwitterCard.parse(html, %{}) ==
|
||||
{:ok,
|
||||
|
@ -26,7 +29,9 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
|
|||
end
|
||||
|
||||
test "parses twitter card with only property attributes" do
|
||||
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html")
|
||||
html =
|
||||
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html")
|
||||
|> Floki.parse_document!()
|
||||
|
||||
assert TwitterCard.parse(html, %{}) ==
|
||||
{:ok,
|
||||
|
@ -45,7 +50,9 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
|
|||
end
|
||||
|
||||
test "parses twitter card with name & property attributes" do
|
||||
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html")
|
||||
html =
|
||||
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html")
|
||||
|> Floki.parse_document!()
|
||||
|
||||
assert TwitterCard.parse(html, %{}) ==
|
||||
{:ok,
|
||||
|
@ -73,7 +80,8 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
|
|||
"YTQ5MF9EQVIgZXhodW1hdGlvbiBvZiBNYXJnYXJldCBDb3JiaW4gZ3JhdmUgMTkyNi5qcGciXSxbInAiLCJjb252ZXJ0IiwiIl0sWyJwIiwiY29udmVydCIsIi1xdWFsaXR5IDgxIC1hdXRvLW9" <>
|
||||
"yaWVudCJdLFsicCIsInRodW1iIiwiNjAweD4iXV0/DAR%20exhumation%20of%20Margaret%20Corbin%20grave%201926.jpg"
|
||||
|
||||
html = File.read!("test/fixtures/margaret-corbin-grave-west-point.html")
|
||||
html =
|
||||
File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!()
|
||||
|
||||
assert TwitterCard.parse(html, %{}) ==
|
||||
{:ok,
|
||||
|
@ -87,7 +95,9 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
|
|||
end
|
||||
|
||||
test "takes first founded title in html head if there is html markup error" do
|
||||
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html")
|
||||
html =
|
||||
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html")
|
||||
|> Floki.parse_document!()
|
||||
|
||||
assert TwitterCard.parse(html, %{}) ==
|
||||
{:ok,
|
||||
|
|
Loading…
Reference in a new issue