From 3f0440ac3c38b88fe449da9b8281d1dbadfa36d1 Mon Sep 17 00:00:00 2001
From: Sir_Boops <admin@boops.me>
Date: Sun, 15 Apr 2018 17:37:51 -0600
Subject: [PATCH] Dedupe uploads

---
 lib/mix/tasks/sample_config.eex              |   3 +-
 lib/pleroma/upload.ex                        | 116 ++++++++++++++-----
 lib/pleroma/web/activity_pub/activity_pub.ex |   2 +-
 test/upload_test.exs                         |  45 +++----
 4 files changed, 117 insertions(+), 49 deletions(-)

diff --git a/lib/mix/tasks/sample_config.eex b/lib/mix/tasks/sample_config.eex
index e37c864c0..d57591d53 100644
--- a/lib/mix/tasks/sample_config.eex
+++ b/lib/mix/tasks/sample_config.eex
@@ -8,7 +8,8 @@ config :pleroma, :instance,
   name: "<%= name %>",
   email: "<%= email %>",
   limit: 5000,
-  registrations_open: true
+  registrations_open: true,
+  dedupe_media: true
 
 config :pleroma, :media_proxy,
   enabled: false,
diff --git a/lib/pleroma/upload.ex b/lib/pleroma/upload.ex
index e5df94009..ab4bd16f0 100644
--- a/lib/pleroma/upload.ex
+++ b/lib/pleroma/upload.ex
@@ -2,20 +2,21 @@ defmodule Pleroma.Upload do
   alias Ecto.UUID
   alias Pleroma.Web
 
-  def store(%Plug.Upload{} = file) do
-    uuid = UUID.generate()
-    upload_folder = Path.join(upload_path(), uuid)
-    File.mkdir_p!(upload_folder)
-    result_file = Path.join(upload_folder, file.filename)
-    File.cp!(file.path, result_file)
+  def store(%Plug.Upload{} = file, should_dedupe) do  # stores a multipart upload
+    content_type = get_content_type(file.path)  # file.content_type is no longer consulted
+    uuid = get_uuid(file, should_dedupe)  # content hash when deduping, else a random UUID
+    name = get_name(file, uuid, content_type, should_dedupe)
+    upload_folder = get_upload_path(uuid, should_dedupe)
+    url_path = get_url(name, uuid, should_dedupe)
 
-    # fix content type on some image uploads
-    content_type =
-      if file.content_type in [nil, "application/octet-stream"] do
-        get_content_type(file.path)
-      else
-        file.content_type
-      end
+    File.mkdir_p!(upload_folder)
+    result_file = Path.join(upload_folder, name)
+
+    if File.exists?(result_file) do  # a file with this name is already stored
+      File.rm!(file.path)  # reuse the stored file and delete the incoming one
+    else
+      File.cp!(file.path, result_file)  # first time this name is seen: keep a copy
+    end
 
     %{
       "type" => "Image",
@@ -23,26 +24,48 @@ defmodule Pleroma.Upload do
         %{
           "type" => "Link",
           "mediaType" => content_type,
-          "href" => url_for(Path.join(uuid, :cow_uri.urlencode(file.filename)))
+          "href" => url_path
         }
       ],
-      "name" => file.filename,
-      "uuid" => uuid
+      "name" => name
     }
   end
 
-  def store(%{"img" => "data:image/" <> image_data}) do
+  def store(%{"img" => "data:image/" <> image_data}, should_dedupe) do
     parsed = Regex.named_captures(~r/(?<filetype>jpeg|png|gif);base64,(?<data>.*)/, image_data)
-    data = Base.decode64!(parsed["data"])
+    data = Base.decode64!(parsed["data"], ignore: :whitespace)
     uuid = UUID.generate()
-    upload_folder = Path.join(upload_path(), uuid)
+    uuidpath = Path.join(upload_path(), uuid)  # temp file the decoded data is staged in
+    uuid = UUID.generate()  # fresh id: the first one now names the temp file, not a directory
+
+    File.mkdir_p!(upload_path())
+
+    File.write!(uuidpath, data)
+
+    content_type = get_content_type(uuidpath)
+
+    name =
+      create_name(
+        String.downcase(Base.encode16(:crypto.hash(:sha256, data))),
+        parsed["filetype"],
+        content_type
+      )
+
+    upload_folder = get_upload_path(uuid, should_dedupe)
+    url_path = get_url(name, uuid, should_dedupe)
+
     File.mkdir_p!(upload_folder)
-    filename = Base.encode16(:crypto.hash(:sha256, data)) <> ".#{parsed["filetype"]}"
-    result_file = Path.join(upload_folder, filename)
+    result_file = Path.join(upload_folder, name)
 
-    File.write!(result_file, data)
-
-    content_type = "image/#{parsed["filetype"]}"
+    if should_dedupe do
+      if !File.exists?(result_file) do
+        :ok = File.rename(uuidpath, result_file)  # match on :ok so a failed move raises
+      else
+        File.rm!(uuidpath)  # same hash-based name already stored; discard the staged copy
+      end
+    else
+      :ok = File.rename(uuidpath, result_file)  # never return a URL for a file that was not moved
+    end
 
     %{
       "type" => "Image",
@@ -50,11 +73,10 @@ defmodule Pleroma.Upload do
         %{
           "type" => "Link",
           "mediaType" => content_type,
-          "href" => url_for(Path.join(uuid, :cow_uri.urlencode(filename)))
+          "href" => url_path
         }
       ],
-      "name" => filename,
-      "uuid" => uuid
+      "name" => name
     }
   end
 
@@ -63,6 +85,46 @@ defmodule Pleroma.Upload do
     Keyword.fetch!(settings, :uploads)
   end
 
+  # Builds the lower-cased stored name "<uuid>.<suffix>": the suffix is `ext` for the
+  # generic "application/octet-stream" type, otherwise the text after the last "/" of `type`.
+  defp create_name(uuid, ext, "application/octet-stream"),
+    do: String.downcase("#{uuid}.#{ext}")
+
+  defp create_name(uuid, _ext, type),
+    do: String.downcase("#{uuid}.#{type |> String.split("/") |> List.last()}")
+
+  # Picks the identifier for an upload: the hex-encoded SHA-256 of the file's
+  # contents when deduping (same bytes => same id), otherwise a random UUID.
+  defp get_uuid(file, should_dedupe) do
+    if should_dedupe,
+      do: Base.encode16(:crypto.hash(:sha256, File.read!(file.path))),
+      else: UUID.generate()
+  end
+
+  # Stored name: the client's filename, or when deduping a create_name/3 name built
+  # from `uuid`, passing the text after the filename's last "." as the extension.
+  defp get_name(file, uuid, type, should_dedupe) do
+    if should_dedupe,
+      do: create_name(uuid, file.filename |> String.split(".") |> List.last(), type),
+      else: file.filename
+  end
+
+  # Directory the file is written to: the shared uploads root when deduping,
+  # otherwise a subdirectory of that root named after `uuid`.
+  defp get_upload_path(uuid, should_dedupe) do
+    root = upload_path()
+
+    if should_dedupe, do: root, else: Path.join(root, uuid)
+  end
+
+  # Public URL of a stored file. The name is always URL-encoded; without deduping
+  # it is additionally prefixed with the `uuid` path segment.
+  defp get_url(name, uuid, should_dedupe) do
+    encoded = :cow_uri.urlencode(name)
+
+    url_for(if should_dedupe, do: encoded, else: Path.join(uuid, encoded))
+  end
+
   defp url_for(file) do
     "#{Web.base_url()}/media/#{file}"
   end
diff --git a/lib/pleroma/web/activity_pub/activity_pub.ex b/lib/pleroma/web/activity_pub/activity_pub.ex
index 4e0be5ba2..3a03f5fe4 100644
--- a/lib/pleroma/web/activity_pub/activity_pub.ex
+++ b/lib/pleroma/web/activity_pub/activity_pub.ex
@@ -492,7 +492,7 @@ defmodule Pleroma.Web.ActivityPub.ActivityPub do
   end
 
   def upload(file) do
-    data = Upload.store(file)
+    data = Upload.store(file, Application.get_env(:pleroma, :instance)[:dedupe_media])  # unset key => nil => no dedupe
     Repo.insert(%Object{data: data})
   end
 
diff --git a/test/upload_test.exs b/test/upload_test.exs
index d68b3e7ba..645f10293 100644
--- a/test/upload_test.exs
+++ b/test/upload_test.exs
@@ -3,40 +3,45 @@ defmodule Pleroma.UploadTest do
   use Pleroma.DataCase
 
   describe "Storing a file" do
-    test "copies the file to the configured folder" do
+    test "copies the file to the configured folder with deduping" do
+      File.cp!("test/fixtures/image.jpg", "test/fixtures/image_tmp.jpg")  # store/2 may delete file.path; use a copy
+
       file = %Plug.Upload{
         content_type: "image/jpg",
-        path: Path.absname("test/fixtures/image.jpg"),
+        path: Path.absname("test/fixtures/image_tmp.jpg"),
         filename: "an [image.jpg"
       }
 
-      data = Upload.store(file)
-      assert data["name"] == "an [image.jpg"
+      data = Upload.store(file, true)  # dedupe on: name comes from the content hash, not the filename
 
-      assert List.first(data["url"])["href"] ==
-               "http://localhost:4001/media/#{data["uuid"]}/an%20%5Bimage.jpg"
+      assert data["name"] ==
+               "e7a6d0cf595bff76f14c9a98b6c199539559e8b844e02e51e5efcfd1f614a2df.jpeg"
     end
 
-    test "fixes an incorrect content type" do
+    test "copies the file to the configured folder without deduping" do
+      File.cp!("test/fixtures/image.jpg", "test/fixtures/image_tmp.jpg")  # keep the tracked fixture untouched
+
+      file = %Plug.Upload{
+        content_type: "image/jpg",
+        path: Path.absname("test/fixtures/image_tmp.jpg"),
+        filename: "an [image.jpg"
+      }
+
+      data = Upload.store(file, false)  # dedupe off: the original filename is kept
+      assert data["name"] == "an [image.jpg"
+    end
+
+    test "fixes incorrect content type" do
+      File.cp!("test/fixtures/image.jpg", "test/fixtures/image_tmp.jpg")  # store/2 may delete file.path; use a copy
+
       file = %Plug.Upload{
         content_type: "application/octet-stream",
-        path: Path.absname("test/fixtures/image.jpg"),
+        path: Path.absname("test/fixtures/image_tmp.jpg"),
         filename: "an [image.jpg"
       }
 
-      data = Upload.store(file)
+      data = Upload.store(file, true)  # the declared octet-stream type is ignored by store/2
       assert hd(data["url"])["mediaType"] == "image/jpeg"
     end
-
-    test "does not modify a valid content type" do
-      file = %Plug.Upload{
-        content_type: "image/png",
-        path: Path.absname("test/fixtures/image.jpg"),
-        filename: "an [image.jpg"
-      }
-
-      data = Upload.store(file)
-      assert hd(data["url"])["mediaType"] == "image/png"
-    end
   end
 end