Citation-based Plagiarism Detection (CbPD) [1] is a novel approach for identifying academic plagiarism as well as documents that an author has read but not cited. The latter case does not necessarily indicate plagiarism, but it points to high document relatedness, which is of potential interest to a reader.
CbPD is designed for use in academic documents and is able to overcome deficits of existing text-based plagiarism detection methods. Existing text-based methods commonly fail to detect translated and strongly obfuscated plagiarism cases, since they solely analyze the words of a document for similarity.
In contrast, CbPD focuses on the semantic information contained in the citations within academic documents. CbPD aims to identify similar patterns in the citation sequences of academic works for similarity computation [2].
Our evaluations have shown that such patterns often remain detectable even if the text has been translated or strongly paraphrased. Thus, in many cases CbPD enables the detection of plagiarism that could not otherwise have been detected automatically using traditional approaches.
Our analysis of the plagiarized doctoral thesis of Karl-Theodor zu Guttenberg [3] also confirmed that citation patterns in plagiarized works often show suspicious similarities to the citation patterns found in the original source documents. CbPD is not a substitute for, but rather an extension of, currently used text-based plagiarism detection approaches. While the text analysis methods in use today can detect even short copies of text and slightly modified text segments, the CbPD approach requires longer passages with at least three citations in order to yield reliable results.
Related Publications
[Bibtex]
@INPROCEEDINGS{Gipp10c,
  author    = {Gipp, Bela and Beel, Joeran},
  title     = {{C}itation {B}ased {P}lagiarism {D}etection - {A} {N}ew {A}pproach to {I}dentify {P}lagiarized {W}ork {L}anguage {I}ndependently},
  booktitle = {Proceedings of the 21st ACM Conference on Hypertext and Hypermedia (HT'10)},
  year      = {2010},
  pages     = {273--274},
  address   = {New York, NY, USA},
  month     = jun,
  publisher = {ACM},
  note      = {Available at http://sciplore.org/pub/},
  doi       = {10.1145/1810617.1810671},
  isbn      = {978-1-4503-0041-4},
  location  = {Toronto, Ontario, Canada}
}
[Bibtex]
@INPROCEEDINGS{Gipp11c,
  author    = {Gipp, Bela and Meuschke, Norman},
  title     = {{C}itation {P}attern {M}atching {A}lgorithms for {C}itation-based {P}lagiarism {D}etection: {G}reedy {C}itation {T}iling, {C}itation {C}hunking and {L}ongest {C}ommon {C}itation {S}equence},
  booktitle = {Proceedings of the 11th ACM Symposium on Document Engineering},
  year      = {2011},
  series    = {DocEng '11},
  pages     = {249--258},
  address   = {New York, NY, USA},
  month     = sep,
  publisher = {ACM},
  note      = {Available at http://sciplore.org/pub/},
  doi       = {10.1145/2034691.2034741},
  isbn      = {978-1-4503-0863-2},
  keywords  = {citation order analysis, citation pattern analysis, citation-based,
               plagiarism detection systems},
  location  = {Mountain View, California, USA},
  numpages  = {10},
  url       = {http://doi.acm.org/10.1145/2034691.2034741}
}
[Bibtex]
@INPROCEEDINGS{Gipp11,
  author    = {Gipp, Bela and Meuschke, Norman and Beel, Joeran},
  title     = {{C}omparative {E}valuation of {T}ext- and {C}itation-based {P}lagiarism {D}etection {A}pproaches using {GuttenPlag}},
  booktitle = {Proceedings of the 11th ACM/IEEE-CS Joint Conference on Digital Libraries (JCDL'11)},
  year      = {2011},
  pages     = {255--258},
  address   = {New York, NY, USA},
  month     = jun,
  publisher = {ACM},
  note      = {Available at http://sciplore.org/pub/},
  abstract  = {Various approaches for plagiarism detection exist. All are based on
               more or less sophisticated text analysis methods such as string matching,
               fingerprinting or style comparison. In this paper a new approach
               called Citation-based Plagiarism Detection is evaluated using a doctoral
               thesis, in which a volunteer crowd-sourcing project called GuttenPlag
               identified substantial amounts of plagiarism through careful manual
               inspection. This new approach is able to identify similar and plagiarized
               documents based on the citations used in the text. It is shown that
               citation-based plagiarism detection performs significantly better
               than text-based procedures in identifying strong paraphrasing, translation
               and some idea plagiarism. Detection rates can be improved by combining
               citation-based with text-based plagiarism detection.},
  doi       = {10.1145/1998076.1998124},
  location  = {Ottawa, Ontario, Canada}
}
