Citation-based Plagiarism Detection (CbPD) [1] is a novel approach for identifying academic plagiarism as well as documents that an author has read but not cited. The latter need not represent plagiarism, but they are related and thus potentially interesting to a reader.
CbPD is designed for use with academic documents and is intended to overcome the deficits of existing text-based plagiarism detection methods. These methods commonly fail to detect translated or strongly obfuscated plagiarism, since they analyze solely the words of a document for similarity.
In contrast, CbPD focuses on the semantic information contained in the citations used in academic documents. It aims to identify similar patterns in the citation sequences of academic works for similarity computation [2].
Our investigations showed that such patterns often remain detectable even if text has been translated or strongly paraphrased. Thus, in many cases CbPD allows the detection of plagiarized work that could not be detected automatically with traditional approaches.
This was also demonstrated by analyzing the plagiarized doctoral thesis of Karl-Theodor zu Guttenberg [3]. CbPD should be considered an extension of, rather than a substitute for, text-based approaches. Whereas the known text analysis methods can detect even short copied or modestly modified text segments, the proposed approach requires longer passages with at least three citations in order to yield reliable results.
Related Publications
[Bibtex]
% HT'10 conference paper by Gipp and Beel on citation-based plagiarism detection.
% address is the publisher's city; the conference city is kept in venue.
@inproceedings{Gipp10c,
  author    = {Gipp, Bela and Beel, Joeran},
  title     = {Citation Based Plagiarism Detection -- {A} New Approach to Identify Plagiarized Work Language Independently},
  booktitle = {Proceedings of the 21st {ACM} Conference on Hypertext and Hypermedia ({HT}'10)},
  year      = {2010},
  pages     = {273--274},
  address   = {New York, NY, USA},
  month     = jun,
  publisher = {ACM},
  note      = {Available at http://gipp.com/pub},
  doi       = {10.1145/1810617.1810671},
  isbn      = {978-1-4503-0041-4},
  venue     = {Toronto, Ontario, Canada}
}
[Bibtex]
% DocEng '11 conference paper by Gipp and Meuschke on citation pattern matching
% algorithms for citation-based plagiarism detection.
% address is the publisher's city; the conference city is kept in venue.
% The note field uses \url, so the citing document must load the url or hyperref package.
@inproceedings{Gipp11c,
  author    = {Gipp, Bela and Meuschke, Norman},
  title     = {Citation Pattern Matching Algorithms for Citation-based Plagiarism Detection: {Greedy Citation Tiling}, {Citation Chunking} and {Longest Common Citation Sequence}},
  booktitle = {Proceedings of the 11th {ACM} Symposium on Document Engineering},
  year      = {2011},
  series    = {DocEng '11},
  pages     = {249--258},
  address   = {New York, NY, USA},
  month     = sep,
  publisher = {ACM},
  note      = {Available at: \url{http://sciplore.org/pub}.},
  doi       = {10.1145/2034691.2034741},
  isbn      = {978-1-4503-0863-2},
  keywords  = {citation order analysis, citation pattern analysis, citation-based, plagiarism detection systems},
  venue     = {Mountain View, California, USA},
  numpages  = {10},
  url       = {https://doi.org/10.1145/2034691.2034741}
}
[Bibtex]
% JCDL'11 conference paper by Gipp, Meuschke and Beel comparing text- and
% citation-based plagiarism detection approaches using GuttenPlag.
% publisher holds the name only; address is the publisher's city and the
% conference city is kept in venue.
@inproceedings{Gipp11,
  author    = {Gipp, Bela and Meuschke, Norman and Beel, Joeran},
  title     = {Comparative Evaluation of Text- and Citation-based Plagiarism Detection Approaches using {GuttenPlag}},
  booktitle = {Proceedings of the 11th {ACM/IEEE-CS} Joint Conference on Digital Libraries ({JCDL}'11)},
  year      = {2011},
  pages     = {255--258},
  address   = {New York, NY, USA},
  month     = jun,
  publisher = {ACM},
  note      = {Available at http://gipp.com/pub},
  abstract  = {Various approaches for plagiarism detection exist. All are based on
    more or less sophisticated text analysis methods such as string matching,
    fingerprinting or style comparison. In this paper a new approach
    called Citation-based Plagiarism Detection is evaluated using a doctoral
    thesis, in which a volunteer crowd-sourcing project called GuttenPlag
    identified substantial amounts of plagiarism through careful manual
    inspection. This new approach is able to identify similar and plagiarized
    documents based on the citations used in the text. It is shown that
    citation-based plagiarism detection performs significantly better
    than text-based procedures in identifying strong paraphrasing, translation
    and some idea plagiarism. Detection rates can be improved by combining
    citation-based with text-based plagiarism detection.},
  doi       = {10.1145/1998076.1998124},
  venue     = {Ottawa, Canada}
}
